diff --git a/.gitignore b/.gitignore index 801790d0a472080af607e9fbcde0284902a4ead8..664c45b7202f6bf93712062ffa1d003b575afffd 100644 --- a/.gitignore +++ b/.gitignore @@ -52,12 +52,12 @@ tools/__pycache__ # This file is automatically generated. # TODO(zhiqiang) Move this file to build directory. -paddle/infrt/dialect/pd_ops.td +paddle/infrt/dialect/pd/ir/pd_ops.td paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td tools/infrt/kernels.json tools/infrt/kernel_signature.json -paddle/infrt/dialect/pd_ops_info.h +paddle/infrt/dialect/pd/common/pd_ops_info.h .lit_test_times.txt paddle/infrt/tests/dialect/Output paddle/infrt/tests/lit.cfg.py diff --git a/cmake/external/cinn.cmake b/cmake/external/cinn.cmake index 41b90345c8c5f38afa413bd2411af975c9d0b103..d3f330ba9dd0fa58b26e9ea05a7154184747daff 100644 --- a/cmake/external/cinn.cmake +++ b/cmake/external/cinn.cmake @@ -26,7 +26,7 @@ add_definitions(-w) ###################################### include(ExternalProject) set(CINN_PREFIX_DIR ${THIRD_PARTY_PATH}/CINN) -set(CINN_GIT_TAG release/v0.1) +set(CINN_GIT_TAG 56879b637e2c4db19091eedad03d7cc674e092a2) set(CINN_OPTIONAL_ARGS -DPY_VERSION=${PY_VERSION} -DWITH_CUDA=${WITH_GPU} -DWITH_CUDNN=${WITH_GPU} diff --git a/cmake/external/llvm.cmake b/cmake/external/llvm.cmake index 9f6fd32ad986c4a5911b1d00dfb548fa3320c34d..5c48afa2806aab10bb08317679c0a00c8f177f7b 100644 --- a/cmake/external/llvm.cmake +++ b/cmake/external/llvm.cmake @@ -99,7 +99,8 @@ endfunction() function(mlir_add_rewriter td_base) set(LLVM_TARGET_DEFINITIONS ${td_base}.td) - mlir_tablegen(${td_base}.cpp.inc -gen-rewriters "-I${CMAKE_SOURCE_DIR}/infrt/dialect/pass") + set(LLVM_TARGET_DEPENDS ${LLVM_TARGET_DEPENDS} ${CMAKE_SOURCE_DIR}/paddle/infrt/dialect/infrt/ir/infrt_base.td) + mlir_tablegen(${td_base}.cpp.inc -gen-rewriters) add_public_tablegen_target(MLIR${td_base}IncGen) add_dependencies(mlir-headers MLIR${td_base}IncGen) endfunction() diff --git a/cmake/external/paddle2onnx.cmake b/cmake/external/paddle2onnx.cmake index 661c3675c84b27a7ed8210fec0cfeaa2c858487c..ba6f0396008fc25dd21d462a2d19285a6cbe9080 100644 --- a/cmake/external/paddle2onnx.cmake +++ b/cmake/external/paddle2onnx.cmake @@ -61,6 +61,7 @@ set(PADDLE2ONNX_OPTIONAL_ARGS -DONNX_CUSTOM_PROTOC_PATH=${PROTOC_BIN_PATH} -DWITH_STATIC=OFF -DCMAKE_INSTALL_PREFIX=${PADDLE2ONNX_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR} -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} ${EXTERNAL_OPTIONAL_ARGS} diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index f88c993d85e2fa6eda27b7e845ee27f08347fa83..49ba9479d49e93143665b8314d04ee8e0efcbf51 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -1,8 +1,9 @@ cc_library(processgroup SRCS ProcessGroup.cc DEPS phi phi_api eager_api) +cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup phi phi_api string_helper) + if (WITH_DISTRIBUTE) cc_library(processgroup_gloo SRCS ProcessGroupGloo.cc DEPS phi phi_api eager_api gloo_wrapper) endif() -cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup) if(WITH_NCCL) cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api) diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc index 
5dc43af117825bf95407255e93e1e4600e8ddd9a..cb82677a281e990d9837f081b0d4d2f3b0a34a26 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc @@ -171,10 +171,10 @@ ProcessGroupGloo::GlooTask::GlooTask(int rank, "Only CPU place is supported for ProcessGroupGloo.")); } -ProcessGroupGloo::ProcessGroupGloo(const std::shared_ptr& store, - int rank, int world_size, - const std::shared_ptr options) - : ProcessGroup(rank, world_size), _tag(0), _store(store) { +ProcessGroupGloo::ProcessGroupGloo( + const std::shared_ptr& store, int rank, + int world_size, const std::shared_ptr options) + : ProcessGroup(rank, world_size), _tag(0), _store(new GlooStore(store)) { _context = std::make_shared(rank, world_size); auto prefix_store = ::gloo::rendezvous::PrefixStore(std::to_string(0), *_store); diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.h b/paddle/fluid/distributed/collective/ProcessGroupGloo.h index 24f156571a427128f09cd28e632212f47fa4cd47..71e0a40f8a76181d9f4db13ddd57b31de676910b 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.h +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.h @@ -52,8 +52,7 @@ class ProcessGroupGloo : public ProcessGroup { class GlooStore : public ::gloo::rendezvous::Store { public: - explicit GlooStore( - const std::shared_ptr& store) + explicit GlooStore(const std::shared_ptr& store) : _store(store) {} ~GlooStore() = default; @@ -87,7 +86,7 @@ class ProcessGroupGloo : public ProcessGroup { } protected: - std::shared_ptr _store; + std::shared_ptr _store; }; class GlooOptions { @@ -100,9 +99,9 @@ class ProcessGroupGloo : public ProcessGroup { std::shared_ptr<::gloo::transport::Device> device; }; - explicit ProcessGroupGloo(const std::shared_ptr& store, int rank, - int world_size, - std::shared_ptr options); + explicit ProcessGroupGloo( + const std::shared_ptr& store, int rank, + int world_size, std::shared_ptr options); ~ProcessGroupGloo() = default; @@ -145,7 +144,7 @@ class ProcessGroupGloo : public ProcessGroup { protected: uint32_t _tag; std::shared_ptr _context; - std::shared_ptr _store; + std::shared_ptr<::gloo::rendezvous::Store> _store; }; } // namespace distributed diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc index 84f5ca48d25c84b3ba29dbff43952fbf08b22cb9..2deeb7ca03003d0b6c8fa0948afa0a3394639f8b 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc @@ -139,11 +139,9 @@ bool ProcessGroupHCCL::HCCLTask::IsCompleted() { // TODO(sandyhouse): Add timeout for wait, now timeout unused bool ProcessGroupHCCL::HCCLTask::Wait(std::chrono::milliseconds timeout) { SynchronizeStreams(); - if (FLAGS_hccl_blocking_wait) { - // NOTE(sandyhouse): It will block host for sync - while (!IsCompleted()) { - std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTImeout)); - } + // NOTE(sandyhouse): It will block host for sync + while (!IsCompleted()) { + std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTImeout)); } return true; } diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.h b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h index f2376b4eed7600f67d6e4564b44920cbe3936f76..83d509be2cdd7b79faf4e2a2f510c34361b94157 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h @@ -84,29 +84,6 @@ class 
ProcessGroupHCCL : public ProcessGroup { std::vector& tensors, const BroadcastOptions& = BroadcastOptions()) override; - std::shared_ptr Barrier( - const BarrierOptions& = BarrierOptions()) override; - - std::shared_ptr Send(std::vector& tensors, - int dst_rank) override; - - std::shared_ptr Recv(std::vector& tensors, - int src_rank) override; - - std::shared_ptr AllGather( - std::vector& in_tensors, - std::vector& out_tensors) override; - - std::shared_ptr AllToAll( - std::vector& in, std::vector& out) override; - - std::shared_ptr Reduce( - std::vector& tensors, const ReduceOptions& opts) override; - - std::shared_ptr Scatter(std::vector& in_tensors, - std::vector& out_tensors, - const ScatterOptions&) override; - protected: virtual std::shared_ptr CreateTask( std::vector places, int rank, CommType opType, diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 67715f410d443c38a1c5d92c560a35a909c5ec1c..7f21bcee87ab705097d3c2beaf799e5f2d93b833 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -88,8 +88,8 @@ void SyncDefaultStream( for (size_t i = 0; i < places.size(); ++i) { auto* default_ctx = static_cast( platform::DeviceContextPool::Instance().Get(places[i])); - ncclEvents[i].Record(*dev_ctx[i]); - ncclEvents[i].Block(*default_ctx); + ncclEvents[i].Record(*default_ctx); + ncclEvents[i].Block(*dev_ctx[i]); } } diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index 59f3ea3b0a7d85651e7780b4b11875f19b70931e..be4c5423943f5076201b75e307094c75d3d9c103 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -13,11 +13,24 @@ // limitations under the License. #include "paddle/fluid/distributed/collective/reducer.h" -#include "paddle/phi/common/data_type.h" namespace paddle { namespace distributed { +static Backend TransToBackend(platform::Place place) { + static const std::map type_backend = { + {phi::AllocationType::GPU, Backend::GPU}, + {phi::AllocationType::CPU, Backend::CPU}, + }; + + phi::AllocationType type = place.GetType(); + auto it = type_backend.find(type); + PADDLE_ENFORCE_EQ(it != type_backend.end(), true, + platform::errors::InvalidArgument( + "Place type (%s) is not supported. 
", place)); + return it->second; +} + std::vector> Eager_AssignGroupBySize( const std::vector tensors, const std::vector &is_sparse_gradient, @@ -127,5 +140,663 @@ std::vector> Eager_AssignGroupBySize( return res; } +template +static void ConcatTensorsForAllReduce( + const DeviceContext &context, + const std::vector &dense_tensors_, + Tensor *p_dense_contents) { + operators::math::ConcatFunctor concat_functor_; + concat_functor_( + context, dense_tensors_, 0, + std::dynamic_pointer_cast(p_dense_contents->impl()) + .get()); +} + +template +static void SplitTensorsForAllReduce( + const DeviceContext &context, Tensor *p_dense_contents, + std::vector *p_dense_tensors) { + auto *in = + std::dynamic_pointer_cast(p_dense_contents->impl()) + .get(); + std::vector outs; + std::vector shape_refer; + + outs.reserve(p_dense_tensors->size()); + shape_refer.reserve(p_dense_tensors->size()); + + for (auto &tensor : *p_dense_tensors) { + outs.emplace_back(&tensor); + shape_refer.emplace_back(&tensor); + } + + operators::math::SplitFunctor split_functor_; + split_functor_(context, *in, shape_refer, 0, &outs); +} + +// context is used to select the stream for concat +template +static void ConcatTensorsWithType( + const DeviceContext &context, + const std::vector &dense_tensors_, + Tensor *p_dense_contents, phi::DataType type) { + switch (type) { + case phi::DataType::FLOAT16: + ConcatTensorsForAllReduce( + context, dense_tensors_, p_dense_contents); + break; + case phi::DataType::FLOAT32: + ConcatTensorsForAllReduce(context, dense_tensors_, + p_dense_contents); + break; + case phi::DataType::FLOAT64: + ConcatTensorsForAllReduce(context, dense_tensors_, + p_dense_contents); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it concats tensors for " + "allreduce.", + type)); + } +} + +// context is used to select the stream for split +template +static void SplitTensorsWithType(const DeviceContext &context, + Tensor *p_dense_contents, + std::vector *p_dense_tensors, + phi::DataType type) { + switch (type) { + case phi::DataType::FLOAT16: + SplitTensorsForAllReduce( + context, p_dense_contents, p_dense_tensors); + break; + case phi::DataType::FLOAT32: + SplitTensorsForAllReduce(context, p_dense_contents, + p_dense_tensors); + break; + case phi::DataType::FLOAT64: + SplitTensorsForAllReduce(context, p_dense_contents, + p_dense_tensors); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it splits tensors for " + "allreduce.", + type)); + } +} + +void EagerGroup::ConcatTensors(const platform::Place &place) { + if (platform::is_gpu_place(place)) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto *default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + ConcatTensorsWithType(*default_ctx, dense_tensors_, &dense_contents_, + dtype_); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't concat grad tensors since it's not compiled with NCCL," + "Please recompile or reinstall Paddle with NCCL support.")); +#endif + } else if (platform::is_cpu_place(place)) { + auto *default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + ConcatTensorsWithType(*default_ctx, dense_tensors_, &dense_contents_, + dtype_); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Concat grad tensor not supported on place (%s)", place)); + } +} + +void EagerGroup::SplitTensors(const platform::Place &place) { + if 
(platform::is_gpu_place(place)) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto *default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + SplitTensorsWithType(*default_ctx, &dense_contents_, &dense_tensors_, + dtype_); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't split grad tensor since it's not compiled with NCCL," + "Please recompile or reinstall Paddle with NCCL support.")); +#endif + } else if (platform::is_cpu_place(place)) { + auto *default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + SplitTensorsWithType(*default_ctx, &dense_contents_, &dense_tensors_, + dtype_); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Split grad tensor not supported on place (%s)", place)); + } +} + +EagerReducer::EagerReducer( + const std::vector tensors, + const std::vector> &group_indices, + const std::vector &is_sparse_gradient, + std::shared_ptr process_group, + const std::vector &group_size_limits, bool find_unused_parameters) + : tensors_(tensors), + group_indices_(group_indices), + is_sparse_gradient_(is_sparse_gradient), + process_group_(process_group), + group_size_limits_(group_size_limits), + find_unused_vars_each_step_(find_unused_parameters) { + VLOG(3) << "Start construct the Reducer ..."; + + nranks_ = process_group_->GetSize(); + + // initialize groups + InitializeGroups(group_indices); + + for (size_t global_var_index = 0; global_var_index < tensors_.size(); + ++global_var_index) { + auto tensor = tensors_[global_var_index]; + auto reduce_hook = [=](void) -> void { + this->AddDistHook(global_var_index); + }; + + const auto &grad_node = GetGradNodeFromTensor(&tensor); + + PADDLE_ENFORCE( + grad_node.get() != nullptr, + paddle::platform::errors::Fatal("Detected NULL grad_node," + "Leaf tensor should have had grad_node " + "with type: GradNodeAccumulation")); + const auto &accumulation_grad_node = + std::dynamic_pointer_cast(grad_node); + accumulation_grad_node->RegisterReduceHook( + std::make_shared(reduce_hook)); + + gradnode_index_map_[grad_node.get()] = global_var_index; + } + + vars_marked_ready_.resize(tensors_.size(), false); + local_used_vars_.resize(tensors_.size(), 0); + + if (find_unused_vars_each_step_) { + global_used_vars_ = paddle::experimental::empty( + ScalarArray({static_cast(tensors_.size())}), DataType::INT32, + TransToBackend(inner_place_)); + } +} + +std::shared_ptr EagerReducer::GetGradNodeFromTensor( + Tensor *tensor) { + auto *autograd_meta = tensor->get_autograd_meta(); + const auto &grad_node = + static_cast(autograd_meta)->GetMutableGradNode(); + return grad_node; +} + +void EagerReducer::InitializeGroups( + const std::vector> &group_indices) { + VLOG(3) << "Start initialize groups .."; + + // clear the group + groups_.clear(); + groups_.reserve(group_indices.size()); + + variable_locators_.clear(); + variable_locators_.resize(tensors_.size()); + + auto group_nums = group_indices.size(); + for (size_t group_index = 0; group_index < group_nums; ++group_index) { + const auto &tensor_indices_ = group_indices[group_index]; + PADDLE_ENFORCE_GT( + tensor_indices_.size(), 0, + platform::errors::PreconditionNotMet( + "The number of group[%d]'s elements is 0.", group_index)); + + EagerGroup group; + + // It's just for check the sparse or dense + auto first_var = tensors_[tensor_indices_.front()]; + if (tensor_indices_.size() == 1 && + is_sparse_gradient_[tensor_indices_.front()]) { + // process the sparse gradient. 
one sparse, one group + group.dtype_ = first_var.dtype(); + } else { + // process the dense gradient. + InitializeDenseGroups(tensor_indices_, &group); + // experimental::Backend backend = TransToBackend(inner_place_); + group.dense_contents_ = paddle::experimental::empty( + ScalarArray({group.all_length_}), group.dtype_, + TransToBackend(inner_place_)); + } + + // map tensors to this group by VariableLocator + size_t inside_group_index = 0; + for (const auto var_index : tensor_indices_) { + TensorLocator tensor_locator; + tensor_locator.group_index = group_index; + tensor_locator.inside_group_index = inside_group_index++; + variable_locators_[var_index] = tensor_locator; + } + group.tensor_indices_ = std::move(tensor_indices_); + groups_.emplace_back(std::move(group)); + + VLOG(3) << "The Group[" << group_index << "]:" << groups_.back(); + } +} + +void EagerReducer::InitializeDenseGroups( + const std::vector &tensor_indices_, EagerGroup *p_group) { + VLOG(3) << "InitializeDenseGroups."; + int64_t all_length = 0; + for (size_t index = 0; index < tensor_indices_.size(); ++index) { + auto tensor_index = tensor_indices_[index]; + auto &tensor = tensors_[tensor_index]; + auto &tensor_name = tensor.name(); + + PADDLE_ENFORCE_EQ(tensor.is_initialized(), true, + platform::errors::PreconditionNotMet( + "Tensor %s is not initialized.", tensor_name)); + const auto size = tensor.numel(); + PADDLE_ENFORCE_GT( + size, 0, platform::errors::PreconditionNotMet( + "The number of tensor %s's elements is 0.", tensor_name)); + all_length += size; + + p_group->length_.push_back(size); + + // for concat operator + p_group->origin_shapes_.push_back(ScalarArray(tensor.shape())); + p_group->dense_tensors_.push_back(phi::DenseTensor()); + + const auto &dtype = tensor.dtype(); + const auto &place = tensor.place(); + const auto &inner_place = tensor.impl()->place(); + if (index > 0) { + PADDLE_ENFORCE_EQ(dtype, p_group->dtype_, + platform::errors::PreconditionNotMet( + "Tensor %s has unexpected dtype.", tensor_name)); + PADDLE_ENFORCE_EQ(place, place_, + platform::errors::PreconditionNotMet( + "Tensor %s has different place. 
Expected place is " + "%s, but actual place is %s", + tensor_name, inner_place_, inner_place)); + } else { + p_group->dtype_ = dtype; + place_ = place; + inner_place_ = inner_place; + } + } + p_group->all_length_ = all_length; +} + +void EagerReducer::TraverseBackwardGraph(const std::vector &outputs) { + std::queue queue; + std::set visited; + + for (const auto &output : outputs) { + auto *auto_grad_meta = + static_cast(output.get_autograd_meta()); + if (!auto_grad_meta) continue; + auto shared_grad_node = auto_grad_meta->GetMutableGradNode(); + if (shared_grad_node == nullptr || shared_grad_node.get() == nullptr || + auto_grad_meta->StopGradient()) { + continue; + } + egr::GradNodeBase *grad_node = shared_grad_node.get(); + queue.emplace(grad_node); + } + + while (!queue.empty()) { + egr::GradNodeBase *node = queue.front(); + queue.pop(); + const std::vector> &edges = node->GetEdges(); + for (size_t i = 0; i < edges.size(); i++) { + for (size_t j = 0; j < edges[i].size(); j++) { + const egr::Edge &edge = edges[i][j]; + auto next_node_shared = edge.GetMutableGradNode(); + if (!next_node_shared || !next_node_shared.get()) { + continue; + } + auto *next_node = next_node_shared.get(); + const bool was_inserted = visited.insert(next_node).second; + if (was_inserted) { + queue.emplace(next_node); + } + } + } + } + + for (const auto &it : gradnode_index_map_) { + if (visited.count(it.first) == 0) { + unused_vars_.push_back(it.second); + VLOG(3) << "[Rank " << process_group_->GetRank() << "]: " + << "Tensor " << tensors_[it.second].name() << " at index " + << it.second << " is marked as unused."; + } + } +} + +void EagerReducer::PrepareForBackward(const std::vector &outputs) { + VLOG(3) << "after forward, then reset count for backward."; + grad_need_hooks_ = true; + next_group_ = 0; + std::for_each(groups_.begin(), groups_.end(), [](EagerGroup &group) { + group.pending_ = group.tensor_indices_.size(); + }); + + // reinitialize vars_marked_ready_ for next iteration + vars_marked_ready_.clear(); + vars_marked_ready_.resize(tensors_.size(), false); + + PADDLE_ENFORCE_EQ( + groups_need_finalize_, false, + platform::errors::PreconditionNotMet( + "A serious error has occurred here. Please " + "set find_unused_parameters=True to traverse backward graph " + "in each step to prepare reduce in advance. If you have " + "set, There may be several reasons for this error: " + "1) Please note that all forward outputs derived from the module " + "parameters must participate in the calculation of losses and " + "subsequent gradient calculations. If not, the wrapper will hang, " + "waiting for autograd to generate gradients for these parameters. " + "you can use detach or stop_gradient to make the unused parameters " + "detached from the autograd graph. " + "2) Used multiple forwards and one backward. You may be able to wrap " + "multiple forwards in a model.")); + + // The first var to trigger the unused parameter + has_marked_unused_vars_ = false; + + if (find_unused_vars_once_ || find_unused_vars_each_step_) { + unused_vars_.clear(); + TraverseBackwardGraph(outputs); + // only check once in first step + find_unused_vars_once_ = false; + } + + if (find_unused_vars_each_step_ && unused_vars_.empty()) { + LOG_FIRST_N(WARNING, 1) + << "All parameters are involved in the backward pass. " + "It is recommended to set find_unused_parameters to False " + "to improve performance. However, if unused parameters " + "appear in subsequent iterative training, then an error " + "will occur. 
Please make it clear that in the subsequent " + "training, there will be no parameters that are not used " + "in the backward pass, and then set find_unused_parameters=False."; + } + + if (unused_vars_.size() == tensors_.size()) { + LOG_FIRST_N(WARNING, 1) + << "There is no parameter in the device involved " + "in the backward calculation. If there are " + "parameters on other devices involved in the " + "backward, then a serious error will occur here."; + } +} + +void EagerReducer::AddDistHook(size_t var_index) { + PADDLE_ENFORCE_LT(var_index, variable_locators_.size(), + platform::errors::OutOfRange( + "Out of bounds variable index. It must be less " + "than %d, but it is %d", + variable_locators_.size(), var_index)); + + // gradient synchronization is not required when grad_need_hooks_ is false. + if (!grad_need_hooks_) { + return; + } + + auto &tensor = tensors_[var_index]; + const auto &grad_node = GetGradNodeFromTensor(&tensor); + + VLOG(3) << "Tensor[" << var_index << "] [" << tensors_[var_index].name() + << "@Grad] arrived and triggered disthook"; + + local_used_vars_[var_index] = 1; + + if (!has_marked_unused_vars_) { + has_marked_unused_vars_ = true; + for (const auto unused_index : unused_vars_) { + MarkVarReady(unused_index, false); + } + } + MarkVarReady(var_index, true); +} + +void EagerReducer::MarkVarReady(const size_t var_index, + const bool is_used_var) { + VLOG(3) << "Tensor[" << var_index << "][" << tensors_[var_index].name() + << "] is marked ready."; + // error happened, if the var is ready before. + if (vars_marked_ready_[var_index]) { + auto error_info = string::Sprintf( + "Error happened, when parameter[%d][%s] has been ready before. " + "Please set find_unused_parameters=True to traverse backward graph " + "in each step to prepare reduce in advance. If you have set, " + "there may be several reasons for this error: " + "1) In multiple reentrant backward phase, some parameters are reused. " + "2) Using model parameters outside of forward function. Please " + "make sure that model parameters are not shared in concurrent " + "forward-backward passes.", + var_index, tensors_[var_index].name()); + + PADDLE_ENFORCE_EQ(has_marked_unused_vars_, false, + platform::errors::PreconditionNotMet(error_info)); + + error_info += + "3) Unused parameters retrieval is incorrect. " + "The return value of forward will be used to retrieve" + " the unused parameters of the entire model. These " + "gradients of unused parameters will not be synchronized " + "between multiple cards. However, if the unused " + "parameters participate in the backward calculation " + "again at a later time (e.g. 
after the forward function, " + "the loss calculation uses the unused " + "parameters of the forward and trigger backward), " + "its gradient will be wrong."; + + PADDLE_ENFORCE_EQ(has_marked_unused_vars_, true, + platform::errors::PreconditionNotMet(error_info)); + } else { + vars_marked_ready_[var_index] = true; + } + groups_need_finalize_ = true; + + const auto &var_locator = variable_locators_[var_index]; + const auto group_index = var_locator.group_index; + const auto inside_group_index = var_locator.inside_group_index; + + auto &group = groups_[group_index]; + auto &group_tensor = group.dense_tensors_[inside_group_index]; + const auto length = group.length_[inside_group_index]; + + if (is_used_var) { + auto *autograd_meta = tensors_[var_index].get_autograd_meta(); + auto &grad_tensor = static_cast<egr::AutogradMeta*>(autograd_meta)->Grad(); + group_tensor + .ShareDataWith( + *(std::dynamic_pointer_cast<phi::DenseTensor>(grad_tensor.impl()))) + .Resize({grad_tensor.numel()}); + } else { + // TODO(shenliang03): maybe save the memory by avoiding tensor construction + if (!group_tensor.initialized()) { + group_tensor.Resize({static_cast<int64_t>(length)}); + group_tensor.mutable_data(inner_place_, group.dtype_); + } + if (HasGrad(var_index)) { + VLOG(3) << "Tensor[" << tensors_[var_index].name() << "] has grad"; + auto grad_tensor = egr::EagerUtils::mutable_grad(tensors_[var_index]); + group_tensor + .ShareDataWith(*( + std::dynamic_pointer_cast<phi::DenseTensor>(grad_tensor->impl()))) + .Resize({length}); + } else { + VLOG(3) << "Tensor[" << tensors_[var_index].name() + << "] doesn't have grad"; + auto *dev_ctx = platform::DeviceContextPool::Instance().Get(inner_place_); + group_tensor.Resize({static_cast<int64_t>(length)}); + phi::funcs::set_constant(*dev_ctx, &group_tensor, 0.0); + } + } + + if (--group.pending_ == 0) { + // can start allreduce + MarkGroupReady(group_index); + } + + if (next_group_ == groups_.size()) { + FinalizeBackward(); + } +} + +void EagerReducer::MarkGroupReady(size_t group_index) { + VLOG(3) << "Group[" << group_index << "] is ready"; + + PADDLE_ENFORCE_GE( + group_index, next_group_, + platform::errors::PreconditionNotMet( + "The index of the incoming group must be greater " + "than or equal to the previously synchronized group index, " + "expect it to be greater than or equal to %d, but got %d.", + next_group_, group_index)); + + if (group_index > next_group_) { + VLOG(3) << "It will adjust the order of group in next batch automatically"; + return; + } + + for (; next_group_ < groups_.size() && groups_[next_group_].pending_ == 0; + ++next_group_) { + UNUSED auto &group = groups_[next_group_]; + FusedAllReduceSchedule(&group, next_group_); + } +} + +bool EagerReducer::HasGrad(size_t var_index) { + auto grad = egr::EagerUtils::mutable_grad(tensors_[var_index]); + if (grad && grad->is_initialized()) { + return true; + } else { + return false; + } +} + +void EagerReducer::ProcessUnusedDenseVars() { + // The calculation stream must be used here to + // avoid conflicts with communication. 
+ VLOG(3) << "Local used vars : " + << string::join_strings(local_used_vars_, ','); + + const auto *dev_ctx = + platform::DeviceContextPool::Instance().Get(inner_place_); + auto *global_used_tensor = + std::dynamic_pointer_cast(global_used_vars_.impl()) + .get(); + framework::TensorFromVector(local_used_vars_, *dev_ctx, + global_used_tensor); + + distributed::AllreduceOptions opts; + opts.reduce_op = ReduceOp::SUM; + std::vector reduce_tensors = {global_used_vars_}; + process_group_->AllReduce(reduce_tensors, opts)->Synchronize(); + + framework::TensorToVector(*global_used_tensor, *dev_ctx, + &local_used_vars_); + dev_ctx->Wait(); + + // sync compute stream to get global used var message, + // but maybe affect speed performance + VLOG(3) << "Global used vars : " + << string::join_strings(local_used_vars_, ','); + + for (const auto var_index : unused_vars_) { + const bool global_unused = (local_used_vars_[var_index] == 0); + + // global used but local unused, set grad + VLOG(3) << "[Rank " << process_group_->GetRank() << "]: " + << "Var [" << var_index << "] [" << tensors_[var_index].name() + << "] global_unused: " << global_unused + << " has grad: " << HasGrad(var_index); + + if (!global_unused) { + VLOG(3) << "Set Tensor[" << var_index << "]'s Grad for [Rank " + << process_group_->GetRank() << "]"; + const auto &var_locator = variable_locators_[var_index]; + const auto group_index = var_locator.group_index; + const auto &group = groups_[group_index]; + const auto inside_group_index = var_locator.inside_group_index; + auto &src_tensor = group.dense_tensors_[inside_group_index]; + + Tensor grad_value(std::make_shared(src_tensor)); + + auto dest_var_base = tensors_[var_index]; + auto grad_tensor = egr::EagerUtils::mutable_grad(dest_var_base); + grad_tensor->copy_(grad_value, inner_place_, true); + grad_tensor->reshape(dest_var_base.shape()); + } + } +} + +void EagerReducer::FinalizeBackward() { + groups_need_finalize_ = false; + grad_need_hooks_ = false; + for (auto &group : groups_) { + group.task->Synchronize(); + } + + for (auto &group : groups_) { + group.SplitTensors(inner_place_); + } + + if (find_unused_vars_each_step_) { + ProcessUnusedDenseVars(); + local_used_vars_.clear(); + local_used_vars_.resize(tensors_.size(), 0); + VLOG(3) << "ProcessUnusedDenseVars is finished."; + } + + VLOG(3) << "In the batch, Reducer is finished."; +} + +void EagerReducer::FusedAllReduceSchedule(EagerGroup *group, + const int curr_group_index) { + // The overall timeline: concat > div_nranks > allreduce > split + distributed::AllreduceOptions opts; + opts.reduce_op = ReduceOp::SUM; + + VLOG(3) << "group [" << curr_group_index << "] start fused_allreduce."; + + // concat tensors + group->ConcatTensors(inner_place_); + + // div nranks + paddle::experimental::scale_(group->dense_contents_, 1.0 / nranks_, 0.0, + false); + + // all_reduce + std::vector reduce_tensors = {group->dense_contents_}; + group->task = process_group_->AllReduce(reduce_tensors, opts); + + // split in FinalizeBackward() +} + +std::ostream &operator<<(std::ostream &out, const EagerGroup &group) { + const auto &tensors_ = group.tensor_indices_; + out << "numel: " << group.all_length_ << " ;var number: " << tensors_.size() + << "\n"; + auto begin = tensors_.begin(); + auto end = tensors_.end(); + out << "["; + for (int i = 0; begin != end && i < 100; ++i, ++begin) { + if (i > 0) out << ' '; + out << *begin; + } + if (begin != end) { + out << " ..."; + } + out << "]\n"; + return out; +} + } // namespace distributed } // namespace paddle 
diff --git a/paddle/fluid/distributed/collective/reducer.h b/paddle/fluid/distributed/collective/reducer.h index f8c75385ef8bd6891df8eda6faa93c73091c37f5..d3ffa8498a14b0d0ade02ea459e1c6058550122f 100644 --- a/paddle/fluid/distributed/collective/reducer.h +++ b/paddle/fluid/distributed/collective/reducer.h @@ -17,16 +17,126 @@ #include #include #include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/api/utils/hook_utils.h" #include "paddle/fluid/eager/api/utils/tensor_utils.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/operators/math/concat_and_split.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/phi/api/include/api.h" +#include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/api/lib/ext_compat_utils.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/utils/string/string_helper.h" namespace paddle { namespace distributed { using Tensor = paddle::experimental::Tensor; +using Scalar = paddle::experimental::ScalarBase; +using ScalarArray = + paddle::experimental::ScalarArrayBase; +using Backend = paddle::experimental::Backend; std::vector> Eager_AssignGroupBySize( - const std::vector, const std::vector& is_sparse_gradient, - const std::vector& group_size_limits, - const std::vector& tensor_indices = {}); + const std::vector, const std::vector &is_sparse_gradient, + const std::vector &group_size_limits, + const std::vector &tensor_indices = {}); + +class EagerGroup { + public: + Tensor dense_contents_; + + // for concat kernel + std::vector dense_tensors_; + std::vector length_; + int64_t all_length_{0}; + std::vector origin_shapes_; + + // Global indices of participating tensors in the group + std::vector tensor_indices_; + + // Number of params that haven't been ready. When it is 0, it means + // the group is ready. 
+ size_t pending_ = -1; + + // external message of group + phi::DataType dtype_; + + // help to sync + std::shared_ptr task; + + // context is used to select the stream for concat + void ConcatTensors(const platform::Place &); + + // context is used to select the stream for split + void SplitTensors(const platform::Place &); + + friend std::ostream &operator<<(std::ostream &, const EagerGroup &); +}; + +struct TensorLocator { + // record the index in groups_ + size_t group_index; + size_t inside_group_index; +}; + +class EagerReducer { + public: + explicit EagerReducer( + const std::vector tensors, + const std::vector> &group_indices, + const std::vector &is_sparse_gradient, + std::shared_ptr process_group, + const std::vector &group_size_limits, + bool find_unused_parameters); + + virtual ~EagerReducer() {} + + std::shared_ptr GetGradNodeFromTensor(Tensor *tensor); + + void InitializeGroups(const std::vector> &group_indices); + void InitializeDenseGroups(const std::vector &tensor_indices_, + EagerGroup *p_group); + void PrepareForBackward(const std::vector &outputs); + void AddDistHook(size_t var_index); + void MarkVarReady(const size_t var_index, const bool is_used_var); + void MarkGroupReady(const size_t group_index); + void FusedAllReduceSchedule(EagerGroup *group, const int curr_group_index); + void FinalizeBackward(); + void TraverseBackwardGraph(const std::vector &outputs); + void ProcessUnusedDenseVars(); + bool HasGrad(size_t var_index); + + private: + std::vector tensors_; + std::vector> group_indices_; + std::vector is_sparse_gradient_; + std::shared_ptr process_group_; + std::vector group_size_limits_; + + std::vector groups_; + std::vector variable_locators_; + PlaceType place_; + platform::Place inner_place_; + size_t next_group_ = 0; + int64_t nranks_ = -1; + + bool grad_need_hooks_{false}; + + std::vector vars_marked_ready_; + std::vector local_used_vars_; + + // Following variables are to help unused vars + std::vector unused_vars_; + std::map gradnode_index_map_; + bool has_marked_unused_vars_{false}; + bool find_unused_vars_each_step_{false}; + bool find_unused_vars_once_{true}; + bool groups_need_finalize_{false}; + Tensor global_used_vars_; +}; } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt index 3e734b1b9ed241f54e14d8a7c94b834674db1054..8641b36a1be8ea51dc4ad911214c2cebe6121e20 100644 --- a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt +++ b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt @@ -4,7 +4,7 @@ if(WITH_PYTHON) endif() proto_library(interceptor_message_proto SRCS interceptor_message.proto) -if(WITH_DISTRIBUTE AND WITH_PSCORE AND NOT (WITH_ASCEND OR WITH_ASCEND_CL)) +if(WITH_DISTRIBUTE AND WITH_PSCORE) set(BRPC_DEPS brpc ssl crypto protobuf zlib leveldb snappy gflags glog) else() set(BRPC_DEPS "") diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.cc b/paddle/fluid/distributed/fleet_executor/message_bus.cc index 8d2ec5c41d86499393f62c65c4519960669b8fd8..80a6b4667aa1a0dbfd957a390c9202ea1a4d2b68 100644 --- a/paddle/fluid/distributed/fleet_executor/message_bus.cc +++ b/paddle/fluid/distributed/fleet_executor/message_bus.cc @@ -67,8 +67,7 @@ bool MessageBus::IsInit() const { return is_init_; } MessageBus::~MessageBus() { VLOG(3) << "Message bus releases resource."; -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && 
defined(PADDLE_WITH_PSCORE) server_.Stop(1000); server_.Join(); #endif @@ -87,8 +86,7 @@ bool MessageBus::Send(int64_t dst_rank, IsInit(), true, platform::errors::PreconditionNotMet( "Using message bus since it has not been initialized.")); -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) int retry_time = 0; // message bus will retry sending for 10 times while (retry_time < 10) { ++retry_time; @@ -173,8 +171,7 @@ void MessageBus::ListenPort() { LOG(INFO) << "No need listen to port since training on single card."; return; } -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) // function keep listen the port and handle the message PADDLE_ENFORCE_EQ( server_.AddService(&message_service_, brpc::SERVER_DOESNT_OWN_SERVICE), 0, @@ -203,8 +200,7 @@ void MessageBus::ListenPort() { #endif } -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) bool MessageBus::SendInterRank(int64_t dst_rank, const InterceptorMessage& interceptor_message) { const auto& dst_addr = GetAddr(dst_rank); diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.h b/paddle/fluid/distributed/fleet_executor/message_bus.h index d805ac81606b8928b069174bf8aadc693db2aa0c..dfd65fdbc00d445a11f60f4e1cde4f4da77b80dc 100644 --- a/paddle/fluid/distributed/fleet_executor/message_bus.h +++ b/paddle/fluid/distributed/fleet_executor/message_bus.h @@ -20,8 +20,7 @@ #include #include -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) #include "brpc/channel.h" #include "brpc/server.h" #include "paddle/fluid/distributed/fleet_executor/message_service.h" @@ -64,8 +63,7 @@ class MessageBus final { const std::string& GetAddr(int64_t rank) const; -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) // send the message inter rank (dst is different rank with src) bool SendInterRank(int64_t dst_rank, const InterceptorMessage& interceptor_message); @@ -81,8 +79,7 @@ class MessageBus final { // the ip needs to be listened std::string addr_; -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) MessageServiceImpl message_service_; // brpc server brpc::Server server_; diff --git a/paddle/fluid/distributed/fleet_executor/message_service.cc b/paddle/fluid/distributed/fleet_executor/message_service.cc index c3fff98f684ad5f0feb74f30fd51404d4693c7f9..1c66d83ea34d702733b3a5c0386abb62d4e1ec8a 100644 --- a/paddle/fluid/distributed/fleet_executor/message_service.cc +++ b/paddle/fluid/distributed/fleet_executor/message_service.cc @@ -11,8 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
-#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) #include "paddle/fluid/distributed/fleet_executor/message_service.h" #include "brpc/server.h" #include "paddle/fluid/distributed/fleet_executor/global.h" diff --git a/paddle/fluid/distributed/fleet_executor/message_service.h b/paddle/fluid/distributed/fleet_executor/message_service.h index 02f73471e3b911adc622ca990bca70b7a5f3033d..5ab687ff93dc4fc2ccd0884456cdbf2d6c3c6fcb 100644 --- a/paddle/fluid/distributed/fleet_executor/message_service.h +++ b/paddle/fluid/distributed/fleet_executor/message_service.h @@ -11,8 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) #pragma once #include "brpc/server.h" diff --git a/paddle/fluid/distributed/ps.proto b/paddle/fluid/distributed/ps.proto index 0ae87812bce434be5e664aefea4bba19ae147d28..fac30e26c388c65af13135699a886a3c69031d57 100644 --- a/paddle/fluid/distributed/ps.proto +++ b/paddle/fluid/distributed/ps.proto @@ -115,6 +115,7 @@ message TableParameter { optional CommonAccessorParameter common = 6; optional TableType type = 7; optional bool compress_in_save = 8 [ default = false ]; + optional GraphParameter graph_parameter = 9; } message TableAccessorParameter { @@ -211,3 +212,25 @@ message SparseAdamSGDParameter { // SparseAdamSGDRule optional double ada_epsilon = 5 [ default = 1e-08 ]; repeated float weight_bounds = 6; } + +message GraphParameter { + optional int32 task_pool_size = 1 [ default = 24 ]; + optional bool gpups_mode = 2 [ default = false ]; + optional string gpups_graph_sample_class = 3 + [ default = "CompleteGraphSampler" ]; + optional string gpups_graph_sample_args = 4 [ default = "" ]; + optional bool use_cache = 5 [ default = true ]; + optional float cache_ratio = 6 [ default = 0.3 ]; + optional int32 cache_ttl = 7 [ default = 5 ]; + optional GraphFeature graph_feature = 8; + optional string table_name = 9 [ default = "" ]; + optional string table_type = 10 [ default = "" ]; + optional int32 gpups_mode_shard_num = 11 [ default = 127 ]; + optional int32 gpu_num = 12 [ default = 1 ]; +} + +message GraphFeature { + repeated string name = 1; + repeated string dtype = 2; + repeated int32 shape = 3; +} \ No newline at end of file diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_client.cc b/paddle/fluid/distributed/ps/service/graph_brpc_client.cc index 301708f6b7bb3d465d8dcbd2b94bbc4c217fcc77..a3db88e3b679da63a9b205cc013d579cf9a4be2f 100644 --- a/paddle/fluid/distributed/ps/service/graph_brpc_client.cc +++ b/paddle/fluid/distributed/ps/service/graph_brpc_client.cc @@ -44,7 +44,7 @@ void GraphPsService_Stub::service( } } -int GraphBrpcClient::get_server_index_by_id(uint64_t id) { +int GraphBrpcClient::get_server_index_by_id(int64_t id) { int shard_num = get_shard_num(); int shard_per_server = shard_num % server_size == 0 ? 
shard_num / server_size @@ -53,7 +53,7 @@ int GraphBrpcClient::get_server_index_by_id(uint64_t id) { } std::future GraphBrpcClient::get_node_feat( - const uint32_t &table_id, const std::vector &node_ids, + const uint32_t &table_id, const std::vector &node_ids, const std::vector &feature_names, std::vector> &res) { std::vector request2server; @@ -66,7 +66,7 @@ std::future GraphBrpcClient::get_node_feat( } } size_t request_call_num = request2server.size(); - std::vector> node_id_buckets(request_call_num); + std::vector> node_id_buckets(request_call_num); std::vector> query_idx_buckets(request_call_num); for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { int server_index = get_server_index_by_id(node_ids[query_idx]); @@ -129,7 +129,7 @@ std::future GraphBrpcClient::get_node_feat( closure->request(request_idx) ->add_params((char *)node_id_buckets[request_idx].data(), - sizeof(uint64_t) * node_num); + sizeof(int64_t) * node_num); std::string joint_feature_name = paddle::string::join_strings(feature_names, '\t'); closure->request(request_idx) @@ -179,9 +179,9 @@ std::future GraphBrpcClient::clear_nodes(uint32_t table_id) { return fut; } std::future GraphBrpcClient::add_graph_node( - uint32_t table_id, std::vector &node_id_list, + uint32_t table_id, std::vector &node_id_list, std::vector &is_weighted_list) { - std::vector> request_bucket; + std::vector> request_bucket; std::vector> is_weighted_bucket; bool add_weight = is_weighted_list.size() > 0; std::vector server_index_arr; @@ -191,7 +191,7 @@ std::future GraphBrpcClient::add_graph_node( if (index_mapping[server_index] == -1) { index_mapping[server_index] = request_bucket.size(); server_index_arr.push_back(server_index); - request_bucket.push_back(std::vector()); + request_bucket.push_back(std::vector()); if (add_weight) is_weighted_bucket.push_back(std::vector()); } request_bucket[index_mapping[server_index]].push_back( @@ -229,7 +229,7 @@ std::future GraphBrpcClient::add_graph_node( size_t node_num = request_bucket[request_idx].size(); closure->request(request_idx) ->add_params((char *)request_bucket[request_idx].data(), - sizeof(uint64_t) * node_num); + sizeof(int64_t) * node_num); if (add_weight) { bool weighted[is_weighted_bucket[request_idx].size() + 1]; for (size_t j = 0; j < is_weighted_bucket[request_idx].size(); j++) @@ -248,8 +248,8 @@ std::future GraphBrpcClient::add_graph_node( return fut; } std::future GraphBrpcClient::remove_graph_node( - uint32_t table_id, std::vector &node_id_list) { - std::vector> request_bucket; + uint32_t table_id, std::vector &node_id_list) { + std::vector> request_bucket; std::vector server_index_arr; std::vector index_mapping(server_size, -1); for (size_t query_idx = 0; query_idx < node_id_list.size(); ++query_idx) { @@ -257,7 +257,7 @@ std::future GraphBrpcClient::remove_graph_node( if (index_mapping[server_index] == -1) { index_mapping[server_index] = request_bucket.size(); server_index_arr.push_back(server_index); - request_bucket.push_back(std::vector()); + request_bucket.push_back(std::vector()); } request_bucket[index_mapping[server_index]].push_back( node_id_list[query_idx]); @@ -291,7 +291,7 @@ std::future GraphBrpcClient::remove_graph_node( closure->request(request_idx) ->add_params((char *)request_bucket[request_idx].data(), - sizeof(uint64_t) * node_num); + sizeof(int64_t) * node_num); // PsService_Stub rpc_stub(get_cmd_channel(server_index)); GraphPsService_Stub rpc_stub = getServiceStub(get_cmd_channel(server_index)); @@ -303,9 +303,9 @@ std::future 
GraphBrpcClient::remove_graph_node( } // char* &buffer,int &actual_size std::future GraphBrpcClient::batch_sample_neighbors( - uint32_t table_id, std::vector node_ids, int sample_size, - // std::vector>> &res, - std::vector> &res, + uint32_t table_id, std::vector node_ids, int sample_size, + // std::vector>> &res, + std::vector> &res, std::vector> &res_weight, bool need_weight, int server_index) { if (server_index != -1) { @@ -337,7 +337,7 @@ std::future GraphBrpcClient::batch_sample_neighbors( int start = 0; while (start < actual_size) { res[node_idx].emplace_back( - *(uint64_t *)(node_buffer + offset + start)); + *(int64_t *)(node_buffer + offset + start)); start += GraphNode::id_size; if (need_weight) { res_weight[node_idx].emplace_back( @@ -358,7 +358,7 @@ std::future GraphBrpcClient::batch_sample_neighbors( closure->request(0)->set_table_id(table_id); closure->request(0)->set_client_id(_client_id); closure->request(0)->add_params((char *)node_ids.data(), - sizeof(uint64_t) * node_ids.size()); + sizeof(int64_t) * node_ids.size()); closure->request(0)->add_params((char *)&sample_size, sizeof(int)); closure->request(0)->add_params((char *)&need_weight, sizeof(bool)); ; @@ -380,14 +380,14 @@ std::future GraphBrpcClient::batch_sample_neighbors( server2request[server_index] = request2server.size(); request2server.push_back(server_index); } - // res.push_back(std::vector>()); + // res.push_back(std::vector>()); res.push_back({}); if (need_weight) { res_weight.push_back({}); } } size_t request_call_num = request2server.size(); - std::vector> node_id_buckets(request_call_num); + std::vector> node_id_buckets(request_call_num); std::vector> query_idx_buckets(request_call_num); for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { int server_index = get_server_index_by_id(node_ids[query_idx]); @@ -428,7 +428,7 @@ std::future GraphBrpcClient::batch_sample_neighbors( int start = 0; while (start < actual_size) { res[query_idx].emplace_back( - *(uint64_t *)(node_buffer + offset + start)); + *(int64_t *)(node_buffer + offset + start)); start += GraphNode::id_size; if (need_weight) { res_weight[query_idx].emplace_back( @@ -459,7 +459,7 @@ std::future GraphBrpcClient::batch_sample_neighbors( closure->request(request_idx) ->add_params((char *)node_id_buckets[request_idx].data(), - sizeof(uint64_t) * node_num); + sizeof(int64_t) * node_num); closure->request(request_idx) ->add_params((char *)&sample_size, sizeof(int)); closure->request(request_idx) @@ -476,7 +476,7 @@ std::future GraphBrpcClient::batch_sample_neighbors( } std::future GraphBrpcClient::random_sample_nodes( uint32_t table_id, int server_index, int sample_size, - std::vector &ids) { + std::vector &ids) { DownpourBrpcClosure *closure = new DownpourBrpcClosure(1, [&](void *done) { int ret = 0; auto *closure = (DownpourBrpcClosure *)done; @@ -490,7 +490,7 @@ std::future GraphBrpcClient::random_sample_nodes( auto size = io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); int index = 0; while (index < bytes_size) { - ids.push_back(*(uint64_t *)(buffer + index)); + ids.push_back(*(int64_t *)(buffer + index)); index += GraphNode::id_size; } delete[] buffer; @@ -633,7 +633,7 @@ std::future GraphBrpcClient::pull_graph_list( } std::future GraphBrpcClient::set_node_feat( - const uint32_t &table_id, const std::vector &node_ids, + const uint32_t &table_id, const std::vector &node_ids, const std::vector &feature_names, const std::vector> &features) { std::vector request2server; @@ -646,7 +646,7 @@ std::future 
GraphBrpcClient::set_node_feat( } } size_t request_call_num = request2server.size(); - std::vector> node_id_buckets(request_call_num); + std::vector> node_id_buckets(request_call_num); std::vector> query_idx_buckets(request_call_num); std::vector>> features_idx_buckets( request_call_num); @@ -696,7 +696,7 @@ std::future GraphBrpcClient::set_node_feat( closure->request(request_idx) ->add_params((char *)node_id_buckets[request_idx].data(), - sizeof(uint64_t) * node_num); + sizeof(int64_t) * node_num); std::string joint_feature_name = paddle::string::join_strings(feature_names, '\t'); closure->request(request_idx) diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_client.h b/paddle/fluid/distributed/ps/service/graph_brpc_client.h index 06e753d028baa2d9c0002620dc445d4204046180..e2b8a518615dc511a726c4be104cb03900dd2e9a 100644 --- a/paddle/fluid/distributed/ps/service/graph_brpc_client.h +++ b/paddle/fluid/distributed/ps/service/graph_brpc_client.h @@ -63,8 +63,8 @@ class GraphBrpcClient : public BrpcPsClient { virtual ~GraphBrpcClient() {} // given a batch of nodes, sample graph_neighbors for each of them virtual std::future batch_sample_neighbors( - uint32_t table_id, std::vector node_ids, int sample_size, - std::vector>& res, + uint32_t table_id, std::vector node_ids, int sample_size, + std::vector>& res, std::vector>& res_weight, bool need_weight, int server_index = -1); @@ -75,20 +75,20 @@ class GraphBrpcClient : public BrpcPsClient { virtual std::future random_sample_nodes(uint32_t table_id, int server_index, int sample_size, - std::vector& ids); + std::vector& ids); virtual std::future get_node_feat( - const uint32_t& table_id, const std::vector& node_ids, + const uint32_t& table_id, const std::vector& node_ids, const std::vector& feature_names, std::vector>& res); virtual std::future set_node_feat( - const uint32_t& table_id, const std::vector& node_ids, + const uint32_t& table_id, const std::vector& node_ids, const std::vector& feature_names, const std::vector>& features); virtual std::future clear_nodes(uint32_t table_id); virtual std::future add_graph_node( - uint32_t table_id, std::vector& node_id_list, + uint32_t table_id, std::vector& node_id_list, std::vector& is_weighted_list); virtual std::future use_neighbors_sample_cache(uint32_t table_id, size_t size_limit, @@ -96,11 +96,11 @@ class GraphBrpcClient : public BrpcPsClient { virtual std::future load_graph_split_config(uint32_t table_id, std::string path); virtual std::future remove_graph_node( - uint32_t table_id, std::vector& node_id_list); + uint32_t table_id, std::vector& node_id_list); virtual int32_t initialize(); int get_shard_num() { return shard_num; } void set_shard_num(int shard_num) { this->shard_num = shard_num; } - int get_server_index_by_id(uint64_t id); + int get_server_index_by_id(int64_t id); void set_local_channel(int index) { this->local_channel = get_cmd_channel(index); } diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc index 441f489fb3097cda51fc62dc35e93264a1f7caef..20a55e4d11983dad37b9e2e7845923dded881d3b 100644 --- a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc +++ b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc @@ -140,9 +140,9 @@ int32_t GraphBrpcService::add_graph_node(Table *table, return 0; } - size_t node_num = request.params(0).size() / sizeof(uint64_t); - uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); - std::vector node_ids(node_data, node_data + node_num); + size_t 
node_num = request.params(0).size() / sizeof(int64_t); + int64_t *node_data = (int64_t *)(request.params(0).c_str()); + std::vector node_ids(node_data, node_data + node_num); std::vector is_weighted_list; if (request.params_size() == 2) { size_t weight_list_size = request.params(1).size() / sizeof(bool); @@ -165,9 +165,9 @@ int32_t GraphBrpcService::remove_graph_node(Table *table, "graph_get_node_feat request requires at least 1 argument"); return 0; } - size_t node_num = request.params(0).size() / sizeof(uint64_t); - uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); - std::vector node_ids(node_data, node_data + node_num); + size_t node_num = request.params(0).size() / sizeof(int64_t); + int64_t *node_data = (int64_t *)(request.params(0).c_str()); + std::vector node_ids(node_data, node_data + node_num); ((GraphTable *)table)->remove_graph_node(node_ids); return 0; @@ -386,9 +386,9 @@ int32_t GraphBrpcService::graph_random_sample_neighbors( "graph_random_sample_neighbors request requires at least 3 arguments"); return 0; } - size_t node_num = request.params(0).size() / sizeof(uint64_t); - uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); - int sample_size = *(uint64_t *)(request.params(1).c_str()); + size_t node_num = request.params(0).size() / sizeof(int64_t); + int64_t *node_data = (int64_t *)(request.params(0).c_str()); + int sample_size = *(int64_t *)(request.params(1).c_str()); bool need_weight = *(bool *)(request.params(2).c_str()); std::vector> buffers(node_num); std::vector actual_sizes(node_num, 0); @@ -407,7 +407,7 @@ int32_t GraphBrpcService::graph_random_sample_neighbors( int32_t GraphBrpcService::graph_random_sample_nodes( Table *table, const PsRequestMessage &request, PsResponseMessage &response, brpc::Controller *cntl) { - size_t size = *(uint64_t *)(request.params(0).c_str()); + size_t size = *(int64_t *)(request.params(0).c_str()); std::unique_ptr buffer; int actual_size; if (((GraphTable *)table)->random_sample_nodes(size, buffer, actual_size) == @@ -430,9 +430,9 @@ int32_t GraphBrpcService::graph_get_node_feat(Table *table, "graph_get_node_feat request requires at least 2 arguments"); return 0; } - size_t node_num = request.params(0).size() / sizeof(uint64_t); - uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); - std::vector node_ids(node_data, node_data + node_num); + size_t node_num = request.params(0).size() / sizeof(int64_t); + int64_t *node_data = (int64_t *)(request.params(0).c_str()); + std::vector node_ids(node_data, node_data + node_num); std::vector feature_names = paddle::string::split_string(request.params(1), "\t"); @@ -464,16 +464,16 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers( "at least 3 arguments"); return 0; } - size_t node_num = request.params(0).size() / sizeof(uint64_t), + size_t node_num = request.params(0).size() / sizeof(int64_t), size_of_size_t = sizeof(size_t); - uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); - int sample_size = *(uint64_t *)(request.params(1).c_str()); - bool need_weight = *(uint64_t *)(request.params(2).c_str()); - // std::vector res = ((GraphTable + int64_t *node_data = (int64_t *)(request.params(0).c_str()); + int sample_size = *(int64_t *)(request.params(1).c_str()); + bool need_weight = *(int64_t *)(request.params(2).c_str()); + // std::vector res = ((GraphTable // *)table).filter_out_non_exist_nodes(node_data, sample_size); std::vector request2server; std::vector server2request(server_size, -1); - std::vector local_id; + std::vector local_id; 
std::vector local_query_idx; size_t rank = get_rank(); for (int query_idx = 0; query_idx < node_num; ++query_idx) { @@ -496,7 +496,7 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers( std::vector> local_buffers; std::vector local_actual_sizes; std::vector seq; - std::vector> node_id_buckets(request_call_num); + std::vector> node_id_buckets(request_call_num); std::vector> query_idx_buckets(request_call_num); for (int query_idx = 0; query_idx < node_num; ++query_idx) { int server_index = @@ -583,7 +583,7 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers( closure->request(request_idx) ->add_params((char *)node_id_buckets[request_idx].data(), - sizeof(uint64_t) * node_num); + sizeof(int64_t) * node_num); closure->request(request_idx) ->add_params((char *)&sample_size, sizeof(int)); closure->request(request_idx) @@ -618,9 +618,9 @@ int32_t GraphBrpcService::graph_set_node_feat(Table *table, "graph_set_node_feat request requires at least 3 arguments"); return 0; } - size_t node_num = request.params(0).size() / sizeof(uint64_t); - uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); - std::vector node_ids(node_data, node_data + node_num); + size_t node_num = request.params(0).size() / sizeof(int64_t); + int64_t *node_data = (int64_t *)(request.params(0).c_str()); + std::vector node_ids(node_data, node_data + node_num); std::vector feature_names = paddle::string::split_string(request.params(1), "\t"); diff --git a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc index 088edcb75bbc67d6d2acef9609b442f6fa38c332..c8be0f797109078509eeced53920845ac4c51684 100644 --- a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc +++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc @@ -44,9 +44,9 @@ void GraphPyService::add_table_feat_conf(std::string table_name, } } -void add_graph_node(std::vector node_ids, +void add_graph_node(std::vector node_ids, std::vector weight_list) {} -void remove_graph_node(std::vector node_ids) {} +void remove_graph_node(std::vector node_ids) {} void GraphPyService::set_up(std::string ips_str, int shard_num, std::vector node_types, std::vector edge_types) { @@ -260,7 +260,7 @@ void GraphPyClient::clear_nodes(std::string name) { } void GraphPyClient::add_graph_node(std::string name, - std::vector& node_ids, + std::vector& node_ids, std::vector& weight_list) { if (this->table_id_map.count(name)) { uint32_t table_id = this->table_id_map[name]; @@ -271,7 +271,7 @@ void GraphPyClient::add_graph_node(std::string name, } void GraphPyClient::remove_graph_node(std::string name, - std::vector& node_ids) { + std::vector& node_ids) { if (this->table_id_map.count(name)) { uint32_t table_id = this->table_id_map[name]; auto status = get_ps_client()->remove_graph_node(table_id, node_ids); @@ -290,13 +290,12 @@ void GraphPyClient::load_node_file(std::string name, std::string filepath) { } } -std::pair>, std::vector> +std::pair>, std::vector> GraphPyClient::batch_sample_neighbors(std::string name, - std::vector node_ids, + std::vector node_ids, int sample_size, bool return_weight, bool return_edges) { - // std::vector>> v; - std::vector> v; + std::vector> v; std::vector> v1; if (this->table_id_map.count(name)) { uint32_t table_id = this->table_id_map[name]; @@ -309,7 +308,7 @@ GraphPyClient::batch_sample_neighbors(std::string name, // res.first[1]: slice index // res.first[2]: src nodes // res.second: edges weight - std::pair>, 
std::vector> res; + std::pair>, std::vector> res; res.first.push_back({}); res.first.push_back({}); if (return_edges) res.first.push_back({}); @@ -342,10 +341,10 @@ void GraphPyClient::use_neighbors_sample_cache(std::string name, status.wait(); } } -std::vector GraphPyClient::random_sample_nodes(std::string name, - int server_index, - int sample_size) { - std::vector v; +std::vector GraphPyClient::random_sample_nodes(std::string name, + int server_index, + int sample_size) { + std::vector v; if (this->table_id_map.count(name)) { uint32_t table_id = this->table_id_map[name]; auto status = @@ -357,7 +356,7 @@ std::vector GraphPyClient::random_sample_nodes(std::string name, // (name, dtype, ndarray) std::vector> GraphPyClient::get_node_feat( - std::string node_type, std::vector node_ids, + std::string node_type, std::vector node_ids, std::vector feature_names) { std::vector> v( feature_names.size(), std::vector(node_ids.size())); @@ -371,7 +370,7 @@ std::vector> GraphPyClient::get_node_feat( } void GraphPyClient::set_node_feat( - std::string node_type, std::vector node_ids, + std::string node_type, std::vector node_ids, std::vector feature_names, const std::vector> features) { if (this->table_id_map.count(node_type)) { diff --git a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h index c25ef5035453ded0996cfe190dec71b0ce4b9b4a..85707137c1800ed9486148584ce22a78c52a47fd 100644 --- a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h +++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h @@ -70,18 +70,34 @@ class GraphPyService { ::paddle::distributed::TableAccessorParameter* accessor_proto = sparse_table_proto->mutable_accessor(); - ::paddle::distributed::CommonAccessorParameter* common_proto = - sparse_table_proto->mutable_common(); + // ::paddle::distributed::CommonAccessorParameter* common_proto = + // sparse_table_proto->mutable_common(); + ::paddle::distributed::GraphParameter* graph_proto = + sparse_table_proto->mutable_graph_parameter(); + + ::paddle::distributed::GraphFeature* graph_feature = + graph_proto->mutable_graph_feature(); + + graph_proto->set_task_pool_size(24); + + graph_proto->set_table_name(table_name); + graph_proto->set_table_type(table_type); + graph_proto->set_use_cache(false); // Set GraphTable Parameter - common_proto->set_table_name(table_name); - common_proto->set_name(table_type); + // common_proto->set_table_name(table_name); + // common_proto->set_name(table_type); + // for (size_t i = 0; i < feat_name.size(); i++) { + // common_proto->add_params(feat_dtype[i]); + // common_proto->add_dims(feat_shape[i]); + // common_proto->add_attributes(feat_name[i]); + // } + for (size_t i = 0; i < feat_name.size(); i++) { - common_proto->add_params(feat_dtype[i]); - common_proto->add_dims(feat_shape[i]); - common_proto->add_attributes(feat_name[i]); + graph_feature->add_dtype(feat_dtype[i]); + graph_feature->add_shape(feat_shape[i]); + graph_feature->add_name(feat_name[i]); } - accessor_proto->set_accessor_class("CommMergeAccessor"); } @@ -143,24 +159,24 @@ class GraphPyClient : public GraphPyService { void load_edge_file(std::string name, std::string filepath, bool reverse); void load_node_file(std::string name, std::string filepath); void clear_nodes(std::string name); - void add_graph_node(std::string name, std::vector& node_ids, + void add_graph_node(std::string name, std::vector& node_ids, std::vector& weight_list); - void remove_graph_node(std::string 
name, std::vector& node_ids); + void remove_graph_node(std::string name, std::vector& node_ids); int get_client_id() { return client_id; } void set_client_id(int client_id) { this->client_id = client_id; } void start_client(); - std::pair>, std::vector> - batch_sample_neighbors(std::string name, std::vector node_ids, + std::pair>, std::vector> + batch_sample_neighbors(std::string name, std::vector node_ids, int sample_size, bool return_weight, bool return_edges); - std::vector random_sample_nodes(std::string name, int server_index, - int sample_size); + std::vector random_sample_nodes(std::string name, int server_index, + int sample_size); std::vector> get_node_feat( - std::string node_type, std::vector node_ids, + std::string node_type, std::vector node_ids, std::vector feature_names); void use_neighbors_sample_cache(std::string name, size_t total_size_limit, size_t ttl); - void set_node_feat(std::string node_type, std::vector node_ids, + void set_node_feat(std::string node_type, std::vector node_ids, std::vector feature_names, const std::vector> features); std::vector pull_graph_list(std::string name, int server_index, diff --git a/paddle/fluid/distributed/ps/table/CMakeLists.txt b/paddle/fluid/distributed/ps/table/CMakeLists.txt index be916bf2e800308cdebbbfbe4e5ff4c467cf3f6f..2fa5ecb4051c568fa0697b236bcfb9c00e4319bf 100644 --- a/paddle/fluid/distributed/ps/table/CMakeLists.txt +++ b/paddle/fluid/distributed/ps/table/CMakeLists.txt @@ -53,7 +53,6 @@ cc_library(memory_sparse_table SRCS memory_sparse_table.cc DEPS ps_framework_pro set_source_files_properties(memory_sparse_geo_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_library(memory_sparse_geo_table SRCS memory_sparse_geo_table.cc DEPS ps_framework_proto ${TABLE_DEPS} common_table) - cc_library(table SRCS table.cc DEPS memory_sparse_table memory_sparse_geo_table common_table tensor_accessor tensor_table ps_framework_proto string_helper device_context gflags glog boost) target_link_libraries(table -fopenmp) diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.cc b/paddle/fluid/distributed/ps/table/common_graph_table.cc index 54b98cb96ce5196bb5133f777b2571f4d3d43c6e..2c07bd65d63d408b1bff12eda7bcf8fba3336db6 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.cc +++ b/paddle/fluid/distributed/ps/table/common_graph_table.cc @@ -27,6 +27,288 @@ namespace paddle { namespace distributed { +#ifdef PADDLE_WITH_HETERPS + +int CompleteGraphSampler::run_graph_sampling() { + pthread_rwlock_t *rw_lock = graph_table->rw_lock.get(); + pthread_rwlock_rdlock(rw_lock); + std::cout << "in graph sampling" << std::endl; + sample_nodes.clear(); + sample_neighbors.clear(); + sample_res.clear(); + sample_nodes.resize(gpu_num); + sample_neighbors.resize(gpu_num); + sample_res.resize(gpu_num); + std::vector>> + sample_nodes_ex(graph_table->task_pool_size_); + std::vector>> sample_neighbors_ex( + graph_table->task_pool_size_); + for (int i = 0; i < graph_table->task_pool_size_; i++) { + sample_nodes_ex[i].resize(gpu_num); + sample_neighbors_ex[i].resize(gpu_num); + } + std::vector> tasks; + for (size_t i = 0; i < graph_table->shards.size(); ++i) { + tasks.push_back( + graph_table->_shards_task_pool[i % graph_table->task_pool_size_] + ->enqueue([&, i, this]() -> int { + if (this->status == GraphSamplerStatus::terminating) return 0; + paddle::framework::GpuPsGraphNode node; + std::vector &v = + this->graph_table->shards[i]->get_bucket(); + size_t ind = i % this->graph_table->task_pool_size_; + for (size_t j = 0; j < 
v.size(); j++) { + size_t location = v[j]->get_id() % this->gpu_num; + node.node_id = v[j]->get_id(); + node.neighbor_size = v[j]->get_neighbor_size(); + node.neighbor_offset = + (int)sample_neighbors_ex[ind][location].size(); + sample_nodes_ex[ind][location].emplace_back(node); + for (int k = 0; k < node.neighbor_size; k++) + sample_neighbors_ex[ind][location].push_back( + v[j]->get_neighbor_id(k)); + } + return 0; + })); + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + tasks.clear(); + for (size_t i = 0; i < gpu_num; i++) { + tasks.push_back( + graph_table->_shards_task_pool[i % graph_table->task_pool_size_] + ->enqueue([&, i, this]() -> int { + if (this->status == GraphSamplerStatus::terminating) return 0; + int total_offset = 0; + size_t ind = i % this->graph_table->task_pool_size_; + for (int j = 0; j < this->graph_table->task_pool_size_; j++) { + for (size_t k = 0; k < sample_nodes_ex[j][ind].size(); k++) { + sample_nodes[ind].push_back(sample_nodes_ex[j][ind][k]); + sample_nodes[ind].back().neighbor_offset += total_offset; + } + size_t neighbor_size = sample_neighbors_ex[j][ind].size(); + total_offset += neighbor_size; + for (size_t k = 0; k < neighbor_size; k++) { + sample_neighbors[ind].push_back( + sample_neighbors_ex[j][ind][k]); + } + } + return 0; + })); + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + + if (this->status == GraphSamplerStatus::terminating) { + pthread_rwlock_unlock(rw_lock); + return 0; + } + for (size_t i = 0; i < gpu_num; i++) { + sample_res[i].node_list = sample_nodes[i].data(); + sample_res[i].neighbor_list = sample_neighbors[i].data(); + sample_res[i].node_size = sample_nodes[i].size(); + sample_res[i].neighbor_size = sample_neighbors[i].size(); + } + pthread_rwlock_unlock(rw_lock); + if (this->status == GraphSamplerStatus::terminating) { + return 0; + } + callback(sample_res); + return 0; +} +void CompleteGraphSampler::init(size_t gpu_num, GraphTable *graph_table, + std::vector args) { + this->gpu_num = gpu_num; + this->graph_table = graph_table; +} + +int BasicBfsGraphSampler::run_graph_sampling() { + pthread_rwlock_t *rw_lock = graph_table->rw_lock.get(); + pthread_rwlock_rdlock(rw_lock); + while (rounds > 0 && status == GraphSamplerStatus::running) { + for (size_t i = 0; i < sample_neighbors_map.size(); i++) { + sample_neighbors_map[i].clear(); + } + sample_neighbors_map.clear(); + std::vector nodes_left(graph_table->shards.size(), + node_num_for_each_shard); + std::promise prom; + std::future fut = prom.get_future(); + sample_neighbors_map.resize(graph_table->task_pool_size_); + int task_size = 0; + std::vector> tasks; + int init_size = 0; + //__sync_fetch_and_add + std::function bfs = [&, this](int i, int id) -> int { + VLOG(0) << "in bfs " << i << " " << id; + if (this->status == GraphSamplerStatus::terminating) { + int task_left = __sync_sub_and_fetch(&task_size, 1); + if (task_left == 0) { + prom.set_value(0); + } + return 0; + } + size_t ind = i % this->graph_table->task_pool_size_; + if (nodes_left[i] > 0) { + nodes_left[i]--; + auto iter = sample_neighbors_map[ind].find(id); + if (iter == sample_neighbors_map[ind].end()) { + sample_neighbors_map[ind][id] = std::vector(); + iter = sample_neighbors_map[ind].find(id); + Node *node = graph_table->shards[i]->find_node(id); + if (node != NULL) { + size_t edge_fetch_size = + std::min((size_t) this->edge_num_for_each_node, + node->get_neighbor_size()); + for (size_t k = 0; k < edge_fetch_size; k++) { + int64_t neighbor_id = node->get_neighbor_id(k); + int 
node_location = neighbor_id % this->graph_table->shard_num % + this->graph_table->task_pool_size_; + __sync_add_and_fetch(&task_size, 1); + graph_table->_shards_task_pool[node_location]->enqueue( + bfs, neighbor_id % this->graph_table->shard_num, neighbor_id); + iter->second.push_back(neighbor_id); + } + } + } + } + int task_left = __sync_sub_and_fetch(&task_size, 1); + if (task_left == 0) { + prom.set_value(0); + } + return 0; + }; + for (size_t i = 0; i < graph_table->shards.size(); ++i) { + std::vector &v = graph_table->shards[i]->get_bucket(); + if (v.size() > 0) { + init_size++; + __sync_add_and_fetch(&task_size, 1); + int64_t id = v[0]->get_id(); + graph_table->_shards_task_pool[i % graph_table->task_pool_size_] + ->enqueue(bfs, i, id); + } // if + } + if (init_size == 0) { + prom.set_value(0); + } + fut.get(); + if (this->status == GraphSamplerStatus::terminating) { + pthread_rwlock_unlock(rw_lock); + return 0; + } + std::cout << "bfs over" << std::endl; + sample_nodes.clear(); + sample_neighbors.clear(); + sample_res.clear(); + sample_nodes.resize(gpu_num); + sample_neighbors.resize(gpu_num); + sample_res.resize(gpu_num); + std::vector>> + sample_nodes_ex(graph_table->task_pool_size_); + std::vector>> sample_neighbors_ex( + graph_table->task_pool_size_); + for (int i = 0; i < graph_table->task_pool_size_; i++) { + sample_nodes_ex[i].resize(gpu_num); + sample_neighbors_ex[i].resize(gpu_num); + } + tasks.clear(); + for (size_t i = 0; i < (size_t)graph_table->task_pool_size_; ++i) { + tasks.push_back( + graph_table->_shards_task_pool[i]->enqueue([&, i, this]() -> int { + if (this->status == GraphSamplerStatus::terminating) { + return 0; + } + paddle::framework::GpuPsGraphNode node; + auto iter = sample_neighbors_map[i].begin(); + size_t ind = i; + for (; iter != sample_neighbors_map[i].end(); iter++) { + size_t location = iter->first % this->gpu_num; + node.node_id = iter->first; + node.neighbor_size = iter->second.size(); + node.neighbor_offset = + (int)sample_neighbors_ex[ind][location].size(); + sample_nodes_ex[ind][location].emplace_back(node); + for (auto k : iter->second) + sample_neighbors_ex[ind][location].push_back(k); + } + return 0; + })); + } + + for (size_t i = 0; i < tasks.size(); i++) { + tasks[i].get(); + sample_neighbors_map[i].clear(); + } + tasks.clear(); + if (this->status == GraphSamplerStatus::terminating) { + pthread_rwlock_unlock(rw_lock); + return 0; + } + for (size_t i = 0; i < gpu_num; i++) { + tasks.push_back( + graph_table->_shards_task_pool[i % graph_table->task_pool_size_] + ->enqueue([&, i, this]() -> int { + if (this->status == GraphSamplerStatus::terminating) { + pthread_rwlock_unlock(rw_lock); + return 0; + } + int total_offset = 0; + size_t ind = i % graph_table->task_pool_size_; + for (int j = 0; j < this->graph_table->task_pool_size_; j++) { + for (size_t k = 0; k < sample_nodes_ex[j][ind].size(); k++) { + sample_nodes[i].push_back(sample_nodes_ex[j][ind][k]); + sample_nodes[i].back().neighbor_offset += total_offset; + // neighbor_offset[i].push_back(total_offset + + // neighbor_offset_ex[j][i][k]); + } + size_t neighbor_size = sample_neighbors_ex[j][ind].size(); + total_offset += neighbor_size; + for (size_t k = 0; k < neighbor_size; k++) { + sample_neighbors[ind].push_back( + sample_neighbors_ex[j][ind][k]); + } + } + return 0; + })); + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + if (this->status == GraphSamplerStatus::terminating) { + pthread_rwlock_unlock(rw_lock); + return 0; + } + // int64_t total_neighbors = + // 
std::accumulate(shard_neighbor_size.begin(),shard_neighbor_size.end(),0); + for (size_t i = 0; i < gpu_num; i++) { + sample_res[i].node_list = sample_nodes[i].data(); + sample_res[i].neighbor_list = sample_neighbors[i].data(); + sample_res[i].node_size = sample_nodes[i].size(); + sample_res[i].neighbor_size = sample_neighbors[i].size(); + } + pthread_rwlock_unlock(rw_lock); + if (this->status == GraphSamplerStatus::terminating) { + return 0; + } + callback(sample_res); + rounds--; + if (rounds > 0) { + for (int i = 0; + i < interval && this->status == GraphSamplerStatus::running; i++) { + std::this_thread::sleep_for(std::chrono::seconds(1)); + } + } + } + return 0; +} +void BasicBfsGraphSampler::init(size_t gpu_num, GraphTable *graph_table, + std::vector args) { + this->gpu_num = gpu_num; + this->graph_table = graph_table; + node_num_for_each_shard = args.size() > 0 ? std::stoi(args[0]) : 10; + edge_num_for_each_node = args.size() > 1 ? std::stoi(args[1]) : 10; + rounds = args.size() > 2 ? std::stoi(args[2]) : 1; + interval = args.size() > 3 ? std::stoi(args[3]) : 60; +} + +#endif + std::vector GraphShard::get_batch(int start, int end, int step) { if (start < 0) start = 0; std::vector res; @@ -38,10 +320,10 @@ std::vector GraphShard::get_batch(int start, int end, int step) { size_t GraphShard::get_size() { return bucket.size(); } -int32_t GraphTable::add_graph_node(std::vector &id_list, +int32_t GraphTable::add_graph_node(std::vector &id_list, std::vector &is_weight_list) { size_t node_size = id_list.size(); - std::vector>> batch(task_pool_size_); + std::vector>> batch(task_pool_size_); for (size_t i = 0; i < node_size; i++) { size_t shard_id = id_list[i] % shard_num; if (shard_id >= shard_end || shard_id < shard_start) { @@ -65,9 +347,9 @@ int32_t GraphTable::add_graph_node(std::vector &id_list, return 0; } -int32_t GraphTable::remove_graph_node(std::vector &id_list) { +int32_t GraphTable::remove_graph_node(std::vector &id_list) { size_t node_size = id_list.size(); - std::vector> batch(task_pool_size_); + std::vector> batch(task_pool_size_); for (size_t i = 0; i < node_size; i++) { size_t shard_id = id_list[i] % shard_num; if (shard_id >= shard_end || shard_id < shard_start) continue; @@ -98,7 +380,7 @@ void GraphShard::clear() { GraphShard::~GraphShard() { clear(); } -void GraphShard::delete_node(uint64_t id) { +void GraphShard::delete_node(int64_t id) { auto iter = node_location.find(id); if (iter == node_location.end()) return; int pos = iter->second; @@ -110,7 +392,7 @@ void GraphShard::delete_node(uint64_t id) { node_location.erase(id); bucket.pop_back(); } -GraphNode *GraphShard::add_graph_node(uint64_t id) { +GraphNode *GraphShard::add_graph_node(int64_t id) { if (node_location.find(id) == node_location.end()) { node_location[id] = bucket.size(); bucket.push_back(new GraphNode(id)); @@ -126,7 +408,7 @@ GraphNode *GraphShard::add_graph_node(Node *node) { } return (GraphNode *)bucket[node_location[id]]; } -FeatureNode *GraphShard::add_feature_node(uint64_t id) { +FeatureNode *GraphShard::add_feature_node(int64_t id) { if (node_location.find(id) == node_location.end()) { node_location[id] = bucket.size(); bucket.push_back(new FeatureNode(id)); @@ -134,11 +416,11 @@ FeatureNode *GraphShard::add_feature_node(uint64_t id) { return (FeatureNode *)bucket[node_location[id]]; } -void GraphShard::add_neighbor(uint64_t id, uint64_t dst_id, float weight) { +void GraphShard::add_neighbor(int64_t id, int64_t dst_id, float weight) { find_node(id)->add_edge(dst_id, weight); } -Node 
*GraphShard::find_node(uint64_t id) { +Node *GraphShard::find_node(int64_t id) { auto iter = node_location.find(id); return iter == node_location.end() ? nullptr : bucket[iter->second]; } @@ -185,14 +467,14 @@ int32_t GraphTable::load(const std::string &path, const std::string ¶m) { } int32_t GraphTable::get_nodes_ids_by_ranges( - std::vector> ranges, std::vector &res) { + std::vector> ranges, std::vector &res) { int start = 0, end, index = 0, total_size = 0; res.clear(); - std::vector>> tasks; + std::vector>> tasks; for (size_t i = 0; i < shards.size() && index < (int)ranges.size(); i++) { end = total_size + shards[i]->get_size(); start = total_size; - while (start < end && index < ranges.size()) { + while (start < end && index < (int)ranges.size()) { if (ranges[index].second <= start) index++; else if (ranges[index].first >= end) { @@ -204,7 +486,7 @@ int32_t GraphTable::get_nodes_ids_by_ranges( first -= total_size; second -= total_size; tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( - [this, first, second, i]() -> std::vector { + [this, first, second, i]() -> std::vector { return shards[i]->get_ids_by_range(first, second); })); } @@ -276,6 +558,9 @@ int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) { } int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { +#ifdef PADDLE_WITH_HETERPS + if (gpups_mode) pthread_rwlock_rdlock(rw_lock.get()); +#endif auto paths = paddle::string::split_string(path, ";"); int64_t count = 0; std::string sample_type = "random"; @@ -351,6 +636,13 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { /*----------------------- relocate the duplicate nodes to make them distributed evenly among threads. */ + if (!use_duplicate_nodes) { +#ifdef PADDLE_WITH_HETERPS + if (gpups_mode) pthread_rwlock_unlock(rw_lock.get()); +#endif + + return 0; + } for (auto &shard : extra_shards) { auto bucket = shard->get_bucket(); for (size_t i = 0; i < bucket.size(); i++) { @@ -360,13 +652,13 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { int size = extra_nodes_to_thread_index.size(); if (size == 0) return 0; std::vector index; - for (int i = 0; i < used.size(); i++) index.push_back(i); + for (int i = 0; i < (int)used.size(); i++) index.push_back(i); sort(index.begin(), index.end(), [&](int &a, int &b) { return used[a] < used[b]; }); std::vector alloc(index.size(), 0), has_alloc(index.size(), 0); int t = 1, aim = 0, mod = 0; - for (; t < used.size(); t++) { + for (; t < (int)used.size(); t++) { if ((used[index[t]] - used[index[t - 1]]) * t >= size) { break; } else { @@ -380,7 +672,7 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { if (t - x <= mod) alloc[index[x]]++; alloc[index[x]] -= used[index[x]]; } - std::vector vec[index.size()]; + std::vector vec[index.size()]; for (auto p : extra_nodes_to_thread_index) { has_alloc[p.second]++; vec[p.second].push_back(p.first); @@ -395,7 +687,7 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { has_alloc[index[right]] - alloc[index[right]]); has_alloc[index[left]] += x; has_alloc[index[right]] -= x; - uint64_t id; + int64_t id; while (x--) { id = vec[index[right]].back(); vec[index[right]].pop_back(); @@ -424,10 +716,13 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { delete extra_shards[i]; extra_shards[i] = extra_shards_copy[i]; } +#ifdef PADDLE_WITH_HETERPS + if (gpups_mode) pthread_rwlock_unlock(rw_lock.get()); +#endif 
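Both load_edges() and the samplers above bracket their work with pthread_rwlock_rdlock/pthread_rwlock_unlock and have to repeat the unlock on every early-return path (terminating status, the !use_duplicate_nodes shortcut, and so on). A scoped guard is one way to make that release automatic; this is only an illustrative sketch, not a helper introduced by this patch:

#include <pthread.h>

// Takes the read lock on construction and always releases it on scope exit,
// so no early-return path can leave the table locked.
class ScopedReadLock {
 public:
  explicit ScopedReadLock(pthread_rwlock_t *lock) : lock_(lock) {
    pthread_rwlock_rdlock(lock_);
  }
  ~ScopedReadLock() { pthread_rwlock_unlock(lock_); }
  ScopedReadLock(const ScopedReadLock &) = delete;
  ScopedReadLock &operator=(const ScopedReadLock &) = delete;

 private:
  pthread_rwlock_t *lock_;
};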
return 0; } -Node *GraphTable::find_node(uint64_t id) { +Node *GraphTable::find_node(int64_t id) { size_t shard_id = id % shard_num; if (shard_id >= shard_end || shard_id < shard_start) { if (use_duplicate_nodes == false || extra_nodes_to_thread_index.size() == 0) @@ -443,7 +738,7 @@ Node *GraphTable::find_node(uint64_t id) { Node *node = shards[index]->find_node(id); return node; } -uint32_t GraphTable::get_thread_pool_index(uint64_t node_id) { +uint32_t GraphTable::get_thread_pool_index(int64_t node_id) { if (use_duplicate_nodes == false || extra_nodes_to_thread_index.size() == 0) return node_id % shard_num % shard_num_per_server % task_pool_size_; size_t src_shard_id = node_id % shard_num; @@ -456,8 +751,7 @@ uint32_t GraphTable::get_thread_pool_index(uint64_t node_id) { return src_shard_id % shard_num_per_server % task_pool_size_; } -uint32_t GraphTable::get_thread_pool_index_by_shard_index( - uint64_t shard_index) { +uint32_t GraphTable::get_thread_pool_index_by_shard_index(int64_t shard_index) { return shard_index % shard_num_per_server % task_pool_size_; } @@ -484,7 +778,7 @@ int32_t GraphTable::random_sample_nodes(int sample_size, std::unique_ptr &buffer, int &actual_size) { int total_size = 0; - for (int i = 0; i < shards.size(); i++) { + for (int i = 0; i < (int)shards.size(); i++) { total_size += shards[i]->get_size(); } if (sample_size > total_size) sample_size = total_size; @@ -537,16 +831,16 @@ int32_t GraphTable::random_sample_nodes(int sample_size, } } for (auto &pair : first_half) second_half.push_back(pair); - std::vector res; + std::vector res; get_nodes_ids_by_ranges(second_half, res); - actual_size = res.size() * sizeof(uint64_t); + actual_size = res.size() * sizeof(int64_t); buffer.reset(new char[actual_size]); char *pointer = buffer.get(); memcpy(pointer, res.data(), actual_size); return 0; } int32_t GraphTable::random_sample_neighbors( - uint64_t *node_ids, int sample_size, + int64_t *node_ids, int sample_size, std::vector> &buffers, std::vector &actual_sizes, bool need_weight) { size_t node_num = buffers.size(); @@ -560,10 +854,10 @@ int32_t GraphTable::random_sample_neighbors( seq_id[index].emplace_back(idx); id_list[index].emplace_back(node_ids[idx], sample_size, need_weight); } - for (int i = 0; i < seq_id.size(); i++) { + for (int i = 0; i < (int)seq_id.size(); i++) { if (seq_id[i].size() == 0) continue; tasks.push_back(_shards_task_pool[i]->enqueue([&, i, this]() -> int { - uint64_t node_id; + int64_t node_id; std::vector> r; LRUResponse response = LRUResponse::blocked; if (use_cache) { @@ -576,7 +870,7 @@ int32_t GraphTable::random_sample_neighbors( std::vector sample_keys; auto &rng = _shards_task_rng_pool[i]; for (size_t k = 0; k < id_list[i].size(); k++) { - if (index < r.size() && + if (index < (int)r.size() && r[index].first.node_key == id_list[i][k].node_key) { idx = seq_id[i][k]; actual_sizes[idx] = r[index].second.actual_size; @@ -597,7 +891,7 @@ int32_t GraphTable::random_sample_neighbors( res.size() * (need_weight ? 
(Node::id_size + Node::weight_size) : Node::id_size); int offset = 0; - uint64_t id; + int64_t id; float weight; char *buffer_addr = new char[actual_size]; if (response == LRUResponse::ok) { @@ -632,13 +926,13 @@ int32_t GraphTable::random_sample_neighbors( return 0; } -int32_t GraphTable::get_node_feat(const std::vector &node_ids, +int32_t GraphTable::get_node_feat(const std::vector &node_ids, const std::vector &feature_names, std::vector> &res) { size_t node_num = node_ids.size(); std::vector> tasks; for (size_t idx = 0; idx < node_num; ++idx) { - uint64_t node_id = node_ids[idx]; + int64_t node_id = node_ids[idx]; tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue( [&, idx, node_id]() -> int { Node *node = find_node(node_id); @@ -646,7 +940,8 @@ int32_t GraphTable::get_node_feat(const std::vector &node_ids, if (node == nullptr) { return 0; } - for (int feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) { + for (int feat_idx = 0; feat_idx < (int)feature_names.size(); + ++feat_idx) { const std::string &feature_name = feature_names[feat_idx]; if (feat_id_map.find(feature_name) != feat_id_map.end()) { // res[feat_idx][idx] = @@ -665,19 +960,20 @@ int32_t GraphTable::get_node_feat(const std::vector &node_ids, } int32_t GraphTable::set_node_feat( - const std::vector &node_ids, + const std::vector &node_ids, const std::vector &feature_names, const std::vector> &res) { size_t node_num = node_ids.size(); std::vector> tasks; for (size_t idx = 0; idx < node_num; ++idx) { - uint64_t node_id = node_ids[idx]; + int64_t node_id = node_ids[idx]; tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue( [&, idx, node_id]() -> int { size_t index = node_id % this->shard_num - this->shard_start; auto node = shards[index]->add_feature_node(node_id); node->set_feature_size(this->feat_name.size()); - for (int feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) { + for (int feat_idx = 0; feat_idx < (int)feature_names.size(); + ++feat_idx) { const std::string &feature_name = feature_names[feat_idx]; if (feat_id_map.find(feature_name) != feat_id_map.end()) { node->set_feature(feat_id_map[feature_name], res[feat_idx][idx]); @@ -771,35 +1067,68 @@ int32_t GraphTable::pull_graph_list(int start, int total_size, return 0; } -int32_t GraphTable::get_server_index_by_id(uint64_t id) { +int32_t GraphTable::get_server_index_by_id(int64_t id) { return id % shard_num / shard_num_per_server; } +int32_t GraphTable::initialize(const TableParameter &config, + const FsClientParameter &fs_config) { + LOG(INFO) << "in graphTable initialize"; + _config = config; + if (initialize_accessor() != 0) { + LOG(WARNING) << "Table accessor initialize failed"; + return -1; + } -int32_t GraphTable::initialize() { + if (_afs_client.initialize(fs_config) != 0) { + LOG(WARNING) << "Table fs_client initialize failed"; + // return -1; + } + auto graph = config.graph_parameter(); + shard_num = _config.shard_num(); + LOG(INFO) << "in graphTable initialize over"; + return initialize(graph); +} +int32_t GraphTable::initialize(const GraphParameter &graph) { +#ifdef PADDLE_WITH_HETERPS + if (graph.gpups_mode()) { + gpups_mode = true; + if (shard_num == 0) { + shard_num = graph.gpups_mode_shard_num(); + server_num = 1; + _shard_idx = 0; + } + auto *sampler = + CREATE_PSCORE_CLASS(GraphSampler, graph.gpups_graph_sample_class()); + auto slices = + string::split_string(graph.gpups_graph_sample_args(), ","); + std::cout << "slices" << std::endl; + for (auto x : slices) std::cout << x << std::endl; + 
sampler->init(graph.gpu_num(), this, slices); + graph_sampler.reset(sampler); + } +#endif + task_pool_size_ = graph.task_pool_size(); _shards_task_pool.resize(task_pool_size_); for (size_t i = 0; i < _shards_task_pool.size(); ++i) { _shards_task_pool[i].reset(new ::ThreadPool(1)); _shards_task_rng_pool.push_back(paddle::framework::GetCPURandomEngine(0)); } - server_num = _shard_num; - // VLOG(0) << "in init graph table server num = " << server_num; - /* - _shard_num is actually server number here - when a server initialize its tables, it sets tables' _shard_num to server_num, - and _shard_idx to server - rank - */ - auto common = _config.common(); - - this->table_name = common.table_name(); - this->table_type = common.name(); + auto graph_feature = graph.graph_feature(); + // this->table_name = common.table_name(); + // this->table_type = common.name(); + this->table_name = graph.table_name(); + this->table_type = graph.table_type(); VLOG(0) << " init graph table type " << this->table_type << " table name " << this->table_name; - int feat_conf_size = static_cast(common.attributes().size()); + // int feat_conf_size = static_cast(common.attributes().size()); + int feat_conf_size = static_cast(graph_feature.name().size()); for (int i = 0; i < feat_conf_size; i++) { - auto &f_name = common.attributes()[i]; - auto &f_shape = common.dims()[i]; - auto &f_dtype = common.params()[i]; + // auto &f_name = common.attributes()[i]; + // auto &f_shape = common.dims()[i]; + // auto &f_dtype = common.params()[i]; + auto &f_name = graph_feature.name()[i]; + auto &f_shape = graph_feature.shape()[i]; + auto &f_dtype = graph_feature.dtype()[i]; this->feat_name.push_back(f_name); this->feat_shape.push_back(f_shape); this->feat_dtype.push_back(f_dtype); @@ -807,8 +1136,6 @@ int32_t GraphTable::initialize() { VLOG(0) << "init graph table feat conf name:" << f_name << " shape:" << f_shape << " dtype:" << f_dtype; } - - shard_num = _config.shard_num(); VLOG(0) << "in init graph table shard num = " << shard_num << " shard_idx" << _shard_idx; shard_num_per_server = sparse_local_shard_num(shard_num, server_num); @@ -826,5 +1153,6 @@ int32_t GraphTable::initialize() { return 0; } + } // namespace distributed }; // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.h b/paddle/fluid/distributed/ps/table/common_graph_table.h index c76a62248c8fcab677d3afd8b3985700ca5f2f33..7946569525cc4bb1351046632dfe5894611c4b67 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.h +++ b/paddle/fluid/distributed/ps/table/common_graph_table.h @@ -38,10 +38,14 @@ #include #include "paddle/fluid/distributed/ps/table/accessor.h" #include "paddle/fluid/distributed/ps/table/common_table.h" +#include "paddle/fluid/distributed/ps/table/graph/class_macro.h" #include "paddle/fluid/distributed/ps/table/graph/graph_node.h" #include "paddle/fluid/string/string_helper.h" #include "paddle/phi/core/utils/rw_lock.h" +#ifdef PADDLE_WITH_HETERPS +#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h" +#endif namespace paddle { namespace distributed { class GraphShard { @@ -51,37 +55,37 @@ class GraphShard { ~GraphShard(); std::vector &get_bucket() { return bucket; } std::vector get_batch(int start, int end, int step); - std::vector get_ids_by_range(int start, int end) { - std::vector res; + std::vector get_ids_by_range(int start, int end) { + std::vector res; for (int i = start; i < end && i < (int)bucket.size(); i++) { res.push_back(bucket[i]->get_id()); } return res; } - GraphNode 
*add_graph_node(uint64_t id); + GraphNode *add_graph_node(int64_t id); GraphNode *add_graph_node(Node *node); - FeatureNode *add_feature_node(uint64_t id); - Node *find_node(uint64_t id); - void delete_node(uint64_t id); + FeatureNode *add_feature_node(int64_t id); + Node *find_node(int64_t id); + void delete_node(int64_t id); void clear(); - void add_neighbor(uint64_t id, uint64_t dst_id, float weight); - std::unordered_map &get_node_location() { + void add_neighbor(int64_t id, int64_t dst_id, float weight); + std::unordered_map &get_node_location() { return node_location; } private: - std::unordered_map node_location; + std::unordered_map node_location; std::vector bucket; }; enum LRUResponse { ok = 0, blocked = 1, err = 2 }; struct SampleKey { - uint64_t node_key; + int64_t node_key; size_t sample_size; bool is_weighted; - SampleKey(uint64_t _node_key, size_t _sample_size, bool _is_weighted) + SampleKey(int64_t _node_key, size_t _sample_size, bool _is_weighted) : node_key(_node_key), sample_size(_sample_size), is_weighted(_is_weighted) {} @@ -300,7 +304,7 @@ class ScaledLRU { node_size += lru_pool[i].node_size - lru_pool[i].remove_count; } - if (node_size <= size_t(1.1 * size_limit) + 1) return 0; + if ((size_t)node_size <= size_t(1.1 * size_limit) + 1) return 0; if (pthread_rwlock_wrlock(&rwlock) == 0) { // VLOG(0)<"in shrink\n"; global_count = 0; @@ -308,9 +312,9 @@ class ScaledLRU { global_count += lru_pool[i].node_size - lru_pool[i].remove_count; } // VLOG(0)<<"global_count "< size_limit) { + if ((size_t)global_count > size_limit) { size_t remove = global_count - size_limit; - for (int i = 0; i < lru_pool.size(); i++) { + for (size_t i = 0; i < lru_pool.size(); i++) { lru_pool[i].total_diff = 0; lru_pool[i].remove_count += 1.0 * (lru_pool[i].node_size - lru_pool[i].remove_count) / @@ -352,9 +356,69 @@ class ScaledLRU { friend class RandomSampleLRU; }; +#ifdef PADDLE_WITH_HETERPS +enum GraphSamplerStatus { waiting = 0, running = 1, terminating = 2 }; +class GraphTable; +class GraphSampler { + public: + GraphSampler() { + status = GraphSamplerStatus::waiting; + thread_pool.reset(new ::ThreadPool(1)); + callback = [](std::vector &res) { + return; + }; + } + virtual int run_graph_sampling() = 0; + virtual int start_graph_sampling() { + if (status != GraphSamplerStatus::waiting) { + return -1; + } + std::promise prom; + std::future fut = prom.get_future(); + graph_sample_task_over = thread_pool->enqueue([&prom, this]() { + prom.set_value(0); + status = GraphSamplerStatus::running; + return run_graph_sampling(); + }); + return fut.get(); + } + virtual void init(size_t gpu_num, GraphTable *graph_table, + std::vector args) = 0; + virtual void set_graph_sample_callback( + std::function &)> + callback) { + this->callback = callback; + } + + virtual int end_graph_sampling() { + if (status == GraphSamplerStatus::running) { + status = GraphSamplerStatus::terminating; + return graph_sample_task_over.get(); + } + return -1; + } + virtual GraphSamplerStatus get_graph_sampler_status() { return status; } + + protected: + std::function &)> + callback; + std::shared_ptr<::ThreadPool> thread_pool; + GraphSamplerStatus status; + std::future graph_sample_task_over; + std::vector sample_res; +}; +#endif + class GraphTable : public SparseTable { public: - GraphTable() { use_cache = false; } + GraphTable() { + use_cache = false; + shard_num = 0; +#ifdef PADDLE_WITH_HETERPS + gpups_mode = false; +#endif + rw_lock.reset(new pthread_rwlock_t()); + } virtual ~GraphTable(); virtual int32_t pull_graph_list(int 
start, int size, std::unique_ptr &buffer, @@ -362,7 +426,7 @@ class GraphTable : public SparseTable { int step); virtual int32_t random_sample_neighbors( - uint64_t *node_ids, int sample_size, + int64_t *node_ids, int sample_size, std::vector> &buffers, std::vector &actual_sizes, bool need_weight); @@ -370,9 +434,11 @@ class GraphTable : public SparseTable { int &actual_sizes); virtual int32_t get_nodes_ids_by_ranges( - std::vector> ranges, std::vector &res); - virtual int32_t initialize(); - + std::vector> ranges, std::vector &res); + virtual int32_t initialize() { return 0; } + virtual int32_t initialize(const TableParameter &config, + const FsClientParameter &fs_config); + virtual int32_t initialize(const GraphParameter &config); int32_t load(const std::string &path, const std::string ¶m); int32_t load_graph_split_config(const std::string &path); @@ -380,13 +446,13 @@ class GraphTable : public SparseTable { int32_t load_nodes(const std::string &path, std::string node_type); - int32_t add_graph_node(std::vector &id_list, + int32_t add_graph_node(std::vector &id_list, std::vector &is_weight_list); - int32_t remove_graph_node(std::vector &id_list); + int32_t remove_graph_node(std::vector &id_list); - int32_t get_server_index_by_id(uint64_t id); - Node *find_node(uint64_t id); + int32_t get_server_index_by_id(int64_t id); + Node *find_node(int64_t id); virtual int32_t pull_sparse(float *values, const PullSparseValue &pull_value) { @@ -407,16 +473,27 @@ class GraphTable : public SparseTable { return 0; } virtual int32_t initialize_shard() { return 0; } - virtual uint32_t get_thread_pool_index_by_shard_index(uint64_t shard_index); - virtual uint32_t get_thread_pool_index(uint64_t node_id); + virtual int32_t set_shard(size_t shard_idx, size_t server_num) { + _shard_idx = shard_idx; + /* + _shard_num is not used in graph_table, this following operation is for the + purpose of + being compatible with base class table. 
+ */ + _shard_num = server_num; + this->server_num = server_num; + return 0; + } + virtual uint32_t get_thread_pool_index_by_shard_index(int64_t shard_index); + virtual uint32_t get_thread_pool_index(int64_t node_id); virtual std::pair parse_feature(std::string feat_str); - virtual int32_t get_node_feat(const std::vector &node_ids, + virtual int32_t get_node_feat(const std::vector &node_ids, const std::vector &feature_names, std::vector> &res); virtual int32_t set_node_feat( - const std::vector &node_ids, + const std::vector &node_ids, const std::vector &feature_names, const std::vector> &res); @@ -433,11 +510,25 @@ class GraphTable : public SparseTable { } return 0; } - +#ifdef PADDLE_WITH_HETERPS + virtual int32_t start_graph_sampling() { + return this->graph_sampler->start_graph_sampling(); + } + virtual int32_t end_graph_sampling() { + return this->graph_sampler->end_graph_sampling(); + } + virtual int32_t set_graph_sample_callback( + std::function &)> + callback) { + graph_sampler->set_graph_sample_callback(callback); + return 0; + } +// virtual GraphSampler *get_graph_sampler() { return graph_sampler.get(); } +#endif protected: std::vector shards, extra_shards; size_t shard_start, shard_end, server_num, shard_num_per_server, shard_num; - const int task_pool_size_ = 24; + int task_pool_size_ = 24; const int random_sample_nodes_ranges = 3; std::vector feat_name; @@ -450,11 +541,61 @@ class GraphTable : public SparseTable { std::vector> _shards_task_pool; std::vector> _shards_task_rng_pool; std::shared_ptr> scaled_lru; - std::unordered_set extra_nodes; - std::unordered_map extra_nodes_to_thread_index; + std::unordered_set extra_nodes; + std::unordered_map extra_nodes_to_thread_index; bool use_cache, use_duplicate_nodes; mutable std::mutex mutex_; + std::shared_ptr rw_lock; +#ifdef PADDLE_WITH_HETERPS + // paddle::framework::GpuPsGraphTable gpu_graph_table; + bool gpups_mode; + // std::shared_ptr<::ThreadPool> graph_sample_pool; + std::shared_ptr graph_sampler; + REGISTER_GRAPH_FRIEND_CLASS(2, CompleteGraphSampler, BasicBfsGraphSampler) +#endif +}; + +#ifdef PADDLE_WITH_HETERPS +REGISTER_PSCORE_REGISTERER(GraphSampler); +class CompleteGraphSampler : public GraphSampler { + public: + CompleteGraphSampler() {} + ~CompleteGraphSampler() {} + // virtual pthread_rwlock_t *export_rw_lock(); + virtual int run_graph_sampling(); + virtual void init(size_t gpu_num, GraphTable *graph_table, + std::vector args_); + + protected: + GraphTable *graph_table; + std::vector> sample_nodes; + std::vector> sample_neighbors; + // std::vector sample_res; + // std::shared_ptr random; + int gpu_num; +}; + +class BasicBfsGraphSampler : public GraphSampler { + public: + BasicBfsGraphSampler() {} + ~BasicBfsGraphSampler() {} + // virtual pthread_rwlock_t *export_rw_lock(); + virtual int run_graph_sampling(); + virtual void init(size_t gpu_num, GraphTable *graph_table, + std::vector args_); + + protected: + GraphTable *graph_table; + // std::vector> sample_nodes; + std::vector> sample_nodes; + std::vector> sample_neighbors; + size_t gpu_num; + int node_num_for_each_shard, edge_num_for_each_node; + int rounds, interval; + std::vector>> + sample_neighbors_map; }; +#endif } // namespace distributed }; // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/graph/class_macro.h b/paddle/fluid/distributed/ps/table/graph/class_macro.h new file mode 100644 index 0000000000000000000000000000000000000000..bf59dbacb253707efdc527a23232fcb6c11554b4 --- /dev/null +++ 
b/paddle/fluid/distributed/ps/table/graph/class_macro.h @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#define DECLARE_GRAPH_FRIEND_CLASS(a) friend class a; +#define DECLARE_1_FRIEND_CLASS(a, ...) DECLARE_GRAPH_FRIEND_CLASS(a) +#define DECLARE_2_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_1_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_3_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_2_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_4_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_3_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_5_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_4_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_6_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_5_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_7_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_6_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_8_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_7_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_9_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_8_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_10_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_9_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_11_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_10_FRIEND_CLASS(__VA_ARGS__) +#define REGISTER_GRAPH_FRIEND_CLASS(n, ...) 
\ + DECLARE_##n##_FRIEND_CLASS(__VA_ARGS__) diff --git a/paddle/fluid/distributed/ps/table/graph/graph_edge.cc b/paddle/fluid/distributed/ps/table/graph/graph_edge.cc index d1961b655d8829716b392c24ad6f1139089eb80d..004a536e8e56c28151986d56833a5708999e297c 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_edge.cc +++ b/paddle/fluid/distributed/ps/table/graph/graph_edge.cc @@ -17,11 +17,11 @@ namespace paddle { namespace distributed { -void GraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { +void GraphEdgeBlob::add_edge(int64_t id, float weight = 1) { id_arr.push_back(id); } -void WeightedGraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { +void WeightedGraphEdgeBlob::add_edge(int64_t id, float weight = 1) { id_arr.push_back(id); weight_arr.push_back(weight); } diff --git a/paddle/fluid/distributed/ps/table/graph/graph_edge.h b/paddle/fluid/distributed/ps/table/graph/graph_edge.h index 3dfe5a6f357a7cd7d79834a20b6411995665f4fa..5fc785fe25682c8ff8de6606581cf7a13ae52999 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_edge.h +++ b/paddle/fluid/distributed/ps/table/graph/graph_edge.h @@ -24,19 +24,20 @@ class GraphEdgeBlob { GraphEdgeBlob() {} virtual ~GraphEdgeBlob() {} size_t size() { return id_arr.size(); } - virtual void add_edge(uint64_t id, float weight); - uint64_t get_id(int idx) { return id_arr[idx]; } + virtual void add_edge(int64_t id, float weight); + int64_t get_id(int idx) { return id_arr[idx]; } virtual float get_weight(int idx) { return 1; } + std::vector& export_id_array() { return id_arr; } protected: - std::vector id_arr; + std::vector id_arr; }; class WeightedGraphEdgeBlob : public GraphEdgeBlob { public: WeightedGraphEdgeBlob() {} virtual ~WeightedGraphEdgeBlob() {} - virtual void add_edge(uint64_t id, float weight); + virtual void add_edge(int64_t id, float weight); virtual float get_weight(int idx) { return weight_arr[idx]; } protected: diff --git a/paddle/fluid/distributed/ps/table/graph/graph_node.h b/paddle/fluid/distributed/ps/table/graph/graph_node.h index b838c2c1258d84fec8c4a25f5855209d5b428d4c..c6c594036d4fc94b296c0801b05c05801beb4fc0 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_node.h +++ b/paddle/fluid/distributed/ps/table/graph/graph_node.h @@ -48,6 +48,7 @@ class Node { virtual void set_feature(int idx, std::string str) {} virtual void set_feature_size(int size) {} virtual int get_feature_size() { return 0; } + virtual size_t get_neighbor_size() { return 0; } protected: uint64_t id; @@ -70,6 +71,7 @@ class GraphNode : public Node { } virtual uint64_t get_neighbor_id(int idx) { return edges->get_id(idx); } virtual float get_neighbor_weight(int idx) { return edges->get_weight(idx); } + virtual size_t get_neighbor_size() { return edges->size(); } protected: Sampler *sampler; diff --git a/paddle/fluid/distributed/ps/table/table.cc b/paddle/fluid/distributed/ps/table/table.cc index fa8169da07ab7fdf7ed28c840f062741913a8702..fc2ea56e95d7721fdba10e8499c22ca98bbd4c3a 100644 --- a/paddle/fluid/distributed/ps/table/table.cc +++ b/paddle/fluid/distributed/ps/table/table.cc @@ -37,6 +37,8 @@ REGISTER_PSCORE_CLASS(Table, CommonDenseTable); REGISTER_PSCORE_CLASS(Table, CommonSparseTable); #ifdef PADDLE_WITH_HETERPS REGISTER_PSCORE_CLASS(Table, SSDSparseTable); +REGISTER_PSCORE_CLASS(GraphSampler, CompleteGraphSampler); +REGISTER_PSCORE_CLASS(GraphSampler, BasicBfsGraphSampler); #endif REGISTER_PSCORE_CLASS(Table, SparseGeoTable); REGISTER_PSCORE_CLASS(Table, BarrierTable); diff --git a/paddle/fluid/distributed/test/CMakeLists.txt 
b/paddle/fluid/distributed/test/CMakeLists.txt index 2223334ccc442f5e53805ac8c078df07155565a8..cb46c38d4de4b7546af3e3f9e973ee2accba1921 100644 --- a/paddle/fluid/distributed/test/CMakeLists.txt +++ b/paddle/fluid/distributed/test/CMakeLists.txt @@ -24,6 +24,9 @@ cc_test(graph_node_test SRCS graph_node_test.cc DEPS graph_py_service scope serv set_source_files_properties(graph_node_split_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(graph_node_split_test SRCS graph_node_split_test.cc DEPS graph_py_service scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS}) +set_source_files_properties(graph_table_sample_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test(graph_table_sample_test SRCS graph_table_sample_test.cc DEPS scope server communicator ps_service boost table ps_framework_proto ${COMMON_DEPS}) + set_source_files_properties(feature_value_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(feature_value_test SRCS feature_value_test.cc DEPS ${COMMON_DEPS} boost table) diff --git a/paddle/fluid/distributed/test/graph_node_split_test.cc b/paddle/fluid/distributed/test/graph_node_split_test.cc index 9949dce4e933b03da4260c34b3beaf2b7bcdc4f1..a2f495de3c953a418f6e9c57a0535264eb401e65 100644 --- a/paddle/fluid/distributed/test/graph_node_split_test.cc +++ b/paddle/fluid/distributed/test/graph_node_split_test.cc @@ -236,7 +236,7 @@ void RunGraphSplit() { sleep(2); std::map> dense_regions; dense_regions.insert( - std::pair>(0, {})); + std::pair>(0, {})); auto regions = dense_regions[0]; RunClient(dense_regions, 0, pserver_ptr_->get_service()); @@ -250,16 +250,16 @@ void RunGraphSplit() { worker_ptr_->load(0, std::string(edge_file_name), std::string("e>")); srand(time(0)); pull_status.wait(); - std::vector> _vs; + std::vector> _vs; std::vector> vs; pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector(1, 10240001024), 4, _vs, vs, true); + 0, std::vector(1, 10240001024), 4, _vs, vs, true); pull_status.wait(); ASSERT_EQ(0, _vs[0].size()); _vs.clear(); vs.clear(); pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector(1, 97), 4, _vs, vs, true); + 0, std::vector(1, 97), 4, _vs, vs, true); pull_status.wait(); ASSERT_EQ(3, _vs[0].size()); std::remove(edge_file_name); diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc index 22c2d1e60992e2955824f004fbb89ea6c22da823..565d51379d5a8519de241deea192ffbdbfa49fd0 100644 --- a/paddle/fluid/distributed/test/graph_node_test.cc +++ b/paddle/fluid/distributed/test/graph_node_test.cc @@ -48,10 +48,10 @@ namespace distributed = paddle::distributed; void testSampleNodes( std::shared_ptr& worker_ptr_) { - std::vector ids; + std::vector ids; auto pull_status = worker_ptr_->random_sample_nodes(0, 0, 6, ids); - std::unordered_set s; - std::unordered_set s1 = {37, 59}; + std::unordered_set s; + std::unordered_set s1 = {37, 59}; pull_status.wait(); for (auto id : ids) s.insert(id); ASSERT_EQ(true, s.size() == s1.size()); @@ -106,14 +106,14 @@ void testFeatureNodeSerializeFloat64() { void testSingleSampleNeighboor( std::shared_ptr& worker_ptr_) { - std::vector> vs; + std::vector> vs; std::vector> vs1; auto pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector(1, 37), 4, vs, vs1, true); + 0, std::vector(1, 37), 4, vs, vs1, true); pull_status.wait(); - std::unordered_set s; - std::unordered_set s1 = {112, 45, 145}; + std::unordered_set s; + std::unordered_set s1 = {112, 45, 145}; for 
(auto g : vs[0]) { s.insert(g); } @@ -126,7 +126,7 @@ void testSingleSampleNeighboor( vs.clear(); vs1.clear(); pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector(1, 96), 4, vs, vs1, true); + 0, std::vector(1, 96), 4, vs, vs1, true); pull_status.wait(); s1 = {111, 48, 247}; for (auto g : vs[0]) { @@ -147,30 +147,30 @@ void testAddNode( std::shared_ptr& worker_ptr_) { worker_ptr_->clear_nodes(0); int total_num = 270000; - uint64_t id; - std::unordered_set id_set; + int64_t id; + std::unordered_set id_set; for (int i = 0; i < total_num; i++) { while (id_set.find(id = rand()) != id_set.end()) ; id_set.insert(id); } - std::vector id_list(id_set.begin(), id_set.end()); + std::vector id_list(id_set.begin(), id_set.end()); std::vector weight_list; auto status = worker_ptr_->add_graph_node(0, id_list, weight_list); status.wait(); - std::vector ids[2]; + std::vector ids[2]; for (int i = 0; i < 2; i++) { auto sample_status = worker_ptr_->random_sample_nodes(0, i, total_num, ids[i]); sample_status.wait(); } - std::unordered_set id_set_check(ids[0].begin(), ids[0].end()); + std::unordered_set id_set_check(ids[0].begin(), ids[0].end()); for (auto x : ids[1]) id_set_check.insert(x); ASSERT_EQ(id_set.size(), id_set_check.size()); for (auto x : id_set) { ASSERT_EQ(id_set_check.find(x) != id_set_check.end(), true); } - std::vector remove_ids; + std::vector remove_ids; for (auto p : id_set_check) { if (remove_ids.size() == 0) remove_ids.push_back(p); @@ -187,7 +187,7 @@ void testAddNode( worker_ptr_->random_sample_nodes(0, i, total_num, ids[i]); sample_status.wait(); } - std::unordered_set id_set_check1(ids[0].begin(), ids[0].end()); + std::unordered_set id_set_check1(ids[0].begin(), ids[0].end()); for (auto x : ids[1]) id_set_check1.insert(x); ASSERT_EQ(id_set_check1.size(), id_set_check.size()); for (auto x : id_set_check1) { @@ -196,14 +196,14 @@ void testAddNode( } void testBatchSampleNeighboor( std::shared_ptr& worker_ptr_) { - std::vector> vs; + std::vector> vs; std::vector> vs1; - std::vector v = {37, 96}; + std::vector v = {37, 96}; auto pull_status = worker_ptr_->batch_sample_neighbors(0, v, 4, vs, vs1, false); pull_status.wait(); - std::unordered_set s; - std::unordered_set s1 = {112, 45, 145}; + std::unordered_set s; + std::unordered_set s1 = {112, 45, 145}; for (auto g : vs[0]) { s.insert(g); } @@ -417,7 +417,7 @@ void RunBrpcPushSparse() { std::map> dense_regions; dense_regions.insert( - std::pair>(0, {})); + std::pair>(0, {})); auto regions = dense_regions[0]; RunClient(dense_regions, 0, pserver_ptr_->get_service()); @@ -427,14 +427,14 @@ void RunBrpcPushSparse() { worker_ptr_->load(0, std::string(edge_file_name), std::string("e>")); srand(time(0)); pull_status.wait(); - std::vector> _vs; + std::vector> _vs; std::vector> vs; testSampleNodes(worker_ptr_); sleep(5); testSingleSampleNeighboor(worker_ptr_); testBatchSampleNeighboor(worker_ptr_); pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector(1, 10240001024), 4, _vs, vs, true); + 0, std::vector(1, 10240001024), 4, _vs, vs, true); pull_status.wait(); ASSERT_EQ(0, _vs[0].size()); paddle::distributed::GraphTable* g = @@ -445,14 +445,14 @@ void RunBrpcPushSparse() { while (round--) { vs.clear(); pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector(1, 37), 1, _vs, vs, false); + 0, std::vector(1, 37), 1, _vs, vs, false); pull_status.wait(); for (int i = 0; i < ttl; i++) { - std::vector> vs1; + std::vector> vs1; std::vector> vs2; pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector(1, 37), 
1, vs1, vs2, false); + 0, std::vector(1, 37), 1, vs1, vs2, false); pull_status.wait(); ASSERT_EQ(_vs[0].size(), vs1[0].size()); @@ -540,7 +540,7 @@ void RunBrpcPushSparse() { // Test Pull by step - std::unordered_set count_item_nodes; + std::unordered_set count_item_nodes; // pull by step 2 for (int test_step = 1; test_step < 4; test_step++) { count_item_nodes.clear(); @@ -558,18 +558,18 @@ void RunBrpcPushSparse() { ASSERT_EQ(count_item_nodes.size(), 12); } - std::pair>, std::vector> res; + std::pair>, std::vector> res; res = client1.batch_sample_neighbors( - std::string("user2item"), std::vector(1, 96), 4, true, false); + std::string("user2item"), std::vector(1, 96), 4, true, false); ASSERT_EQ(res.first[0].size(), 3); - std::vector node_ids; + std::vector node_ids; node_ids.push_back(96); node_ids.push_back(37); res = client1.batch_sample_neighbors(std::string("user2item"), node_ids, 4, true, false); ASSERT_EQ(res.first[1].size(), 1); - std::vector nodes_ids = client2.random_sample_nodes("user", 0, 6); + std::vector nodes_ids = client2.random_sample_nodes("user", 0, 6); ASSERT_EQ(nodes_ids.size(), 2); ASSERT_EQ(true, (nodes_ids[0] == 59 && nodes_ids[1] == 37) || (nodes_ids[0] == 37 && nodes_ids[1] == 59)); diff --git a/paddle/fluid/distributed/test/graph_table_sample_test.cc b/paddle/fluid/distributed/test/graph_table_sample_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..65455028247ddf7d310040ecae0018b619f75bf1 --- /dev/null +++ b/paddle/fluid/distributed/test/graph_table_sample_test.cc @@ -0,0 +1,148 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include // NOLINT +#include +#include +#include +#include // NOLINT +#include +#include +#include "google/protobuf/text_format.h" + +#include +#include "gtest/gtest.h" +#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/ps/service/env.h" +#include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/ps/table/common_graph_table.h" +#include "paddle/fluid/distributed/ps/table/graph/graph_node.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/phi/kernels/funcs/math_function.h" +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace operators = paddle::operators; +namespace memory = paddle::memory; +namespace distributed = paddle::distributed; + +std::vector edges = { + std::string("37\t45\t0.34"), std::string("37\t145\t0.31"), + std::string("37\t112\t0.21"), std::string("96\t48\t1.4"), + std::string("96\t247\t0.31"), std::string("96\t111\t1.21"), + std::string("59\t45\t0.34"), std::string("59\t145\t0.31"), + std::string("59\t122\t0.21"), std::string("97\t48\t0.34"), + std::string("97\t247\t0.31"), std::string("97\t111\t0.21")}; +// odd id:96 48 122 112 +char edge_file_name[] = "edges.txt"; + +std::vector nodes = { + std::string("user\t37\ta 0.34\tb 13 14\tc hello\td abc"), + std::string("user\t96\ta 0.31\tb 15 10\tc 96hello\td abcd"), + std::string("user\t59\ta 0.11\tb 11 14"), + std::string("user\t97\ta 0.11\tb 12 11"), + std::string("item\t45\ta 0.21"), + std::string("item\t145\ta 0.21"), + std::string("item\t112\ta 0.21"), + std::string("item\t48\ta 0.21"), + std::string("item\t247\ta 0.21"), + std::string("item\t111\ta 0.21"), + std::string("item\t46\ta 0.21"), + std::string("item\t146\ta 0.21"), + std::string("item\t122\ta 0.21"), + std::string("item\t49\ta 0.21"), + std::string("item\t248\ta 0.21"), + std::string("item\t113\ta 0.21")}; +char node_file_name[] = "nodes.txt"; + +void prepare_file(char file_name[], std::vector data) { + std::ofstream ofile; + ofile.open(file_name); + for (auto x : data) { + ofile << x << std::endl; + } + + ofile.close(); +} + +void testGraphSample() { +#ifdef PADDLE_WITH_HETERPS + ::paddle::distributed::GraphParameter table_proto; + table_proto.set_gpups_mode(true); + table_proto.set_gpups_mode_shard_num(127); + table_proto.set_gpu_num(2); + + distributed::GraphTable graph_table, graph_table1; + graph_table.initialize(table_proto); + prepare_file(edge_file_name, edges); + graph_table.load(std::string(edge_file_name), std::string("e>")); + std::vector res; + std::promise prom; + std::future fut = prom.get_future(); + graph_table.set_graph_sample_callback( + [&res, &prom](std::vector &res0) { + res = res0; + prom.set_value(0); + }); + graph_table.start_graph_sampling(); + fut.get(); + graph_table.end_graph_sampling(); + ASSERT_EQ(2, res.size()); + // 37 59 97 + for (int i = 0; i < (int)res[1].node_size; i++) { + std::cout << res[1].node_list[i].node_id << std::endl; + } + ASSERT_EQ(3, res[1].node_size); + + ::paddle::distributed::GraphParameter table_proto1; + table_proto1.set_gpups_mode(true); + table_proto1.set_gpups_mode_shard_num(127); + table_proto1.set_gpu_num(2); + table_proto1.set_gpups_graph_sample_class("BasicBfsGraphSampler"); + 
table_proto1.set_gpups_graph_sample_args("5,5,1,1"); + graph_table1.initialize(table_proto1); + graph_table1.load(std::string(edge_file_name), std::string("e>")); + std::vector res1; + std::promise prom1; + std::future fut1 = prom1.get_future(); + graph_table1.set_graph_sample_callback( + [&res1, &prom1](std::vector &res0) { + res1 = res0; + prom1.set_value(0); + }); + graph_table1.start_graph_sampling(); + fut1.get(); + graph_table1.end_graph_sampling(); + // distributed::BasicBfsGraphSampler *sampler1 = + // (distributed::BasicBfsGraphSampler *)graph_table1.get_graph_sampler(); + // sampler1->start_graph_sampling(); + // std::this_thread::sleep_for (std::chrono::seconds(1)); + // std::vector res1;// = + // sampler1->fetch_sample_res(); + ASSERT_EQ(2, res1.size()); + // odd id:96 48 122 112 + for (int i = 0; i < (int)res1[0].node_size; i++) { + std::cout << res1[0].node_list[i].node_id << std::endl; + } + ASSERT_EQ(4, res1[0].node_size); +#endif +} + +TEST(testGraphSample, Run) { testGraphSample(); } diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt index 698a698fc6d18492faac771e6e0e079a35953504..691a381405e9a792d1ee0f256647405a3739e9d8 100644 --- a/paddle/fluid/eager/CMakeLists.txt +++ b/paddle/fluid/eager/CMakeLists.txt @@ -1,6 +1,7 @@ -set(eager_deps phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node) +set(eager_deps phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node custom_operator_node) + set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy) -set(generated_deps dygraph_function dygraph_node) +set(generated_deps final_dygraph_function final_dygraph_node dygraph_function dygraph_node) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) message("Performing Eager Dygraph Auto Code Generation") @@ -9,6 +10,8 @@ endif() add_subdirectory(api) add_subdirectory(accumulation) +add_subdirectory(custom_operator) + cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi_api phi_tensor) cc_library(grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulator) diff --git a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc index 3a2ec403c0a59aaa23decc72fb9581b5a7f78343..9c4089af092e418d6845864671124917c6498cf1 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.cc +++ b/paddle/fluid/eager/accumulation/accumulation_node.cc @@ -24,7 +24,7 @@ #include "paddle/fluid/platform/errors.h" #include "glog/logging.h" - +DECLARE_bool(retain_grad_for_all_tensor); namespace egr { static void CopyOrAddTensor(paddle::experimental::Tensor* tensor, @@ -39,8 +39,8 @@ static void CopyOrAddTensor(paddle::experimental::Tensor* tensor, } std::vector> GradNodeAccumulation:: -operator()( - const std::vector>& grads) { +operator()(const std::vector>& grads, + bool create_graph) { VLOG(3) << "Running Eager Backward Node: GradNodeAccumulation"; PADDLE_ENFORCE(grads.size() == 1, paddle::platform::errors::Fatal( @@ -62,7 +62,7 @@ operator()( grad_out = grads[0][0]; } - if (!weak_grad_.expired()) { + if (!weak_grad_.expired() && FLAGS_retain_grad_for_all_tensor) { auto grad = weak_grad_.lock(); CopyOrAddTensor(grad.get(), grad_out); } diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h index 
07fa40165167ce2352018c0e1b1cb08222d5a181..a91a0b6e34c0d9440e3645d1a6982748c4315962 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.h +++ b/paddle/fluid/eager/accumulation/accumulation_node.h @@ -35,8 +35,15 @@ class GradNodeAccumulation : public GradNodeBase { // Functor: perform backward computations virtual std::vector> operator()( - const std::vector>& grads) - override; + const std::vector>& grads, + bool create_graph = false) override; + + void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + + bool IsTensorWrappersCleared() override { + VLOG(6) << "Do nothing here now"; + return false; + } std::string name() { return "GradNodeAccumulation"; } diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc index 5a2595b9103e4d49845fa8938ee3577b6b3f3f06..0bc998a03a80b7b8a1e486ad68f1575c130d2c1b 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc @@ -145,8 +145,8 @@ void GradNodeScale::SetTensorWrappers_X( void GradNodeScale::SetAttributes_scale(float scale) { scale_ = scale; } std::vector> GradNodeScale:: -operator()( - const std::vector>& grads) { +operator()(const std::vector>& grads, + bool create_graph) { // 1. Check Output Size PADDLE_ENFORCE( ((grads.size() == 1) && (grads[0].size() == 1)), diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h index 247fde6ed1f869542969b068cdae9f59cedd732a..e263f73a6b8a4a1f9ce23d9b5ca383fd6828016b 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h @@ -39,8 +39,15 @@ class GradNodeScale : public GradNodeBase { // Functor: perform backward computations virtual std::vector> operator()( - const std::vector>& grads) - override; + const std::vector>& grads, + bool create_graph = false) override; + + void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + + bool IsTensorWrappersCleared() override { + VLOG(6) << "Do nothing here now"; + return false; + } void SetTensorWrappers_X( const std::vector& tensors); diff --git a/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc b/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc index ba6a936d68651c0bcf3815eab58b5a6e66d7024c..1be3b31de00a6bb94b8ad16bff4bf9c1fa61123f 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc @@ -86,9 +86,9 @@ paddle::experimental::Tensor scale(const paddle::experimental::Tensor& x, scale_node->SetTensorWrappers_X({x}); // Set Grad out rank as same as fwd input and set stop gradient to bwd - scale_node->SetGradOutMeta(p_autograd_in, /*slot id*/ 0); + scale_node->SetGradOutMeta(x, /*slot id*/ 0); // Set Grad out rank as same as fwd input and set stop gradient to bwd - scale_node->SetGradInMeta(p_autograd_out, /*slot id*/ 0); + scale_node->SetGradInMeta(out, /*slot id*/ 0); // Set History for output set current Grad Node for EagerUtils::SetHistory(p_autograd_out, scale_node); diff --git a/paddle/fluid/eager/api/utils/global_utils.h b/paddle/fluid/eager/api/utils/global_utils.h index 00578d9a359a3b8d57148efc959de553e811f541..a9a62fcd50e7a0648e695d1f60d52d3f936c53ed 100644 --- 
a/paddle/fluid/eager/api/utils/global_utils.h +++ b/paddle/fluid/eager/api/utils/global_utils.h @@ -18,7 +18,7 @@ #include #include #include "paddle/fluid/imperative/tracer.h" - +#include "paddle/phi/api/ext/op_meta_info.h" namespace egr { class UniqueNameGenerator { @@ -70,6 +70,21 @@ class Controller { void SetInEagerMode(bool in_eager_mode) { in_eager_mode_ = in_eager_mode; } + const std::unordered_map>& + GetOpMetaInfoMap() { + return op_meta_info_map_; + } + + void MergeOpMetaInfoMap(const std::unordered_map< + std::string, std::vector>& map) { + op_meta_info_map_.insert(map.begin(), map.end()); + } + + std::unordered_map>>& + GetCustomEdgesSlotMap() { + return custom_edges_slot_map_; + } + private: Controller() = default; static Controller* controller_; @@ -77,6 +92,11 @@ class Controller { new paddle::imperative::Tracer()}; // TODO(jiabin): remove when we don't need imperative. bool in_eager_mode_{false}; + std::unordered_map> + op_meta_info_map_; + /* op_type : {{grad_outputs}, {grad_inputs}, {input}, {output}, {attrs}}*/ + std::unordered_map>> + custom_edges_slot_map_; DISABLE_COPY_AND_ASSIGN(Controller); }; diff --git a/paddle/fluid/eager/api/utils/tensor_utils.cc b/paddle/fluid/eager/api/utils/tensor_utils.cc index 77c39d1b0a37c3946e4c170484118a5fb6f79170..b485beca57a214bc00cb813e9de6a53eca1e67ea 100644 --- a/paddle/fluid/eager/api/utils/tensor_utils.cc +++ b/paddle/fluid/eager/api/utils/tensor_utils.cc @@ -30,7 +30,8 @@ namespace egr_utils_api { bool IsLeafTensor(const paddle::experimental::Tensor& target) { std::shared_ptr grad_node = EagerUtils::grad_node(target); - if (std::dynamic_pointer_cast(grad_node)) { + if (!grad_node || + std::dynamic_pointer_cast(grad_node)) { return true; } diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index dc79a8a45a246798551a0bcce8c487f67183220b..b8d59e8dd8b4c60e28323955effd232eb2b51945 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -56,23 +56,29 @@ static std::string LegalizeVariableName(const std::string& var_name) { return ret; } -static bool IgnoreGradAttribute(const std::string& op_type, - const std::string& attr_name) { - // Attributes in operators_with_attrs are created manually during code - // generation - // We should ignore these arbitrary attrs when setting up grad attribute map - if (operators_with_attrs.count(op_type)) { - if (operators_with_attrs[op_type].count(attr_name)) { - return true; - } - } +static std::string HandleDynamicGradAttributes(const std::string& fwd_op_type, + const std::string& attrs_name) { + std::string additional_grad_attrs_str = ""; + + if (fwd_op_type == "sum") { + const char* GRAD_ATTRS_TEMPLATE = " %s[\"%s\"] = %s;\n"; + additional_grad_attrs_str = paddle::string::Sprintf( + GRAD_ATTRS_TEMPLATE, attrs_name, "scale", "float(1.0)"); + additional_grad_attrs_str += paddle::string::Sprintf( + GRAD_ATTRS_TEMPLATE, attrs_name, "bias", "float(0.0f)"); + additional_grad_attrs_str += paddle::string::Sprintf( + GRAD_ATTRS_TEMPLATE, attrs_name, "bias_after_scale", "bool(true)"); + + } else if (fwd_op_type == "scale") { + const char* GRAD_ATTRS_TEMPLATE = " %s[\"%s\"] = %s;\n"; - // Only allow SumOp - if (op_type != "sum") { - return true; + additional_grad_attrs_str += paddle::string::Sprintf( + GRAD_ATTRS_TEMPLATE, attrs_name, "bias", "float(0.0f)"); + additional_grad_attrs_str += paddle::string::Sprintf( + GRAD_ATTRS_TEMPLATE, attrs_name, 
"bias_after_scale", "bool(true)"); } - return false; + return additional_grad_attrs_str; } static void PrepareAttrMapForOps() { @@ -973,7 +979,9 @@ static bool CollectGradInformationFromOpInfo( /* --------------------------------------------------- */ static std::string GenerateGradNodeCreationContent( const ForwardGenerationInfo& fwd_info, - const GradNodeGenerationInfo& bwd_info) { + const GradNodeGenerationInfo& bwd_info, + const std::string& trace_op_body_str, + std::map inplace_map = {}) { VLOG(6) << "Generating GradNode Creation codes"; const std::string& op_type = fwd_info.GetOpType(); @@ -992,7 +1000,8 @@ static std::string GenerateGradNodeCreationContent( // If single output slotname and not duplicable, // then generate: "egr::AutogradMeta* p_autograd_out = // egr::EagerUtils::autograd_meta("op_proto->outputs()[0].name()")" - std::string get_autograd_meta_str = " // Prepare Autograd Meta \n"; + std::string get_input_autograd_meta_str = " // Prepare Autograd Meta \n"; + std::string get_output_autograd_meta_str = ""; // If single output slotname and not duplicable, // then generate: "egr::AutogradMeta* p_autograd_out = // egr::EagerUtils::autograd_meta("op_proto.outputs()[0].name()")" @@ -1000,22 +1009,39 @@ static std::string GenerateGradNodeCreationContent( const std::string& output_name = output.name(); const std::string& output_autograd_name = "p_autograd_" + output_name; + // output autograd_meta should be got after running TraceOP. if (output.duplicable()) { const char* GET_MULTI_AUTOGRAD_META_TEMPLATE = - " std::vector %s = " + " std::vector %s = " "egr::EagerUtils::autograd_meta(&%s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( + get_output_autograd_meta_str += paddle::string::Sprintf( GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); } else { - const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = - " egr::AutogradMeta* %s = " - "egr::EagerUtils::autograd_meta(&%s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( - GET_SINGLE_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); + // In inplace op, the case where output is duplicable is not considered. + // Replace output directly with input in inplace op. + if (!inplace_map.empty() && inplace_map.count(output_name)) { + auto inplace_input_name = inplace_map[output_name]; + const std::string& inplace_input_autograd_name = + "p_autograd_" + inplace_input_name; + const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = + " %s = egr::EagerUtils::autograd_meta(&%s);\n"; + get_output_autograd_meta_str += paddle::string::Sprintf( + GET_SINGLE_AUTOGRAD_META_TEMPLATE, inplace_input_autograd_name, + inplace_input_name); + } else { + const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = + " egr::AutogradMeta* %s = " + "egr::EagerUtils::autograd_meta(&%s);\n"; + get_output_autograd_meta_str += + paddle::string::Sprintf(GET_SINGLE_AUTOGRAD_META_TEMPLATE, + output_autograd_name, output_name); + } } } VLOG(6) << "Generated outputs autograd_meta"; + // input autograd_meta should be got before running TraceOP (for checking + // inplace). 
for (const proto::OpProto::Var& input : in_vars) { const std::string& input_name = input.name(); const std::string& input_autograd_name = "p_autograd_" + input_name; @@ -1024,28 +1050,46 @@ static std::string GenerateGradNodeCreationContent( const char* GET_MULTI_AUTOGRAD_META_TEMPLATE = " std::vector %s = " "egr::EagerUtils::nullable_autograd_meta(%s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( + get_input_autograd_meta_str += paddle::string::Sprintf( GET_MULTI_AUTOGRAD_META_TEMPLATE, input_autograd_name, input_name); } else if (input.dispensable()) { const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = " egr::AutogradMeta* %s = " "egr::EagerUtils::nullable_autograd_meta(%s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( + get_input_autograd_meta_str += paddle::string::Sprintf( GET_SINGLE_AUTOGRAD_META_TEMPLATE, input_autograd_name, input_name); } else { const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = " egr::AutogradMeta* %s = " "egr::EagerUtils::nullable_autograd_meta(%s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( + get_input_autograd_meta_str += paddle::string::Sprintf( GET_SINGLE_AUTOGRAD_META_TEMPLATE, input_autograd_name, input_name); } } VLOG(6) << "Generated inputs autograd_meta"; + // check inplace input to avoid inplace operations on leaf nodes with + // stop_gradient=False. + std::string check_inplace_str = ""; + if (!inplace_map.empty()) { + const char* CHECKING_INPLACE_TEMPLATE = + " // Check Inplace\n" + " egr::EagerUtils::CheckInplace(%s, p_autograd_%s, " + "require_any_grad);\n"; + for (auto& inplace_pair : inplace_map) { + std::string inplace_name = inplace_pair.second; + check_inplace_str += paddle::string::Sprintf(CHECKING_INPLACE_TEMPLATE, + inplace_name, inplace_name); + } + VLOG(6) << "Check Inplace Input"; + } + std::string prepare_autograd_meta_str = ""; - prepare_autograd_meta_str += get_autograd_meta_str; + // only generate input autograd_meta in temporary. + // output autograd_meta will be generated after running TraceOP. 
+ prepare_autograd_meta_str += get_input_autograd_meta_str; prepare_autograd_meta_str += "\n"; // [GradOpNode] GetTraceBackward @@ -1060,7 +1104,7 @@ static std::string GenerateGradNodeCreationContent( size_t bwd_in_slot_num = out_vars.size(); size_t bwd_out_slot_num = in_vars.size(); const char* GRAD_OP_NODE_TEMPLATE = - " auto grad_node = std::make_shared(%d, %d);\n"; + " auto grad_node = std::make_shared(%d, %d);\n"; grad_node_creation_str += " // Create GradOpNode\n"; grad_node_creation_str += paddle::string::Sprintf( GRAD_OP_NODE_TEMPLATE, op_type, bwd_in_slot_num, bwd_out_slot_num); @@ -1069,14 +1113,14 @@ static std::string GenerateGradNodeCreationContent( VLOG(6) << "Generated GradOpNode construction"; // [GradOpNode] Set Attrs - grad_node_creation_str += " // Set Attributes\n"; - grad_node_creation_str += " grad_node->SetAttrMap(std::move(attrs));\n"; + grad_node_creation_str += " // Set Attributes\n"; + grad_node_creation_str += " grad_node->SetAttrMap(std::move(attrs));\n"; grad_node_creation_str += - " grad_node->SetDefaultAttrMap(std::move(default_attrs));\n"; + " grad_node->SetDefaultAttrMap(std::move(default_attrs));\n"; grad_node_creation_str += "\n"; // [GradOpNode] Set TensorWrappers - grad_node_creation_str += " // Set Tensor Wrappers\n"; + grad_node_creation_str += " // Set Tensor Wrappers\n"; for (const auto& iter : op_base_infos) { const std::map& grad_ins_fwd_slotname_map = iter.GetGradInsFwdSlotnameMap(); @@ -1088,10 +1132,18 @@ static std::string GenerateGradNodeCreationContent( full_reserved = "true"; } const char* SET_TENSOR_WRAPPER_TEMPLATE = - " grad_node->SetTensorWrapper%s(%s, %s);\n"; - grad_node_creation_str += paddle::string::Sprintf( - SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name, tensor_wrapper_name, - full_reserved); + " grad_node->SetTensorWrapper%s(%s, %s);\n"; + // Replace output directly with input in inplace op. 
+ if (!inplace_map.empty() && inplace_map.count(tensor_wrapper_name)) { + auto inplace_input_name = inplace_map[tensor_wrapper_name]; + grad_node_creation_str += paddle::string::Sprintf( + SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name, + inplace_input_name, full_reserved); + } else { + grad_node_creation_str += paddle::string::Sprintf( + SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name, + tensor_wrapper_name, full_reserved); + } } } grad_node_creation_str += "\n"; @@ -1109,12 +1161,12 @@ static std::string GenerateGradNodeCreationContent( size_t input_position = fwd_inputs_name_pos_map.at(input_name); const char* SET_GRAD_OUT_META_TEMPLATE = - " grad_node->SetGradOutMeta(%s, %d);\n"; + " grad_node->SetGradOutMeta(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( - SET_GRAD_OUT_META_TEMPLATE, input_autograd_name, input_position); + SET_GRAD_OUT_META_TEMPLATE, input_name, input_position); const char* ADD_EDGES_TEMPLATE = - " if(%s) grad_node->AddEdges(%s, %d);\n"; + " if(%s) grad_node->AddEdges(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf(ADD_EDGES_TEMPLATE, input_autograd_name, input_autograd_name, input_position); @@ -1123,11 +1175,11 @@ static std::string GenerateGradNodeCreationContent( size_t input_position = fwd_inputs_name_pos_map.at(input_name); const char* SET_GRAD_OUT_META_TEMPLATE = - " grad_node->SetGradOutMeta(&%s, %d);\n"; + " grad_node->SetGradOutMeta(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( - SET_GRAD_OUT_META_TEMPLATE, input_autograd_name, input_position); + SET_GRAD_OUT_META_TEMPLATE, input_name, input_position); - const char* ADD_EDGES_TEMPLATE = " grad_node->AddEdges(&%s, %d);\n"; + const char* ADD_EDGES_TEMPLATE = " grad_node->AddEdges(&%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( ADD_EDGES_TEMPLATE, input_autograd_name, input_position); } @@ -1139,73 +1191,125 @@ static std::string GenerateGradNodeCreationContent( std::string pass_stop_gradient_args = "false"; for (const proto::OpProto::Var& output : out_vars) { const std::string& output_name = output.name(); - const std::string& output_autograd_name = "p_autograd_" + output_name; - size_t output_position = fwd_outputs_name_pos_map.at(output_name); - - // Intermediate Tensor does not require SetHistory, nor RetainGrad - - if (output.duplicable()) { - pass_stop_gradient_args += ", &" + output_autograd_name; + // Replace output directly with input in inplace op. 
+ if (!inplace_map.empty() && inplace_map.count(output_name)) { + auto inplace_input_name = inplace_map[output_name]; + const std::string& inplace_input_autograd_name = + "p_autograd_" + inplace_input_name; + size_t output_position = fwd_outputs_name_pos_map.at(output_name); + + // Intermediate Tensor does not require SetHistory, nor RetainGrad + pass_stop_gradient_args += ", " + inplace_input_autograd_name; const char* SET_OUT_RANK_TEMPLATE = - " egr::EagerUtils::SetOutRankWithSlot(&%s, %d);\n"; + " egr::EagerUtils::SetOutRankWithSlot(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( - SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position); + SET_OUT_RANK_TEMPLATE, inplace_input_autograd_name, output_position); // Intermediate Tensor does not require SetHistory if (!output.intermediate()) { const char* SET_HISTORY_TEMPLATE = - " egr::EagerUtils::SetHistory(&%s, grad_node);\n"; - grad_node_creation_str += - paddle::string::Sprintf(SET_HISTORY_TEMPLATE, output_autograd_name); + " egr::EagerUtils::SetHistory(%s, grad_node);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_HISTORY_TEMPLATE, inplace_input_autograd_name); } const char* SET_GRAD_IN_META_TEMPLATE = - " grad_node->SetGradInMeta(&%s, %d);\n"; + " grad_node->SetGradInMeta(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( - SET_GRAD_IN_META_TEMPLATE, output_autograd_name, output_position); + SET_GRAD_IN_META_TEMPLATE, inplace_input_name, output_position); + // Intermediate Tensor does not require CheckAndRetainGrad + if (!output.intermediate()) { + VLOG(6) << "Generated Call RetainGradForTensor"; + const char* RETAIN_GRAD_TEMPLATE = + " egr::EagerUtils::CheckAndRetainGrad(%s);\n"; + grad_node_creation_str += + paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, inplace_input_name); + } } else { - pass_stop_gradient_args += ", " + output_autograd_name; - const char* SET_OUT_RANK_TEMPLATE = - " egr::EagerUtils::SetOutRankWithSlot(%s, %d);\n"; - grad_node_creation_str += paddle::string::Sprintf( - SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position); + const std::string& output_autograd_name = "p_autograd_" + output_name; + size_t output_position = fwd_outputs_name_pos_map.at(output_name); - // Intermediate Tensor does not require SetHistory + // Intermediate Tensor does not require SetHistory, nor RetainGrad + + if (output.duplicable()) { + pass_stop_gradient_args += ", &" + output_autograd_name; + const char* SET_OUT_RANK_TEMPLATE = + " egr::EagerUtils::SetOutRankWithSlot(&%s, %d);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position); + + // Intermediate Tensor does not require SetHistory + if (!output.intermediate()) { + const char* SET_HISTORY_TEMPLATE = + " egr::EagerUtils::SetHistory(&%s, grad_node);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_HISTORY_TEMPLATE, output_autograd_name); + } + const char* SET_GRAD_IN_META_TEMPLATE = + " grad_node->SetGradInMeta(%s, %d);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_GRAD_IN_META_TEMPLATE, output_name, output_position); + + } else { + pass_stop_gradient_args += ", " + output_autograd_name; + const char* SET_OUT_RANK_TEMPLATE = + " egr::EagerUtils::SetOutRankWithSlot(%s, %d);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position); + + // Intermediate Tensor does not require SetHistory + if (!output.intermediate()) { + const char* SET_HISTORY_TEMPLATE = + " 
egr::EagerUtils::SetHistory(%s, grad_node);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_HISTORY_TEMPLATE, output_autograd_name); + } + const char* SET_GRAD_IN_META_TEMPLATE = + " grad_node->SetGradInMeta(%s, %d);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_GRAD_IN_META_TEMPLATE, output_name, output_position); + } + + // Intermediate Tensor does not require CheckAndRetainGrad if (!output.intermediate()) { - const char* SET_HISTORY_TEMPLATE = - " egr::EagerUtils::SetHistory(%s, grad_node);\n"; + VLOG(6) << "Generated Call RetainGradForTensor"; + const char* RETAIN_GRAD_TEMPLATE = + " egr::EagerUtils::CheckAndRetainGrad(%s);\n"; grad_node_creation_str += - paddle::string::Sprintf(SET_HISTORY_TEMPLATE, output_autograd_name); + paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, output_name); } - const char* SET_GRAD_IN_META_TEMPLATE = - " grad_node->SetGradInMeta(%s, %d);\n"; - grad_node_creation_str += paddle::string::Sprintf( - SET_GRAD_IN_META_TEMPLATE, output_autograd_name, output_position); - } - - // Intermediate Tensor does not require CheckAndRetainGrad - if (!output.intermediate()) { - VLOG(6) << "Generated Call RetainGradForTensor"; - const char* RETAIN_GRAD_TEMPLATE = - " egr::EagerUtils::CheckAndRetainGrad(%s);\n"; - grad_node_creation_str += - paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, output_name); } } VLOG(6) << "Generated SetGradIn/OutMeta"; // [Generation] GradNode Creation + // After getting require_any_grad, firstly use CheckInplace method for inplace + // op. + // Then execute TraceOp and generate output autograd_meta. + // Finally, Construct GradNode. (Replace output directly with input in inplace + // op.) + // Add event record + std::string event_name = op_type + " node_creation"; const char* GRAD_NODE_CREATION_TEMPLATE = - " %s" + "%s" " bool require_any_grad = egr::EagerUtils::ComputeRequireGrad(%s);\n" - " if(require_any_grad) {\n" - " VLOG(6) << \" Construct Grad for %s \"; \n" - " egr::EagerUtils::PassStopGradient(%s);\n" - "%s\n }"; + "%s\n" + "%s" + " {\n" + " paddle::platform::RecordEvent node_creation_record_event(\"%s\", " + "paddle::platform::TracerEventType::Operator, 1);\n" + "%s" + " if(require_any_grad) {\n" + " VLOG(6) << \" Construct Grad for %s \"; \n" + " egr::EagerUtils::PassStopGradient(%s);\n" + " %s\n" + " }\n" + " }"; std::string grad_node_creation_body_str = paddle::string::Sprintf( GRAD_NODE_CREATION_TEMPLATE, prepare_autograd_meta_str, - compute_require_grad_args, op_type, pass_stop_gradient_args, - grad_node_creation_str); + compute_require_grad_args, check_inplace_str, trace_op_body_str, + event_name, get_output_autograd_meta_str, op_type, + pass_stop_gradient_args, grad_node_creation_str); return grad_node_creation_body_str; } @@ -1215,7 +1319,8 @@ static std::string GenerateGradNodeCreationContent( /* -------------------------------- */ static std::pair GenerateForwardFunctionContents( const ForwardGenerationInfo& fwd_info, - const GradNodeGenerationInfo& bwd_info) { + const GradNodeGenerationInfo& bwd_info, + std::map inplace_map = {}) { /* --- Process Forward Info ---*/ const std::string& op_type = fwd_info.GetOpType(); const std::unordered_map& fwd_inputs_name_pos_map = @@ -1295,8 +1400,21 @@ static std::pair GenerateForwardFunctionContents( core_ops_args_type_info[op_type][input_position] = "list"; } else { - const char* FWD_INS_ARG_TEMPLATE = - "const paddle::experimental::Tensor& %s"; + // inplace tensor can't be const + const char* FWD_INS_ARG_TEMPLATE; + bool flag_find_input_name = false; + if 
(!inplace_map.empty()) { + for (auto& inplace_pair : inplace_map) { + if (inplace_pair.second == input_name) { + flag_find_input_name = true; + FWD_INS_ARG_TEMPLATE = "paddle::experimental::Tensor& %s"; + break; + } + } + } + if (!flag_find_input_name) { + FWD_INS_ARG_TEMPLATE = "const paddle::experimental::Tensor& %s"; + } input_args_str_list[input_position] = paddle::string::Sprintf(FWD_INS_ARG_TEMPLATE, input_name); @@ -1356,6 +1474,7 @@ static std::pair GenerateForwardFunctionContents( // [Generation] Get Outs Map std::string outs_contents_str = ""; + std::string inplace_mapping_str = ""; for (const proto::OpProto::Var& output : out_vars) { const std::string& output_name = output.name(); std::string outnum = "1"; @@ -1398,6 +1517,22 @@ static std::pair GenerateForwardFunctionContents( } core_ops_args_info[op_type].push_back(output_var_name); + } else if (!inplace_map.empty() && inplace_map.count(output_name)) { + // In inplace op, replace the output with the input directly. + PADDLE_ENFORCE_NE( + inplace_map[output_name], "", + paddle::platform::errors::InvalidArgument( + "Inplace op %s has no input corresponding to output %s.", op_type, + output_name)); + const char* FWD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", ins[\"%s\"] },"; + auto inplace_input_name = inplace_map[output_name]; + outs_contents_str += paddle::string::Sprintf( + FWD_OUTS_CONTENT_TEMPLATE, output_name, inplace_input_name); + + // inplace_map used in TraceOp. + const char* INPLACE_MAPPING_TEMPLATE = R"({"%s", "%s"},)"; + inplace_mapping_str += paddle::string::Sprintf( + INPLACE_MAPPING_TEMPLATE, inplace_input_name, output_name); } else { if (output.duplicable()) { outnum = output_name + "Num"; @@ -1424,6 +1559,8 @@ static std::pair GenerateForwardFunctionContents( } if (outs_contents_str.size() > 0) outs_contents_str.pop_back(); // Remove trailing "," + if (inplace_mapping_str.size() > 0) + inplace_mapping_str.pop_back(); // Remove trailing "," const char* FWD_OUTS_MAP_TEMPLATE = " std::map GenerateForwardFunctionContents( dygraph_function_args_str += ", const paddle::framework::AttributeMap& attr_map"; + /* --------- Generate TraceOp ----- */ + // TraceOp should be run after compute require_any_grad. (for checking + // inplace) + // `trace_op_body_str` will be passed as a parameter to + // `GenerateGradNodeCreationContent`. 
+ std::string trace_op_body_str = ""; // [Generation] Get TraceOp const char* FWD_TRACE_OP_TEMPLATE = " paddle::framework::AttributeMap attrs = attr_map;\n" @@ -1464,11 +1607,12 @@ static std::pair GenerateForwardFunctionContents( " egr::Controller::Instance().GetCurrentTracer()->TraceOp(\"%s\", ins, " "outs, attrs, \n" " egr::Controller::Instance().GetExpectedPlace(),\n" - " &default_attrs, true, {});\n"; - std::string trace_op_str = - paddle::string::Sprintf(FWD_TRACE_OP_TEMPLATE, op_type); - generated_function_body += trace_op_str; - generated_function_body += "\n"; + " &default_attrs, true, {%s});\n"; + std::string trace_op_str = paddle::string::Sprintf( + FWD_TRACE_OP_TEMPLATE, op_type, inplace_mapping_str); + + trace_op_body_str += trace_op_str; + trace_op_body_str += "\n"; VLOG(6) << "Generated AttrMap & TraceOp"; @@ -1533,34 +1677,64 @@ static std::pair GenerateForwardFunctionContents( output_varname, output_var_args_name); } } else { - const char* FWD_OUT_TENSOR_TEMPLATE = - " paddle::experimental::Tensor %s;\n" - " egr::EagerUtils::GetOutput(outs[\"%s\"][0], &%s);\n"; - out_tensor_str = - paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE, output_varname, - output_name, output_varname); + if (!inplace_map.empty() && inplace_map.count(output_name)) { + // Modify meta info of inplace tensor. + // Bump inplace version of inplace tensor. + auto inplace_input_name = inplace_map[output_name]; + const char* FWD_OUT_TENSOR_TEMPLATE = + " egr::EagerUtils::ModifyInplaceInput(outs[\"%s\"][0], &%s);\n" + " %s.bump_inplace_version();\n" + " VLOG(3) << \"Tensor(\" << %s.name() << \") uses Inplace " + "Strategy.\";\n"; + out_tensor_str = paddle::string::Sprintf( + FWD_OUT_TENSOR_TEMPLATE, output_name, inplace_input_name, + inplace_input_name, inplace_input_name); + } else { + const char* FWD_OUT_TENSOR_TEMPLATE = + " paddle::experimental::Tensor %s;\n" + " egr::EagerUtils::GetOutput(outs[\"%s\"][0], &%s);\n"; + out_tensor_str = + paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE, output_varname, + output_name, output_varname); + } } return_types[return_position] = "paddle::experimental::Tensor"; } - return_contents[return_position] = output_varname; - generated_function_body += out_tensor_str; + if (!inplace_map.empty() && inplace_map.count(output_name)) { + // Replace output directly with input in inplace op. + return_contents[return_position] = inplace_map[output_name]; + } else { + return_contents[return_position] = output_varname; + } + trace_op_body_str += out_tensor_str; } - generated_function_body += "\n"; + trace_op_body_str += "\n"; VLOG(6) << "Converted Output VarBase to EagerVariable(s)"; + /* ------ END Generate TraceOp ----- */ // [Generation] Handle core_ops_returns_info - core_ops_returns_info[op_type] = return_contents; + // avoid inplace op changing core_ops_returns_info + if (core_ops_returns_info.empty() || !core_ops_returns_info.count(op_type)) { + core_ops_returns_info[op_type] = return_contents; + } // [Generation] ComputeRequireGrad -> GradNodeCreation + if (!bwd_info.GenerateForwardOnly()) { - std::string grad_node_creation_body_str = - GenerateGradNodeCreationContent(fwd_info, bwd_info); + // If GradNode needs to be generated, pass `trace_op_body_str` + // into `GenerateGradNodeCreationContent`. 
+ std::string grad_node_creation_body_str = GenerateGradNodeCreationContent( + fwd_info, bwd_info, trace_op_body_str, inplace_map); + generated_function_body += grad_node_creation_body_str; generated_function_body += "\n"; // [Generation] Call RetainGradForTensor VLOG(6) << "Generated GradNode Creation codes"; + } else { + // If GradNode doesn't need to be generated, generate TraceOP directly. + generated_function_body += trace_op_body_str; } // [Generation] Handle return: Tuple/Vector/Tensor @@ -1607,17 +1781,33 @@ static std::pair GenerateForwardFunctionContents( VLOG(6) << "Generated return codes"; // [Generation] Get Full Function - std::string function_name = op_type + "_dygraph_function"; + std::string function_name; + if (inplace_map.empty()) { + function_name = op_type + "_dygraph_function"; + } else { + // change function_name for inplace op. + function_name = op_type + "__dygraph_function"; + } if (dygraph_function_args_str.size() > 0) { auto iter = dygraph_function_args_str.begin(); if ((*iter) == ',') dygraph_function_args_str.erase(iter); } - const char* FWD_FUNCTION_TEMPLATE = "%s %s(%s) {\n\n%s\n}\n\n"; + const char* DYGRAPH_FUNCTION_EVENT_RECORD_FUNCTION_TEMPLATE = + " paddle::platform::RecordEvent dygraph_entrance_record_event(\"%s\", " + "paddle::platform::TracerEventType::Operator, 1);"; + std::string event_name = op_type + " dygraph"; + std::string fwd_record_event_str = paddle::string::Sprintf( + DYGRAPH_FUNCTION_EVENT_RECORD_FUNCTION_TEMPLATE, event_name); + const char* FWD_FUNCTION_TEMPLATE = + "%s %s(%s) {\n\n" + "%s\n" + "%s\n" + "}\n\n"; std::string fwd_function_str = paddle::string::Sprintf( FWD_FUNCTION_TEMPLATE, function_proto_return_type_str, function_name, - dygraph_function_args_str, generated_function_body); + dygraph_function_args_str, fwd_record_event_str, generated_function_body); // [Generation] Generate forward functions header const char* FWD_HEADER_TEMPLATE = "%s %s(%s);\n"; @@ -1804,7 +1994,7 @@ static std::string GenerateSingleOpBase( !is_op_base_per_duplicable_input) { const char* GRAD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", egr::EagerUtils::CreateVars( " - "this->OutputMeta()[%d].Size() ) },"; + "this->OutputMeta()[%d].size() ) },"; outs_contents_str += paddle::string::Sprintf( GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name, fwd_input_position); } else { @@ -1842,18 +2032,9 @@ static std::string GenerateSingleOpBase( const char* ATTRS_TEMPLATE = " auto& %s = this->attr_map_;\n"; std::string grad_attrs_str = paddle::string::Sprintf(ATTRS_TEMPLATE, attrs_name); - for (const auto& iter : grad_attrs) { - if (IgnoreGradAttribute(fwd_op_type, iter.first)) continue; - std::pair type_val = - GetAttrType(iter.second, false /*is_arg*/); - const char* GRAD_ATTRS_TEMPLATE = - " %s %s = %s;\n" - " %s[\"%s\"] = %s;\n"; - std::string var_name = iter.first + std::to_string(*outs_size); - grad_attrs_str += paddle::string::Sprintf( - GRAD_ATTRS_TEMPLATE, type_val.first, var_name, type_val.second, - attrs_name, iter.first, var_name); - } + + // Handle dynamic grad attributes + grad_attrs_str += HandleDynamicGradAttributes(fwd_op_type, attrs_name); generated_grad_function_body += grad_attrs_str; const char* TRACE_OP_TEMPLATE = @@ -2032,7 +2213,7 @@ static std::string GenerateGradNodeCCContents( if (is_op_base_per_duplicable_input) { const char* OP_BASE_PER_DUP_INPUT_TEMPLATE = - " for(int i = 0; i < this->OutputMeta()[0].Size(); i++) {\n" + " for(size_t i = 0; i < this->OutputMeta()[0].size(); i++) {\n" " %s\n" " }\n"; generated_grad_function_body = 
paddle::string::Sprintf( @@ -2044,6 +2225,8 @@ static std::string GenerateGradNodeCCContents( "GradNode%s::ApplyGradientHooks(grads);\n" " std::vector> outputs(%d);\n" " %s\n" + " if(NeedComplexToRealConversion()) " + "HandleComplexGradToRealGrad(&outputs);\n" " return outputs;\n"; generated_grad_function_body = paddle::string::Sprintf(BWD_RETURN_TEMPLATE, fwd_op_type, in_vars.size(), @@ -2053,7 +2236,8 @@ static std::string GenerateGradNodeCCContents( const char* GRAD_FUNCTION_TEMPLATE = "std::vector> " "GradNode%s::operator()(const " - "std::vector>& grads) {\n%s\n}"; + "std::vector>& grads, " + "bool create_graph) {\n%s\n}"; std::string grad_function_str = paddle::string::Sprintf( GRAD_FUNCTION_TEMPLATE, fwd_op_type, generated_grad_function_body); @@ -2088,18 +2272,28 @@ static std::string GenerateGradNodeHeaderContents( "\n" " virtual std::vector> " "operator()(const " - "std::vector>& grads) " + "std::vector>& grads, const " + "bool create_graph = false) " "override;\n" "\n" + " void ClearTensorWrappers() override { \n" + "%s\n" + " is_tensor_wrappers_cleared = true;\n" + " }\n" " std::string name() override { return \" GradNode%s \"; } \n " "\n" " // SetX, SetY, ...\n" "%s\n" " // SetAttrMap\n" "%s\n" + " bool IsTensorWrappersCleared() override { \n" + " return is_tensor_wrappers_cleared;\n" + " }\n" " private:\n" " // TensorWrappers\n" "%s\n" + " bool is_tensor_wrappers_cleared = false;\n" + "\n" " // Attribute Map\n" "%s\n" "};"; @@ -2133,6 +2327,7 @@ static std::string GenerateGradNodeHeaderContents( std::string set_tensor_wrappers_str = ""; std::string tensor_wrapper_members_str = ""; + std::string clear_tensor_wrappers_str = ""; for (const auto& iter : op_base_infos) { const std::map& grad_ins_fwd_slotname_map = iter.GetGradInsFwdSlotnameMap(); @@ -2164,6 +2359,13 @@ static std::string GenerateGradNodeHeaderContents( SET_TENSOR_WRAPPER_BODY_TEMPLATE, tensor_wrapper_name, struct_tensor_wrapper_name); + const char* CLEAR_TENSOR_WRAPPER_TEMPLATE = + "for (auto tw: %s) {\n" + " tw.clear();\n" + " }\n"; + clear_tensor_wrappers_str += paddle::string::Sprintf( + CLEAR_TENSOR_WRAPPER_TEMPLATE, struct_tensor_wrapper_name); + } else { const char* ATTR_TENSOR_WRAPPER_ARG_TEMPLATE = "const paddle::experimental::Tensor& %s"; @@ -2176,10 +2378,14 @@ static std::string GenerateGradNodeHeaderContents( TENSOR_WRAPPER_MEMBER_TEMPLATE, struct_tensor_wrapper_name); const char* SET_TENSOR_WRAPPER_BODY_TEMPLATE = - "%s = egr::TensorWrapper(%s, %s /*full_reserved*/);"; + "%s = egr::TensorWrapper(%s, %s /*full_reserved*/);\n"; tensor_wrapper_body_str = paddle::string::Sprintf( SET_TENSOR_WRAPPER_BODY_TEMPLATE, struct_tensor_wrapper_name, tensor_wrapper_name, full_reserved_str); + + const char* CLEAR_TENSOR_WRAPPER_TEMPLATE = " %s.clear();\n"; + clear_tensor_wrappers_str += paddle::string::Sprintf( + CLEAR_TENSOR_WRAPPER_TEMPLATE, struct_tensor_wrapper_name); } std::string full_reserved_signature_str = "bool full_reserved"; const char* SET_TENSOR_WRAPPER_TEMPLATE = @@ -2194,8 +2400,8 @@ static std::string GenerateGradNodeHeaderContents( std::string grad_node_str = paddle::string::Sprintf( GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type, op_type, op_type, - op_type, op_type, set_tensor_wrappers_str, set_attr_map_str, - tensor_wrapper_members_str, attr_members_str); + op_type, clear_tensor_wrappers_str, op_type, set_tensor_wrappers_str, + set_attr_map_str, tensor_wrapper_members_str, attr_members_str); return grad_node_str; } @@ -2240,8 +2446,9 @@ static void GenerateForwardDygraphFile(const 
std::string& forward_cc_path, "\"paddle/fluid/eager/api/generated/fluid_generated/" "dygraph_forward_api.h\"\n" "#include " - "\"paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h\"\n\n" - "#include \"paddle/fluid/eager/api/utils/global_utils.h\"\n"; + "\"paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h\"\n" + "#include \"paddle/fluid/eager/api/utils/global_utils.h\"\n" + "#include \"paddle/fluid/platform/profiler/event_tracing.h\"\n\n"; std::string forward_cc_include_str = paddle::string::Sprintf(FORWARD_INCLUDE_TEMPLATE); std::ofstream forward_cc_stream(forward_cc_path, std::ios::out); @@ -2379,7 +2586,7 @@ static void DygraphCodeGeneration(const std::string& output_dir) { /* --------------------------- */ VLOG(6) << "-------- GenerateForwardFunctionContents -------"; std::pair body_and_declaration = - GenerateForwardFunctionContents(fwd_info, bwd_info); + GenerateForwardFunctionContents(fwd_info, bwd_info, {}); fwd_function_str += body_and_declaration.first + "\n"; @@ -2387,6 +2594,30 @@ static void DygraphCodeGeneration(const std::string& output_dir) { std::string fwd_function_declare_str = body_and_declaration.second; dygraph_forward_api_str += fwd_function_declare_str; + auto& infer_inplace = + paddle::framework::OpInfoMap::Instance().Get(op_type).infer_inplace_; + std::map inplace_map; + // Inplace Function Generator. + // `sum` op has duplicate input. Don't consider adding inplace strategy + // for `sum` in temporary. + if (op_type != "sum" && infer_inplace) { + auto in_to_outs = infer_inplace(true); + for (auto& inplace_pair : in_to_outs) { + inplace_map[inplace_pair.second] = inplace_pair.first; + } + + VLOG(6) << "-------- GenerateInplaceForwardFunctionContents -------"; + std::pair inplace_body_and_declaration = + GenerateForwardFunctionContents(fwd_info, bwd_info, inplace_map); + + fwd_function_str += inplace_body_and_declaration.first + "\n"; + + VLOG(6) << "-------- GenerateInplaceDygraphForwardAPIContents -------"; + std::string inplace_fwd_function_declare_str = + inplace_body_and_declaration.second; + dygraph_forward_api_str += inplace_fwd_function_declare_str; + } + if (bwd_info.GenerateForwardOnly()) continue; VLOG(6) << "-------- GenerateGradNodeHeaderContents -------"; diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt index 53af6c1048d2454b1e9f375b837103930026ae54..771351dd4affbb355748c275a59681a6d5ba5577 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt @@ -27,6 +27,7 @@ add_custom_target(eager_final_state_codegen set(tmp_python_c_output_path "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/tmp_eager_final_state_op_function_impl.h") set(python_c_output_path "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/eager_final_state_op_function_impl.h") + add_custom_target(eager_final_state_python_c_codegen COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py" "--api_yaml_path=${api_yaml_path}" diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 967891fe5227dcd6129c0ef1808fba7720711568..1685b6f3cb5c3cc2ecbb1b773b9708703b3746c4 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ 
b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -28,6 +28,7 @@ namespace = "" yaml_types_mapping = { 'int' : 'int', 'int32' : 'int32_t', 'int64' : 'int64_t', 'size_t' : 'size_t', \ 'float' : 'float', 'double' : 'double', 'bool' : 'bool', \ + 'str' : 'std::string', \ 'Backend' : 'paddle::experimental::Backend', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \ 'int64[]' : 'std::vector', 'int[]' : 'std::vector', 'Tensor' : 'Tensor', @@ -148,6 +149,12 @@ def ReadBwdFile(filepath): ###################### ### Yaml Parsers ### ###################### +def RemoveSpecialSymbolsInName(string): + # Remove any name after '@' + ret = string.split("@")[0] + return ret + + def IntermediateValidationCheck(intermediate_outputs, forward_returns_list): # intermediate_outputs : [name0, name1, ...] # forward_returns_list : [[ret_name, type, orig_pos], ...] @@ -166,15 +173,19 @@ def IntermediateValidationCheck(intermediate_outputs, forward_returns_list): def ParseDispensable(string): # string: "X, Y" + string = RemoveSpecialSymbolsInName(string) return [v.strip() for v in string.split(",")] def ParseIntermediate(string): + string = RemoveSpecialSymbolsInName(string) return [v.strip() for v in string.split(",")] def ParseNoNeedBuffer(string): # string: "x, y" + string = RemoveSpecialSymbolsInName(string) + no_need_buffer_set = set() for name in string.split(","): no_need_buffer_set.add(name.strip()) @@ -202,8 +213,11 @@ def ParseYamlArgs(string): default_value = m.group(3).split("=")[1].strip() if len( m.group(3).split("=")) > 1 else None - assert arg_type in yaml_types_mapping.keys() + assert arg_type in yaml_types_mapping.keys( + ), f"The argument type {arg_type} in yaml config is not supported in yaml_types_mapping." arg_type = yaml_types_mapping[arg_type] + + arg_name = RemoveSpecialSymbolsInName(arg_name) if "Tensor" in arg_type: assert default_value is None inputs_list.append([arg_name, arg_type, i]) @@ -235,10 +249,12 @@ def ParseYamlReturns(string): else: ret_type = ret.strip() - assert ret_type in yaml_types_mapping.keys() + assert ret_type in yaml_types_mapping.keys( + ), f"The return type {ret_type} in yaml config is not supported in yaml_types_mapping." 
ret_type = yaml_types_mapping[ret_type] assert "Tensor" in ret_type + ret_name = RemoveSpecialSymbolsInName(ret_name) returns_list.append([ret_name, ret_type, i]) return returns_list @@ -462,6 +478,7 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, # SetTensorWrapper Methods & TensorWrapper Members set_tensor_wrapper_methods_str = "" tensor_wrapper_members_str = "" + clear_tensor_wrapper_str = "" for tname, (ttype, is_fwd_input, _) in backward_fwd_input_map.items(): if tname in no_need_buffer_set: no_need_buffer = "true" @@ -483,6 +500,13 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, """ tensor_wrapper_members_str += PLAIN_TENSOR_MEMBER_TEMPLATE.format( tensor_wrapper_name) + + CLEAR_TENSOR_WRAPPERS_TEMPLATE = """ + {}.clear(); +""" + clear_tensor_wrapper_str += CLEAR_TENSOR_WRAPPERS_TEMPLATE.format( + tensor_wrapper_name) + else: assert IsVectorTensorType(ttype) SET_VECTOR_TENSOR_WRAPPER_TEMPLATE = """ @@ -500,6 +524,15 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, """ tensor_wrapper_members_str += VECTOR_TENSOR_MEMBER_TEMPLATE.format( tensor_wrapper_name) + + CLEAR_TENSOR_WRAPPERS_TEMPLATE = """ + for (auto tw: {}) { + tw.clear(); + }; +""" + clear_tensor_wrapper_str += CLEAR_TENSOR_WRAPPERS_TEMPLATE.format( + tensor_wrapper_name) + # End: SetTensorWrapper Methods & TensorWrapper Members # SetAttributes & Attribute Members @@ -508,7 +541,7 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, for aname, atype, default_val, _ in backward_attrs_list: saved_attr_name = GetSavedName(aname) SET_ATTR_METHOD_TEMPLATE = """ - void SetAttribute{}({} {}) {{ + void SetAttribute{}({} {}) {{ {} = {}; }} """ @@ -539,25 +572,37 @@ class {} : public egr::GradNodeBase {{ ~{}() override = default; virtual std::vector> operator()( - const std::vector>& grads) override; + const std::vector>& grads, bool create_graph = false) override; std::string name() override {{ return \" {} \"; }} + + void ClearTensorWrappers() override {{ + {} + is_tensor_wrappers_cleared = true; + }} + // SetTensorWrapperX, SetTensorWrapperY, ... 
{} // SetAttributes {} + + bool IsTensorWrappersCleared() override {{ + return is_tensor_wrappers_cleared; + }} private: // TensorWrappers {} + bool is_tensor_wrappers_cleared = false; + // Attributes {} }}; """ node_declaration_str = NODE_DECLARATION_TEMPLATE.format( grad_node_name, grad_node_name, grad_node_name, grad_node_name, - grad_node_name, set_tensor_wrapper_methods_str, - set_attribute_methods_str, tensor_wrapper_members_str, - attribute_members_str) + grad_node_name, clear_tensor_wrapper_str, + set_tensor_wrapper_methods_str, set_attribute_methods_str, + tensor_wrapper_members_str, attribute_members_str) return node_declaration_str @@ -611,6 +656,7 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map, else: # Rearrange output order accordingly returns_str += f"returns[{fwd_position}] = grad_api_returns[{grad_api_position}];\n" + returns_str += f"if(NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns);\n" returns_str += f"return returns;\n" grad_node_name = GetGradNodeName(fwd_api_name) @@ -621,7 +667,7 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map, grad_api_namespace = f"paddle::experimental" FUNCTION_TEMPLATE = """ -std::vector> {}::operator()(const std::vector>& grads) {{ +std::vector> {}::operator()(const std::vector>& grads, bool create_graph) {{ // Call grad_api function auto grad_api_returns = {}::{}({}); {} @@ -684,7 +730,7 @@ def GenerateNodeCreationCodes( else: # Tuple api_result if IsPlainTensorType(rtype): - output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);" + output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&std::get<{pos}>(api_result));" else: assert IsVectorTensorType(rtype) output_autograd_meta = f" std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);\n" @@ -721,8 +767,11 @@ def GenerateNodeCreationCodes( else: set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, true);" else: - if IsVectorTensorType(atype): - tw_name = f"api_result[{pos}]" + if num_fwd_outputs > 1: + # Aligned with forward output position + assert name in forward_outputs_position_map.keys() + fwd_output_pos = forward_outputs_position_map[name][1] + tw_name = f"std::get<{fwd_output_pos}>(api_result)" else: tw_name = f"api_result" @@ -738,7 +787,7 @@ def GenerateNodeCreationCodes( set_edges_list = [] for name, (_, pos) in forward_inputs_position_map.items(): input_autograd_meta_name = GetAutoGradMetaName(name) - set_grad_out_meta = f" grad_node->SetGradOutMeta({input_autograd_meta_name}, {pos});" + set_grad_out_meta = f" grad_node->SetGradOutMeta({name}, {pos});" set_edges = f" grad_node->AddEdges({input_autograd_meta_name}, {pos});" set_grad_out_meta_list.append(set_grad_out_meta) set_edges_list.append(set_edges) @@ -755,17 +804,18 @@ def GenerateNodeCreationCodes( output_autograd_meta_name = GetAutoGradMetaName(name) set_out_rank = f" egr::EagerUtils::SetOutRankWithSlot({output_autograd_meta_name}, {pos});" set_history = f" egr::EagerUtils::SetHistory({output_autograd_meta_name}, grad_node);" - set_grad_in_meta = f" grad_node->SetGradInMeta({output_autograd_meta_name}, {pos});" + if num_outputs == 1: + set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(api_result);" + set_grad_in_meta = f" grad_node->SetGradInMeta(api_result, {pos});" + else: + set_retain_grad = f" 
egr::EagerUtils::CheckAndRetainGrad(std::get<{pos}>(api_result));" + set_grad_in_meta = f" grad_node->SetGradInMeta(std::get<{pos}>(api_result), {pos});" set_out_rank_list.append(set_out_rank) set_history_list.append(set_history) set_grad_in_meta_list.append(set_grad_in_meta) - - if num_outputs == 1: - set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(api_result);" - else: - set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(api_result[{pos}]);" set_retain_grad_list.append(set_retain_grad) + set_out_rank_str = "\n".join(set_out_rank_list) set_history_str = "\n".join(set_history_list) set_grad_in_meta_str = "\n".join(set_grad_in_meta_list) @@ -887,7 +937,7 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, returns_list[0] = f"api_result" else: # Tuple api_result - returns_list[pos] = f"api_result[{pos}]" + returns_list[pos] = f"std::get<{pos}>(api_result)" if IsPlainTensorType(rtype): returns_type_list[pos] = "paddle::experimental::Tensor" @@ -910,8 +960,20 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map, backward_attrs_list, optional_inputs) + node_event_name = fwd_api_name + " node_creation" + NODE_CREATION_TEMPLATE = """{{\n + paddle::platform::RecordEvent node_creation_record_event(\"{}\", paddle::platform::TracerEventType::Operator, 1);\n + {}\n + }}""" + node_creation_str = NODE_CREATION_TEMPLATE.format(node_event_name, + node_creation_str) + + dygraph_event_str = f"paddle::platform::RecordEvent dygraph_entrance_record_event(\"{fwd_api_name} dygraph\", paddle::platform::TracerEventType::Operator, 1);" + FORWARD_FUNCTION_TEMPLATE = """ {} {}({}) {{ + {} + // Forward API Call {} @@ -925,7 +987,7 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, forward_function_name = GetForwardFunctionName(fwd_api_name) forward_function_str = FORWARD_FUNCTION_TEMPLATE.format( returns_type_str, forward_function_name, inputs_args_definition_str, - forward_call_str, node_creation_str, returns_str) + dygraph_event_str, forward_call_str, node_creation_str, returns_str) forward_function_declaration_str = f"{returns_type_str} {forward_function_name}({inputs_args_declaration_str});" return forward_function_str, forward_function_declaration_str @@ -1025,7 +1087,7 @@ def GenerateNodeCCFile(filepath, node_definition_str): #include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h" #include "paddle/fluid/eager/to_static/run_program_op_node.h" -#include "paddle/phi/api/include/sparse_api.h" +#include "paddle/phi/api/backward/sparse_bw_api.h" """ file_contents += node_definition_str with open(filepath, 'a') as f: @@ -1052,6 +1114,8 @@ def GenerateForwardCCFile(filepath, forward_definition_str): #include "paddle/phi/api/include/sparse_api.h" #include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" + """ file_contents += GenerateCoreOpInfoDefinition() @@ -1218,7 +1282,7 @@ if __name__ == "__main__": # Node Definition Generation definition_declaration_pair = GenerateForwardDefinition( fwd_api_name, bwd_api_name, forward_inputs_position_map, - forward_outputs_position_map, forward_attrs_list, + forward_outputs_position_map, orig_forward_attrs_list, backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map, backward_attrs_list, optional_inputs, intermediate_outputs) @@ -1230,7 +1294,7 @@ if __name__ == "__main__": # For python-level API dispatch CollectCoreOpsInformation(fwd_api_name, forward_inputs_position_map, 
forward_outputs_position_map, - forward_attrs_list) + orig_forward_attrs_list) if len(namespace) > 0: forward_definition_str += f"""namespace {namespace} {{ diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index eee32a2c5057d523212a4faa5eca8678e961f417..e1c2cf871ea4276d6423c8ed9c62076d13a024ec 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -14,9 +14,18 @@ import os import argparse +import logging from eager_gen import namespace, yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap -skipped_fwd_api_names = set(["scale"]) +########################### +## Global Configurations ## +########################### +skipped_forward_api_names = set(["scale"]) + + +def SkipAPIGeneration(forward_api_name): + return (forward_api_name in skipped_forward_api_names) + atype_to_parsing_function = { "bool": "CastPyArg2Boolean", @@ -24,7 +33,7 @@ atype_to_parsing_function = { "long": "CastPyArg2Long", "int64_t": "CastPyArg2Long", "float": "CastPyArg2Float", - "string": "CastPyArg2String", + "std::string": "CastPyArg2String", "std::vector": "CastPyArg2Booleans", "std::vector": "CastPyArg2Ints", "std::vector": "CastPyArg2Longs", @@ -39,64 +48,35 @@ atype_to_parsing_function = { } -def ParseArguments(): - parser = argparse.ArgumentParser( - description='Eager Code Generator Args Parser') - parser.add_argument('--api_yaml_path', type=str) - parser.add_argument('--output_path', type=str) - - args = parser.parse_args() - return args - - def FindParsingFunctionFromAttributeType(atype): if atype not in atype_to_parsing_function.keys(): - print(f"Unable to find {atype} in atype_to_parsing_function.") - assert False + assert False, f"Unable to find {atype} in atype_to_parsing_function." return atype_to_parsing_function[atype] -def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map, - forward_attrs_list, forward_outputs_position_map, - optional_inputs, is_forward_only): - # forward_inputs_position_map = { "name" : [type, fwd_position] } - # forward_outputs_position_map = { "name" : [type, fwd_position] } - # forward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...] - # optional_inputs = [name0, ...] 
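The eager_gen.py hunks above wrap the generated forward function in two paddle::platform::RecordEvent guards: one covering the whole dygraph entrance and a scoped one around grad-node creation, so the profiler can attribute those costs separately. A minimal sketch of roughly what the generated code looks like after this change; the op name "matmul", its arguments, and the API call are placeholders rather than real generator output:

paddle::experimental::Tensor matmul_final_state_dygraph_function(
    const paddle::experimental::Tensor& x,
    const paddle::experimental::Tensor& y) {
  // Emitted from dygraph_event_str: covers the whole forward call.
  paddle::platform::RecordEvent dygraph_entrance_record_event(
      "matmul dygraph", paddle::platform::TracerEventType::Operator, 1);

  // Forward API Call
  auto api_result = paddle::experimental::matmul(x, y);

  {
    // Emitted from NODE_CREATION_TEMPLATE: grad-node construction shows up
    // as a separate "matmul node_creation" event.
    paddle::platform::RecordEvent node_creation_record_event(
        "matmul node_creation", paddle::platform::TracerEventType::Operator, 1);
    // ... grad node construction, SetGradInMeta/SetGradOutMeta, AddEdges ...
  }

  return api_result;
}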
- - # Get EagerTensor from args - # Get dygraph function call args - num_args = len(forward_inputs_position_map.keys()) + len(forward_attrs_list) - num_input_tensors = len(forward_inputs_position_map.keys()) - dygraph_function_call_list = ["" for i in range(num_args)] - get_eager_tensor_str = "" - for name, (ttype, pos) in forward_inputs_position_map.items(): - is_optional = (name in optional_inputs) - if IsVectorTensorType(ttype): - get_eager_tensor_str += f" auto {name} = GetTensorListFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n" - else: - if is_optional: - get_eager_tensor_str += f" auto {name} = GetOptionalTensorFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n" - else: - get_eager_tensor_str += f" auto {name} = GetTensorFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n" - dygraph_function_call_list[pos] = f"{name}" +########################## +## Refactored Functions ## +########################## +PARSE_PYTHON_C_TENSORS_TEMPLATE = \ +" auto {} = {}(\"{}\", \"{}\", args, {}, false);\n" + - parse_attributes_str = "" - # Get Attributes - for name, atype, _, pos in forward_attrs_list: - parsing_function = FindParsingFunctionFromAttributeType(atype) - key = f"{name}" +PARSE_PYTHON_C_ARGS_TEMPLATE = \ +""" PyObject* {}_obj = PyTuple_GET_ITEM(args, {});\n + {} {} = {}({}_obj, \"{}\", {});\n""" - parse_attributes_str += f" PyObject* {name}_obj = PyTuple_GET_ITEM(args, {pos});\n" - parse_attributes_str += f" {atype} {name} = {parsing_function}({name}_obj, \"{fwd_api_name}\", {pos});\n" - dygraph_function_call_list[pos] = f"{name}" - dygraph_function_call_str = ",".join(dygraph_function_call_list) +RECORD_EVENT_TEMPLATE = \ +" paddle::platform::RecordEvent {}(\"{} {}\", paddle::platform::TracerEventType::Operator, 1);" - PYTHON_C_FUNCTION_TEMPLATE = """ + +PYTHON_C_FUNCTION_TEMPLATE = \ +""" static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObject *kwargs) {{ + {} + PyThreadState *tstate = nullptr; try {{ @@ -126,26 +106,50 @@ static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObj }} """ - namespace_str = "" - if len(namespace) > 0: - namespace_str = f"{namespace}::" - if is_forward_only: - fwd_function_name = "paddle::experimental::" + namespace_str + fwd_api_name - else: - fwd_function_name = namespace_str + GetForwardFunctionName(fwd_api_name) - python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format( - fwd_api_name, fwd_api_name, get_eager_tensor_str, parse_attributes_str, - fwd_function_name, dygraph_function_call_str) +FUNCTION_NAME_TEMPLATE = \ +"{}{}{}" - python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void)) {namespace_str}eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {fwd_api_name} in dygraph.\"}}\n" - return python_c_function_str, python_c_function_reg_str +PYTHON_C_FUNCTION_REG_TEMPLATE = \ +"{{\"final_state_{}\", (PyCFunction)(void(*)(void)) {}eager_final_state_api_{}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {} in dygraph.\"}}" -def GenerateCoreOpsInfoMap(): - result = """ +PYTHON_C_WRAPPER_TEMPLATE = \ +""" +#pragma once + +#include "pybind11/detail/common.h" +#include "paddle/phi/api/all.h" +#include "paddle/phi/api/lib/dygraph_api.h" +#include "paddle/phi/common/backend.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/api/include/sparse_api.h" +#include 
"paddle/fluid/pybind/op_function_common.h" +#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" +#include "paddle/fluid/pybind/exception.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" +#include + +namespace paddle {{ +namespace pybind {{ + +{} + +static PyMethodDef EagerFinalStateMethods[] = {{ + {} +}}; + +}} // namespace pybind +}} // namespace paddle +""" + + +CORE_OPS_INFO = \ +""" static PyObject * eager_get_final_state_core_ops_args_info(PyObject *self) { PyThreadState *tstate = nullptr; try @@ -190,9 +194,11 @@ static PyObject * eager_get_final_state_core_ops_returns_info(PyObject *self) { return nullptr; } } - """ +""" + - core_ops_infos_registry = """ +CORE_OPS_INFO_REGISTRY = \ +""" {\"get_final_state_core_ops_args_info\", (PyCFunction)(void(*)(void))eager_get_final_state_core_ops_args_info, METH_NOARGS, \"C++ interface function for eager_get_final_state_core_ops_args_info.\"}, @@ -205,7 +211,259 @@ static PyObject * eager_get_final_state_core_ops_returns_info(PyObject *self) { METH_NOARGS, \"C++ interface function for eager_get_final_state_core_ops_returns_info.\"}, """ - return result, core_ops_infos_registry +NAMESPACE_WRAPPER_TEMPLATE = \ +"""namespace {} {{ + {} +}} +""" + + +####################### +## Generator Classes ## +####################### +class PythonCSingleFunctionGenerator: + def __init__(self, fwd_api_contents, namespace): + self.fwd_api_contents = fwd_api_contents + self.namespace = namespace + + # Raw Contents + self.forward_api_name = "" + self.forward_args_str = "" + self.forward_returns_str = "" + + # Raw Data + self.forward_attrs_list = None #[ [attr_name, attr_type, default_value, orig_position], ...] + self.forward_inputs_list = None #[ [arg_name, arg_type, orig_position], ...] + self.forward_returns_list = None #[ [ret_name, ret_type, orig_position], ...] + + # Processed Data + self.forward_inputs_position_map = None #{ "name" : [type, fwd_position] } + self.forward_outputs_position_map = None #{ "name" : [type, fwd_position] } + + # Special Op Attributes + self.optional_inputs = [] #[name, ...] 
+ self.is_forward_only = True + + # Generated Results + self.python_c_function_str = "" + self.python_c_function_reg_str = "" + + def CollectRawContents(self): + fwd_api_contents = self.fwd_api_contents + + assert 'api' in fwd_api_contents.keys( + ), "Unable to find \"api\" in fwd_api_contents keys" + assert 'args' in fwd_api_contents.keys( + ), "Unable to find \"args\" in fwd_api_contents keys" + assert 'output' in fwd_api_contents.keys( + ), "Unable to find \"output\" in fwd_api_contents keys" + + self.forward_api_name = fwd_api_contents['api'] + self.forward_args_str = fwd_api_contents['args'] + self.forward_returns_str = fwd_api_contents['output'] + + def CollectIsForwardOnly(self): + fwd_api_contents = self.fwd_api_contents + self.is_forward_only = False if 'backward' in fwd_api_contents.keys( + ) else True + + def CollectOptionalInputs(self): + fwd_api_contents = self.fwd_api_contents + if 'optional' in fwd_api_contents.keys(): + self.optional_inputs = ParseDispensable(fwd_api_contents[ + 'optional']) + + def CollectForwardInOutAttr(self): + forward_args_str = self.forward_args_str + forward_returns_str = self.forward_returns_str + + self.forward_inputs_list, self.forward_attrs_list, self.forward_returns_list = ParseYamlForward( + forward_args_str, forward_returns_str) + + def CollectForwardPositionMap(self): + forward_inputs_list = self.forward_inputs_list + forward_returns_list = self.forward_returns_list + + self.forward_inputs_position_map, self.forward_outputs_position_map = DetermineForwardPositionMap( + forward_inputs_list, forward_returns_list) + + def GeneratePythonCFunction(self): + namespace = self.namespace + forward_api_name = self.forward_api_name + forward_attrs_list = self.forward_attrs_list + forward_inputs_position_map = self.forward_inputs_position_map + forward_outputs_position_map = self.forward_outputs_position_map + optional_inputs = self.optional_inputs + is_forward_only = self.is_forward_only + + # Generate Python-C Tensors Parsing Logic + get_eager_tensor_str = "" + for name, (ttype, pos) in forward_inputs_position_map.items(): + is_optional = (name in optional_inputs) + if IsVectorTensorType(ttype): + get_eager_tensor_str += PARSE_PYTHON_C_TENSORS_TEMPLATE.format( + name, "GetTensorListFromArgs", forward_api_name, name, pos) + else: + if is_optional: + get_eager_tensor_str += PARSE_PYTHON_C_TENSORS_TEMPLATE.format( + name, "GetOptionalTensorFromArgs", forward_api_name, + name, pos) + else: + get_eager_tensor_str += PARSE_PYTHON_C_TENSORS_TEMPLATE.format( + name, "GetTensorFromArgs", forward_api_name, name, pos) + + parse_attributes_str = "" + + # Generate Python-C Attributes Parsing Logic + for name, atype, _, pos in forward_attrs_list: + parsing_function_name = FindParsingFunctionFromAttributeType(atype) + parse_attributes_str += PARSE_PYTHON_C_ARGS_TEMPLATE.format( + name, pos, atype, name, parsing_function_name, name, + forward_api_name, pos) + + # Generate Dygraph Function Call Logic + num_args = len(forward_inputs_position_map.keys()) + len( + forward_attrs_list) + dygraph_function_call_list = ["" for i in range(num_args)] + for name, (_, pos) in forward_inputs_position_map.items(): + dygraph_function_call_list[pos] = f"{name}" + for name, _, _, pos in forward_attrs_list: + dygraph_function_call_list[pos] = f"{name}" + dygraph_function_call_str = ",".join(dygraph_function_call_list) + + # Generate Python-C Function Definitions + if is_forward_only: + fwd_function_name = FUNCTION_NAME_TEMPLATE.format( + "paddle::experimental::", namespace, 
forward_api_name) + else: + fwd_function_name = FUNCTION_NAME_TEMPLATE.format( + "::", namespace, GetForwardFunctionName(forward_api_name)) + + # Generate Record Event for performance profiling + pythonc_record_event_str = RECORD_EVENT_TEMPLATE.format( + "pythonc_record_event", forward_api_name, "pybind_imperative_func") + self.python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format( + forward_api_name, pythonc_record_event_str, forward_api_name, + get_eager_tensor_str, parse_attributes_str, fwd_function_name, + dygraph_function_call_str) + + # Generate Python-C Function Registration + self.python_c_function_reg_str = PYTHON_C_FUNCTION_REG_TEMPLATE.format( + forward_api_name, namespace, forward_api_name, forward_api_name) + + def run(self): + # Initialized is_forward_only + self.CollectIsForwardOnly() + + # Initialized forward_api_name, forward_args_str, forward_returns_str + self.CollectRawContents() + if SkipAPIGeneration(self.forward_api_name): return False + + # Initialized optional_inputs + self.CollectOptionalInputs() + + # Initialized forward_inputs_list, forward_returns_list, forward_attrs_list + self.CollectForwardInOutAttr() + logging.info( + f"Parsed Original Forward Inputs List: \n{self.forward_inputs_list}") + logging.info( + f"Prased Original Forward Attrs List: \n{self.forward_attrs_list}") + logging.info( + f"Parsed Original Forward Returns List: \n{self.forward_returns_list}" + ) + + # Initialized forward_inputs_position_map, forward_outputs_position_map + self.CollectForwardPositionMap() + logging.info( + f"Generated Forward Input Position Map: {self.forward_inputs_position_map}" + ) + logging.info( + f"Generated Forward Output Position Map: {self.forward_outputs_position_map}" + ) + + # Code Generation + self.GeneratePythonCFunction() + logging.info( + f"Generated Python-C Function: {self.python_c_function_str}") + logging.info( + f"Generated Python-C Function Declaration: {self.python_c_function_reg_str}" + ) + + return True + + +class PythonCYamlGenerator: + def __init__(self, path): + self.yaml_path = path + + self.namespace = "" + self.forward_api_list = [] + + # Generated Result + self.python_c_functions_reg_str = "" + self.python_c_functions_str = "" + + def ParseYamlContents(self): + yaml_path = self.yaml_path + self.forward_api_list = ReadFwdFile(yaml_path) + + def GeneratePythonCFunctions(self): + namespace = self.namespace + forward_api_list = self.forward_api_list + + for forward_api_content in forward_api_list: + f_generator = PythonCSingleFunctionGenerator(forward_api_content, + namespace) + status = f_generator.run() + + if status == True: + self.python_c_functions_reg_str += f_generator.python_c_function_reg_str + ",\n" + self.python_c_functions_str += f_generator.python_c_function_str + "\n" + + def InferNameSpace(self): + yaml_path = self.yaml_path + if "sparse" in yaml_path: + self.namespace = "sparse::" + + def AttachNamespace(self): + namespace = self.namespace + python_c_functions_str = self.python_c_functions_str + + if namespace != "": + if namespace.endswith("::"): + namespace = namespace[:-2] + self.python_c_functions_str = NAMESPACE_WRAPPER_TEMPLATE.format( + namespace, python_c_functions_str) + + def run(self): + # Infer namespace from yaml_path + self.InferNameSpace() + + # Read Yaml file + self.ParseYamlContents() + + # Code Generation + self.GeneratePythonCFunctions() + + # Wrap with namespace + self.AttachNamespace() + + +############################ +## Code Generation Helper ## +############################ +def ParseArguments(): + 
parser = argparse.ArgumentParser( + description='Eager Code Generator Args Parser') + parser.add_argument('--api_yaml_path', type=str) + parser.add_argument('--output_path', type=str) + + args = parser.parse_args() + return args + + +def GenerateCoreOpsInfoMap(): + return CORE_OPS_INFO, CORE_OPS_INFO_REGISTRY def GeneratePythonCWrappers(python_c_function_str, python_c_function_reg_str): @@ -217,35 +475,6 @@ def GeneratePythonCWrappers(python_c_function_str, python_c_function_reg_str): python_c_function_reg_str += core_ops_infos_registry python_c_function_reg_str += "\n {nullptr,nullptr,0,nullptr}" - PYTHON_C_WRAPPER_TEMPLATE = """ -#pragma once - -#include "pybind11/detail/common.h" -#include "paddle/phi/api/all.h" -#include "paddle/phi/api/lib/dygraph_api.h" -#include "paddle/phi/common/backend.h" -#include "paddle/phi/common/data_type.h" -#include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" -#include "paddle/phi/api/include/sparse_api.h" -#include "paddle/fluid/pybind/op_function_common.h" -#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" -#include "paddle/fluid/pybind/exception.h" -#include - -namespace paddle {{ -namespace pybind {{ - -{} - -static PyMethodDef EagerFinalStateMethods[] = {{ - {} -}}; - -}} // namespace pybind -}} // namespace paddle - -""" python_c_str = PYTHON_C_WRAPPER_TEMPLATE.format(python_c_function_str, python_c_function_reg_str) @@ -259,86 +488,23 @@ def GeneratePythonCFile(filepath, python_c_str): if __name__ == "__main__": args = ParseArguments() - api_yaml_paths = args.api_yaml_path.split(",") - python_c_functions_reg_str = "" - python_c_functions_str = "" - + generated_python_c_functions = "" + generated_python_c_registration = "" for i in range(len(api_yaml_paths)): api_yaml_path = api_yaml_paths[i] - if "sparse" in api_yaml_path: - namespace = "sparse" - else: - namespace = "" - - fwd_api_list = ReadFwdFile(api_yaml_path) - - python_c_function_list = [] - python_c_function_reg_list = [] - for fwd_api in fwd_api_list: - - # We only generate Ops with grad - is_forward_only = False - if 'backward' not in fwd_api.keys(): - is_forward_only = True - - assert 'api' in fwd_api.keys() - assert 'args' in fwd_api.keys() - assert 'output' in fwd_api.keys() - - fwd_api_name = fwd_api['api'] - fwd_args_str = fwd_api['args'] - fwd_returns_str = fwd_api['output'] - - if fwd_api_name in skipped_fwd_api_names: - continue - - # Parse Dispensable Inputs - optional_inputs = [] - if 'optional' in fwd_api.keys(): - optional_inputs = ParseDispensable(fwd_api['optional']) - - # Collect Original Forward Inputs/Outputs and then perform validation checks - forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForward( - fwd_args_str, fwd_returns_str) - print("Parsed Original Forward Inputs List: ", forward_inputs_list) - print("Prased Original Forward Attrs List: ", forward_attrs_list) - print("Parsed Original Forward Returns List: ", - forward_returns_list) - - forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap( - forward_inputs_list, forward_returns_list) - print("Generated Forward Input Position Map: ", - forward_inputs_position_map) - print("Generated Forward Output Position Map: ", - forward_outputs_position_map) - - python_c_function_str, python_c_function_reg_str = GeneratePythonCFunction( - fwd_api_name, forward_inputs_position_map, forward_attrs_list, - forward_outputs_position_map, optional_inputs, is_forward_only) - 
python_c_function_list.append(python_c_function_str) - python_c_function_reg_list.append(python_c_function_reg_str) - print("Generated Python-C Function: ", python_c_function_str) - - # Append Namespace - python_c_functions_reg_str += ",\n".join( - python_c_function_reg_list) + "," - python_c_functions = "\n".join(python_c_function_list) - if len(namespace) > 0: - python_c_functions_str += f"""namespace {namespace} {{ - {python_c_functions} -}} -""" + y_generator = PythonCYamlGenerator(api_yaml_path) + y_generator.run() - else: - python_c_functions_str += python_c_functions + generated_python_c_functions += y_generator.python_c_functions_str + "\n" + generated_python_c_registration += y_generator.python_c_functions_reg_str + "\n" - python_c_str = GeneratePythonCWrappers(python_c_functions_str, - python_c_functions_reg_str) + python_c_str = GeneratePythonCWrappers(generated_python_c_functions, + generated_python_c_registration) - print("Generated Python-C Codes: ", python_c_str) + logging.info(f"Generated Python-C Codes: \n{python_c_str}") output_path = args.output_path for path in [output_path]: diff --git a/paddle/fluid/eager/autograd_meta.h b/paddle/fluid/eager/autograd_meta.h index 9e1dc4f2c8c6ba5c1c7d0c49e5d141d1a6c4c6d3..dca76d3b8a0db8c4284960005bfbad33ce23e20d 100644 --- a/paddle/fluid/eager/autograd_meta.h +++ b/paddle/fluid/eager/autograd_meta.h @@ -145,8 +145,7 @@ class AutogradMeta : public AbstractAutogradMeta { private: // TODO(jiabin) :Should we use pointer instead of object? std::shared_ptr grad_{ - std::make_shared( - egr::Controller::Instance().GenerateUniqueName("@grad"))}; + std::make_shared()}; // GradNodeBase is base class of all grad op which is a // wrapper for grad op. This class will make grad op easy diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 934497d7d179c1732bde68c147ed86661c25ddae..17bc2441488aa3c4fc62a37e825eeb94cafea9bb 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -19,6 +19,8 @@ #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/grad_tensor_holder.h" #include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" @@ -37,12 +39,21 @@ std::unordered_map getInDegreeMap( // Copy nodes std::queue queue = init_queue; std::unordered_set visited; + size_t potential_startup_ops_cnt = queue.size(); + size_t cnt = 0; // Visit each node exactly once in any order while (!queue.empty()) { GradNodeBase* node = queue.front(); queue.pop(); + if (cnt < potential_startup_ops_cnt) { + if (!node_in_degree_map.count(node)) { + node_in_degree_map[node] = 0; + } + cnt += 1; + } + if (visited.count(node)) { continue; } @@ -74,20 +85,248 @@ std::unordered_map getInDegreeMap( return node_in_degree_map; } -void RunBackward(const std::vector& tensors, - const std::vector& grad_tensors, - bool retain_graph) { +// Remove some nodes those doesn't need to be +// stored in potential_stop_nodes、potential_startup_nodes +void UpdateGraphInfo( + std::unordered_map* + target_nodes_inputmeta_map, + std::unordered_map>* + depending_nodes, + std::unordered_set* potential_stop_nodes, + std::unordered_set* potential_startup_nodes) { + // Updated potential_sotp_nodes by depending_nodes, + // make sure the path from root to target_node is ok + std::unordered_set _startup_ops; + VLOG(6) << "Running in UpdateGraphInfo"; + std::queue queue; + for 
(auto& target_nodes_inputmeta_pair : *target_nodes_inputmeta_map) { + queue.emplace(target_nodes_inputmeta_pair.first); + } + + while (!queue.empty()) { + auto* target_node = queue.front(); + queue.pop(); + if (!(*depending_nodes)[target_node].empty()) { + auto precedding_nodes = (*depending_nodes)[target_node]; + for (auto pre_nodes : precedding_nodes) { + queue.emplace(pre_nodes); + if (potential_stop_nodes->find(pre_nodes) != + potential_stop_nodes->end()) { + potential_stop_nodes->erase(pre_nodes); + } + } + } else { // startup_ops have no precedding nodes + VLOG(6) << "Emplace _startup_ops"; + _startup_ops.emplace(target_node); + } + } + // Purify potential_startup_nodes again, remove some + // potential startup_nodes that unreach to input target nodes + if (!_startup_ops.empty()) { + std::unordered_set potential_startup_nodes_to_be_erased; + for (auto node : *potential_startup_nodes) { + if (_startup_ops.count(node) == 0) { + VLOG(6) << "Set up potential_startup_nodes_to_be_erased"; + potential_startup_nodes_to_be_erased.emplace(node); + } + } + if (!potential_startup_nodes_to_be_erased.empty()) { + for (auto node : potential_startup_nodes_to_be_erased) { + VLOG(6) << "Erase nodes in potential_startup_nodes_to_be_erased"; + potential_startup_nodes->erase(node); + } + } + } +} + +// Get Graph Info Betweent input target gradnode and outputs, +// record depending_nodes、 potential_stop_nodes、potential_startup_nodes +void GetGraphInfoBetweenTargets( + const std::queue& init_queue, + std::unordered_map* + input_target_nodes_inputmeta_map, + std::unordered_map>* + depending_nodes, + std::unordered_set* potential_stop_nodes, + std::unordered_set* potential_startup_nodes) { + if (input_target_nodes_inputmeta_map->empty()) return; + + VLOG(6) << "Runing In GetGraphInfoBetweenTargets"; + + // Calculate in_degree for each node + std::unordered_map node_in_degree_map; + + // Copy nodes + std::queue queue = init_queue; + std::unordered_set visited; + + // Visit each node exactly once in any order + while (!queue.empty()) { + GradNodeBase* node = queue.front(); + queue.pop(); + + if (visited.count(node)) { + continue; + } + visited.insert(node); + + // Check node is target_nodes or not, if node is not target_node, + // all the next_node will be marked in potential_stop_nodes + bool is_potential_stop_nodes = + input_target_nodes_inputmeta_map->count(node); + + // Find and append next nodes + const std::vector>& edges = node->GetEdges(); + for (const auto& edge_list : edges) { + for (const Edge& edge : edge_list) { + GradNodeBase* next_node = edge.GetMutableGradNode().get(); + + // Next node could be nullptr if it is leaf tensor with no + // AccumulationNode attached + // Or it could also originated from dispensable inputs + if (!next_node) continue; + + // if node not in input_target_nodes, + // all the next_nodes of current node will be inserted to + // potential_stop_node + if (is_potential_stop_nodes) { + potential_stop_nodes->emplace(next_node); + } + + // Update in_degree + if (!node_in_degree_map.count(next_node)) + node_in_degree_map[next_node] = 0; + node_in_degree_map[next_node]++; + + // Record depending relationship + (*depending_nodes)[next_node].emplace(node); + queue.push(next_node); + } + } + } + // Update Graph Info, remove some stop_node in potential_stop_nodes + UpdateGraphInfo(input_target_nodes_inputmeta_map, depending_nodes, + potential_stop_nodes, potential_startup_nodes); +} + +void GetTargetNodesInfo(const std::vector& inputs, + std::unordered_map* + 
target_nodes_inputmeta_map) { + VLOG(6) << "Running in GetTargetNodesInfo"; + if (!inputs.empty()) { + VLOG(6) << "Inputs are not empty"; + size_t num_inputs = inputs.size(); + for (size_t i = 0; i < num_inputs; i++) { + AutogradMeta* auto_grad_meta = + EagerUtils::unsafe_autograd_meta(inputs[i]); + auto target_node = auto_grad_meta->GetMutableGradNode().get(); + + PADDLE_ENFORCE_NOT_NULL(target_node, + paddle::platform::errors::Fatal( + "There is no grad op for input:%d or it's" + "stop_gradient=True", + i)); + (*target_nodes_inputmeta_map)[target_node] = auto_grad_meta; + } + } +} + +std::vector GetResults( + const std::vector& inputs, + std::unordered_map* + results_map, + bool allow_unused, bool create_graph) { + VLOG(6) << "Running in GetResults"; + if (inputs.empty()) return {}; + + std::vector results; + results.reserve(inputs.size()); + + for (size_t i = 0; i < inputs.size(); ++i) { + auto& input = inputs[i]; + AutogradMeta* auto_grad_meta = EagerUtils::unsafe_autograd_meta(input); + auto target_node = auto_grad_meta->GetMutableGradNode().get(); + + auto iter = results_map->find(target_node); + if (iter != results_map->end()) { + // set StopGradient = !create_graph + AutogradMeta* tensor_auto_grad_meta = + EagerUtils::autograd_meta(&(iter->second)); + tensor_auto_grad_meta->SetStopGradient(!create_graph); + results.emplace_back(iter->second); + } else { + PADDLE_ENFORCE_EQ(allow_unused, true, + paddle::platform::errors::InvalidArgument( + "The %d-th input does not appear in the backward " + "graph. Please check the input variable or set " + "allow_unused=True to get None result.", + i)); + results.emplace_back(); + } + } + return results; +} + +// Enforce GradNode has TensorWrappers as Input +void EnforceGradNodeHasInput(GradNodeBase* node) { + VLOG(6) << "Running in EnforceGradNodeHasInput"; + PADDLE_ENFORCE_NE( + node->IsTensorWrappersCleared(), true, + paddle::platform::errors::Fatal( + "The TensorWrappers of %s do not exist. This may be because:\n" + "You calculate backward twice for the same subgraph without " + "setting retain_graph=True. 
Please set retain_graph=True in the " + "first backward/grad call.\n", + node->name())); +} + +// Purify potential_startup_nodes, remove nodes those are the same as +// input_target_nodes +void PurifyPotentialStartUpNodes( + std::unordered_set* potential_startup_nodes, + std::unordered_map* + input_target_nodes_inputmeta_map) { + VLOG(6) << "Running in PurifyPotentialStartUpNodes"; + if (input_target_nodes_inputmeta_map->empty()) return; + std::unordered_set potential_startup_nodes_to_be_erased; + for (auto startup_op : *potential_startup_nodes) { + auto iter = input_target_nodes_inputmeta_map->find(startup_op); + if (iter != input_target_nodes_inputmeta_map->end()) { + potential_startup_nodes_to_be_erased.emplace(iter->first); + } + } + if (!potential_startup_nodes_to_be_erased.empty()) { + for (auto nodes : potential_startup_nodes_to_be_erased) { + potential_startup_nodes->erase(nodes); + } + } +} + +std::vector RunBackward( + const std::vector& tensors, // output + const std::vector& grad_tensors, + bool retain_graph, bool create_graph = false, + const std::vector& inputs = {}, + bool allow_unused = false, + const std::vector& no_grad_vars = {}) { VLOG(6) << "Start Backward"; // *Gradient Hook should happen at node-level // *Inplace version check should perform at node-level // *Cross-batch accumulation happens at forward pass + std::unordered_map + no_grad_var_nodes_inputmeta_map; + // Get no_grad_vars's GradNodes and InputMeta Info + GetTargetNodesInfo(no_grad_vars, &no_grad_var_nodes_inputmeta_map); + /* --- Initialization --- */ // 1. Init queue with starting nodes // 2. Prepare initial input buffers std::queue queue; std::unordered_map> node_input_buffers_dict; + std::unordered_set potential_startup_nodes; for (size_t i = 0; i < tensors.size(); i++) { const paddle::experimental::Tensor& tensor = tensors[i]; @@ -112,7 +351,8 @@ void RunBackward(const std::vector& tensors, // Prepare GradTensorHolder if (!node_input_buffers_dict.count(grad_node)) { - VLOG(6) << "Create Value for grad input tensor " << i; + VLOG(6) << "Create Value for grad input tensor " << i + << " of grad node: " << grad_node->name(); node_input_buffers_dict[grad_node] = std::make_unique(grad_node->InputMeta()); } @@ -126,8 +366,17 @@ void RunBackward(const std::vector& tensors, "size = 0 or same size as tensors")); // Feed given tensor if it's provided VLOG(6) << "Fill grad input tensor " << i << "with give grad tensor"; - node_input_buffers_dict[grad_node]->add( - input_info.first, input_info.second, grad_tensors[i]); + + if (grad_tensors[i].is_initialized()) { + // Deep copy + paddle::experimental::Tensor tmp_tensor; + tmp_tensor.copy_(grad_tensors[i], grad_tensors[i].inner_place(), true); + node_input_buffers_dict[grad_node]->add(input_info.first, + input_info.second, tmp_tensor); + } else { + node_input_buffers_dict[grad_node]->add( + input_info.first, input_info.second, grad_tensors[i]); + } } else { VLOG(6) << "Fill grad input tensor " << i << " with 1.0"; @@ -140,8 +389,9 @@ void RunBackward(const std::vector& tensors, input_info.first, input_info.second, tensor, true /*fill_one=true*/); } - // Prepare queue + // Prepare queue, potential startup_nodes queue.push(grad_node); + potential_startup_nodes.emplace(grad_node); } VLOG(6) << "Update In degree Map for backward"; @@ -149,37 +399,129 @@ void RunBackward(const std::vector& tensors, std::unordered_map node_in_degree_map = getInDegreeMap(queue); + // Get input's GradNodes and InputMeta Info + std::unordered_map + input_target_nodes_inputmeta_map; + 
GetTargetNodesInfo(inputs, &input_target_nodes_inputmeta_map); + + // Purify potential_startup_ops, remove those nodes that are the same as + // input_target_nodes + PurifyPotentialStartUpNodes(&potential_startup_nodes, + &input_target_nodes_inputmeta_map); + + // Get Graph Info Betweent input target gradnode and outputs + // Record the depending_nodes and potential_stop_nodes + std::unordered_map /* father node */> + depending_nodes; + std::unordered_set potential_stop_nodes; + // std::unordered_set startup_ops; + + GetGraphInfoBetweenTargets(queue, &input_target_nodes_inputmeta_map, + &depending_nodes, &potential_stop_nodes, + &potential_startup_nodes); + + // ready_queue store all startup nodes + std::queue ready_queue; + // startup op's indegree should be 0 + for (auto node : potential_startup_nodes) { + if (node_in_degree_map[node] == 0) { + ready_queue.emplace(node); + } + } + + VLOG(1) << " startup_ops' size is :" << ready_queue.size(); + + std::unordered_map results_map; + + // read_queue is empty only when 1.input equals to output. 2.input can not + // reach to output. + if (ready_queue.size() == 0) { + for (auto input_target_node : input_target_nodes_inputmeta_map) { + // out rank_info of forward op + auto rank_info = input_target_node.second->OutRankInfo(); + if (node_input_buffers_dict[input_target_node.first]) { + auto& target_result = + node_input_buffers_dict[input_target_node.first] + ->Buffers()[rank_info.first][rank_info.second]; + // save the target result + results_map[input_target_node.first] = target_result; + } + } + } + /* --- Topological Visit --- */ // 1. Pop queue // 2. Run node + // |- Check and capture target result // |- node(grads) // |- Prepare for next node // 3. Update queue VLOG(6) << "Run Backward"; - while (!queue.empty()) { - GradNodeBase* node = queue.front(); - queue.pop(); + while (!ready_queue.empty()) { + GradNodeBase* node = ready_queue.front(); + VLOG(6) << "Running GradNode:" << node->name(); + ready_queue.pop(); + + paddle::platform::RecordEvent node_record_event( + std::string(typeid(*node).name()) + " grad_node", + paddle::platform::TracerEventType::Operator, 1); // Run node: This is where Hook happens PADDLE_ENFORCE( node_input_buffers_dict.count(node), paddle::platform::errors::Fatal( - "Unable to find next node in the InputBuufer" + "Unable to find next node in the GradTensorHolder \n" "Trying to run Node without configuring its GradTensorHolder")); std::unique_ptr node_input_buffer = std::move(node_input_buffers_dict[node]); - VLOG(6) << "Run Backward Kernel with input_buffer"; + // get target grad_var from node_input_buffer by inputmeta + if (input_target_nodes_inputmeta_map.find(node) != + input_target_nodes_inputmeta_map.end()) { + VLOG(6) << "Get target result by by inputmeta"; + // out rank_info of forward op + auto rank_info = input_target_nodes_inputmeta_map[node]->OutRankInfo(); + // rank_info is a pair, first means slot_id, second means rank. 
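The loop that follows is a Kahn-style topological traversal: the ready queue is seeded with the zero-in-degree startup nodes, and a successor is re-enqueued only when its in-degree drops to zero and it is not a potential stop node. A self-contained sketch of that skeleton with simplified, hypothetical types (gradient hooks, GradTensorHolder bookkeeping and the actual kernel calls are omitted):

#include <queue>
#include <unordered_map>
#include <unordered_set>
#include <vector>

struct Node { std::vector<Node*> next; };  // stand-in for GradNodeBase and its edges

void Traverse(const std::vector<Node*>& startup_nodes,
              std::unordered_map<Node*, int>* in_degree,
              const std::unordered_set<Node*>& potential_stop_nodes) {
  std::queue<Node*> ready_queue;
  for (Node* n : startup_nodes) {
    if ((*in_degree)[n] == 0) ready_queue.push(n);
  }
  while (!ready_queue.empty()) {
    Node* node = ready_queue.front();
    ready_queue.pop();
    // ... run node's grad kernel here, capture any requested target results ...
    for (Node* next : node->next) {
      if (--(*in_degree)[next] == 0 && potential_stop_nodes.count(next) == 0) {
        ready_queue.push(next);
      }
    }
  }
}

Skipping potential_stop_nodes is what lets the partial-grad path avoid running branches of the graph that cannot contribute to the requested inputs.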
+ auto& target_result = + node_input_buffer->Buffers()[rank_info.first][rank_info.second]; + // save the target result + results_map[node] = target_result; + } + + // no_grad_vars + if (no_grad_var_nodes_inputmeta_map.find(node) != + no_grad_var_nodes_inputmeta_map.end()) { + VLOG(6) << "Change the input buffer[slot][rank] by Zeros"; + auto rank_info = no_grad_var_nodes_inputmeta_map[node]->OutRankInfo(); + node_input_buffer->SetBufferSlotRankZeros(rank_info.first, + rank_info.second); + } + + VLOG(6) << "Running GradNode:" << node->name(); + + // check input + EnforceGradNodeHasInput(node); + + VLOG(6) << "Run Backward Kernel with GradTensorHolder"; // Run Pre Backward Node and get outputs std::vector> grad_output_tensors = - (*node)(node_input_buffer->Buffers()); + (*node)(node_input_buffer->Buffers(), create_graph); + + // retain_grad or not + if (!retain_graph) { + VLOG(6) + << "retain_graph is false, need to clear the TensorWrapper of nodes."; + node->ClearTensorWrappers(); + } + // TODO(jiabin): Should we erase it or find a more efficient way. + node_input_buffers_dict.erase(node); // Prepare GradTensorHolder for next node const std::vector>& edges = node->GetEdges(); - PADDLE_ENFORCE(edges.size() == grad_output_tensors.size() || edges.empty(), paddle::platform::errors::Fatal( "Number of edges should be either empty ( for leaf node " @@ -190,6 +532,7 @@ void RunBackward(const std::vector& tensors, for (size_t i = 0; i < edges.size(); i++) { for (size_t j = 0; j < edges[i].size(); j++) { const Edge& edge = edges[i][j]; + auto edge_rank = edge.GetEdgeRankInfo(); // Since we make edge has as same rank as bwd outputs, we indexing them // with @@ -203,6 +546,7 @@ void RunBackward(const std::vector& tensors, grad_output_tensors[i].empty()) { continue; } + PADDLE_ENFORCE_LT( j, grad_output_tensors[i].size(), paddle::platform::errors::Fatal( @@ -215,9 +559,8 @@ void RunBackward(const std::vector& tensors, if ((!grad_output_tensor.defined() || !grad_output_tensor.initialized())) { - VLOG(6) - << "We get grad_output_tensor with slot: " << i << ", rank: " << j - << " as uninitialized or undefined in both tensor and variable"; + VLOG(6) << "We get grad_output_tensor with slot: " << i + << ", rank: " << j << " as uninitialized or undefined tensor"; } VLOG(6) << "Get Edge and grad_output_tensor with slot: " << i << ", rank: " << j @@ -228,6 +571,8 @@ void RunBackward(const std::vector& tensors, const auto& input_meta = next_node->InputMeta(); auto grad_tensor_holder = std::make_unique(input_meta); + VLOG(6) << "Construct GradTensorHolder for grad node: " + << next_node->name(); node_input_buffers_dict[next_node] = std::move(grad_tensor_holder); } VLOG(6) << "Sum grad inputs for edge slot: " << edge_rank.first @@ -237,16 +582,44 @@ void RunBackward(const std::vector& tensors, // Update queue node_in_degree_map[next_node]--; - PADDLE_ENFORCE(node_in_degree_map[next_node] >= 0, - paddle::platform::errors::Fatal( - "Detected in-degree value smaller than zero." - "Node's in-degree cannot be negative")); - if (node_in_degree_map[next_node] == 0) { - queue.emplace(std::move(next_node)); + + PADDLE_ENFORCE( + node_in_degree_map[next_node] >= 0, + paddle::platform::errors::Fatal( + "Detected in-degree value smaller than zero. 
For Node: %s" + "Node's in-degree cannot be negative", + next_node->name())); + + bool is_potential_stop_node = potential_stop_nodes.count(next_node); + + if (node_in_degree_map[next_node] == 0 && !is_potential_stop_node) { + ready_queue.emplace(std::move(next_node)); } } } } + + return GetResults(inputs, &results_map, allow_unused, create_graph); } +void Backward( + const std::vector& tensors, // output + const std::vector& grad_tensors, + bool retain_graph) { + VLOG(6) << "Run in Backward"; + paddle::platform::RecordEvent backward_record_event( + "backward", paddle::platform::TracerEventType::Operator, 1); + RunBackward(tensors, grad_tensors, retain_graph); +} + +std::vector Grad( + const std::vector& tensors, // output + const std::vector& inputs, + const std::vector& grad_tensors, + bool retain_graph, bool create_graph, bool only_inputs, bool allow_unused, + const std::vector& no_grad_vars) { + VLOG(6) << "Run in Grad"; + return RunBackward(tensors, grad_tensors, retain_graph, create_graph, inputs, + allow_unused, no_grad_vars); +} } // namespace egr diff --git a/paddle/fluid/eager/backward.h b/paddle/fluid/eager/backward.h index 2856d9fb87f34b1066bb59eb38bcaee786d2a260..bebe664838e6c1f98219ceee6e6733b49c319b3c 100644 --- a/paddle/fluid/eager/backward.h +++ b/paddle/fluid/eager/backward.h @@ -19,12 +19,20 @@ namespace egr { -// run_backward(): +// Backward(): // tensors corresponds to those lived in the backward graph // each grad_tensors[i] keeps the value for its corresponding tensors[i] -void RunBackward(const std::vector &tensors, - const std::vector &grad_tensors, - bool retain_graph = false); +void Backward(const std::vector& tensors, + const std::vector& grad_tensors, + bool retain_graph = false); + +std::vector Grad( + const std::vector& tensors, + const std::vector& inputs, + const std::vector& grad_tensors = {}, + bool retain_graph = false, bool create_graph = false, + bool only_inputs = false, bool allow_unused = false, + const std::vector& no_grad_vars = {}); // Reserved for gradient() diff --git a/paddle/fluid/eager/custom_operator/CMakeLists.txt b/paddle/fluid/eager/custom_operator/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..ccc9a03a55660772b51dc27bbfa78b7531a369d3 --- /dev/null +++ b/paddle/fluid/eager/custom_operator/CMakeLists.txt @@ -0,0 +1 @@ +cc_library(custom_operator_node SRCS custom_operator_node.cc DEPS phi_tensor phi_api grad_node_info custom_operator op_meta_info) diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.cc b/paddle/fluid/eager/custom_operator/custom_operator_node.cc new file mode 100644 index 0000000000000000000000000000000000000000..72af1cc4b068679e72ae6bdc5e09fab8f56bac04 --- /dev/null +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.cc @@ -0,0 +1,90 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
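The backward.h diff above splits the old RunBackward entry point into Backward() (whole-graph gradient accumulation) and Grad() (partial gradients returned for selected inputs). A hedged usage sketch; the template arguments, which the surrounding text elides, are assumed to be paddle::experimental::Tensor, and loss, x, y stand for tensors that already carry autograd metadata:

// Full backward pass: accumulates gradients onto every reachable leaf tensor.
std::vector<paddle::experimental::Tensor> outs = {loss};
egr::Backward(outs, /*grad_tensors=*/{}, /*retain_graph=*/false);

// Partial pass: returns d(loss)/d(x) and d(loss)/d(y) directly, pruning
// branches that cannot reach the requested inputs.
std::vector<paddle::experimental::Tensor> ins = {x, y};
std::vector<paddle::experimental::Tensor> grads = egr::Grad(
    outs, ins, /*grad_tensors=*/{}, /*retain_graph=*/false,
    /*create_graph=*/false, /*only_inputs=*/false, /*allow_unused=*/false,
    /*no_grad_vars=*/{});

With allow_unused=false, an input that does not appear in the backward graph raises an error instead of yielding an empty result, matching the check in GetResults().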
+ +#include "paddle/fluid/eager/custom_operator/custom_operator_node.h" +#include "paddle/fluid/framework/custom_operator.h" +#include "paddle/fluid/framework/op_meta_info_helper.h" +#include "paddle/phi/api/ext/op_meta_info.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace egr { +std::vector> RunCustomOpNode:: +operator()(const std::vector>& grads, + bool create_graph) { + paddle::CustomOpKernelContext ctx; + auto grad_inputs_name = paddle::framework::OpMetaInfoHelper::GetInputs( + egr::Controller::Instance().GetOpMetaInfoMap().at(op_type_)[1]); + auto grad_outputs_names = paddle::framework::OpMetaInfoHelper::GetOutputs( + egr::Controller::Instance().GetOpMetaInfoMap().at(op_type_)[1]); + auto map = egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type_); + auto kernel_map = egr::Controller::Instance().GetOpMetaInfoMap(); + + std::vector> tmp_ins( + grad_inputs_name.size()); + VLOG(7) << " Prepare Backward inputs of grads with size: " << grads.size() + << ", whose grad_inputs_name size is: " << grad_inputs_name.size(); + for (size_t i = 0; i < grads.size(); i++) { + if (map[1].find(i) != map[1].end()) { + VLOG(7) << "Insert grad: " << i << " to grad_inputs: " << map[1][i]; + tmp_ins[map[1][i]] = grads[i]; + } + } + + for (auto it : fwd_outs) { + VLOG(7) << "Insert fwd_outs to grad_inputs: " << it.first; + tmp_ins[it.first] = RunCustomOpNode::Recover(&(it.second)); + } + + for (auto it : fwd_ins) { + VLOG(7) << "Insert fwd_ins to grad_inputs: " << it.first; + tmp_ins[it.first] = RunCustomOpNode::Recover(&(it.second)); + } + + VLOG(6) << "Prepare Grad inputs"; + for (const auto& in : tmp_ins) { + ctx.EmplaceBackInputs(in); + } + VLOG(6) << "Prepare Grad attrs"; + ctx.EmplaceBackAttrs(attrs_); + std::vector> outs( + GetEdges().size()); + std::vector> tmp_outs( + grad_outputs_names.size()); + VLOG(6) << "Prepare Grad outputs for size: " << grad_outputs_names.size(); + for (size_t i = 0; i < GetEdges().size(); i++) { + if (map[0].find(i) != map[0].end()) { + VLOG(7) << "Insert grad outputs: " << i + << " with size: " << GetEdges()[i].size() + << " to tmp_outputs: " << map[0][i]; + for (size_t j = 0; j < GetEdges()[i].size(); j++) { + outs[i].emplace_back(/* init it incase of copy nullptr of shared_ptr */ + std::make_shared( + phi::DataType::UNDEFINED), + egr::Controller::Instance().GenerateUniqueName( + "custom_tmp_grad")); + } + tmp_outs[map[0][i]] = outs[i]; + } + } + for (size_t i = 0; i < tmp_outs.size(); i++) { + VLOG(7) << "Prepare grad outputs size: " << tmp_outs[i].size(); + ctx.EmplaceBackOutputs(tmp_outs[i]); + } + VLOG(7) << "Run Kernel of Grad Custom Op: " << op_type_; + + (*paddle::framework::OpMetaInfoHelper::GetKernelFn( + kernel_map.at(op_type_)[1]))(&ctx); + return outs; +} +} // namespace egr diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.h b/paddle/fluid/eager/custom_operator/custom_operator_node.h new file mode 100644 index 0000000000000000000000000000000000000000..6ece2658575c795856438904c2716d61f0985879 --- /dev/null +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.h @@ -0,0 +1,83 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
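The operator() above rebuilds a CustomOpKernelContext from saved TensorWrappers and incoming grads, then dispatches the user's grad kernel. For orientation, a heavily hedged sketch of how such a node might be wired up at forward time and invoked later, using only members declared in custom_operator_node.h below; the container element types (elided as template arguments in this text) and the attribute type are assumptions:

// Assumed element types; the surrounding text elides the template arguments.
std::vector<paddle::experimental::Tensor> fwd_inputs;   // captured at forward time
std::vector<paddle::experimental::Tensor> fwd_outputs;  // captured at forward time
std::vector<paddle::any> attrs;                         // assumed attribute container

auto grad_node = std::make_shared<egr::RunCustomOpNode>(
    /*bwd_in_slot_num=*/1, /*bwd_out_slot_num=*/1, "my_custom_op");
grad_node->SetAttrs(attrs);
// TensorWrappers are stored keyed by the slot they fill in the grad op's inputs.
grad_node->fwd_ins[0] = egr::RunCustomOpNode::ConstructTensorWrapper(fwd_inputs);
grad_node->fwd_outs[1] = egr::RunCustomOpNode::ConstructTensorWrapper(fwd_outputs);

// The engine later calls the functor with the incoming grads, e.g.:
// auto grad_outs = (*grad_node)(incoming_grads, /*create_graph=*/false);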
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/hooks.h" +#include "paddle/fluid/eager/tensor_wrapper.h" +#include "paddle/fluid/framework/custom_operator.h" +#include "paddle/utils/any.h" + +namespace egr { +class RunCustomOpNode : public GradNodeBase { + public: + // Constructor: configure fwd input tensors to grad node + explicit RunCustomOpNode(size_t bwd_in_slot_num, size_t bwd_out_slot_num, + const std::string& op_type) + : GradNodeBase(bwd_in_slot_num, bwd_out_slot_num), op_type_(op_type) { + VLOG(6) << "Construct RunCustomOpNode for op: " << op_type; + } + + ~RunCustomOpNode() override { + VLOG(6) << "Destruct RunCustomOpNode for op: " << op_type_; + } + + // Functor: perform backward computations + virtual std::vector> operator()( + const std::vector>& grads, + bool create_graph) override; + + std::string name() { + return paddle::string::Sprintf("RunCustomOpNode: %s_grad", op_type_); + } + + static std::vector ConstructTensorWrapper( + const std::vector& fwd_var) { + std::vector res; + for (auto const& var : fwd_var) { + res.emplace_back(var); + } + return res; + } + + static std::vector Recover( + std::vector* fwd_var) { + std::vector res; + for (size_t i = 0; i < fwd_var->size(); i++) { + res.emplace_back(fwd_var->at(i).recover(nullptr)); + } + return res; + } + + void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + bool IsTensorWrappersCleared() override { + VLOG(6) << "Do nothing here now"; + return false; + } + + void SetAttrs(const std::vector& attr) { attrs_ = attr; } + + public: + std::unordered_map> fwd_outs; + std::unordered_map> fwd_ins; + std::unordered_map grads2grad_in_map; + + private: + std::vector attrs_; + std::string op_type_{""}; +}; + +} // namespace egr diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 427be83c3bbee31eaa0c7e3d26d2d9599b344450..891ad4d8983b5b37b31ab5f5f980e74ccff47069 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -15,17 +15,23 @@ #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/utils.h" + #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/var_type.h" + #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" #include "glog/logging.h" /** - * Implementation of GradNodeBase, Edge and InputBuffer. + * Implementation of GradNodeBase, Edge and GradTensorHolder. 
**/ namespace egr { @@ -33,7 +39,6 @@ GradNodeBase::GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num) { VLOG(6) << "Construct GradNodeBase"; bwd_in_meta_.resize(bwd_in_slot_num); bwd_out_meta_.resize(bwd_out_slot_num); - // adj_edges has the same num as backward outputs adj_edges_.resize(bwd_out_slot_num); } @@ -44,24 +49,20 @@ void GradNodeBase::AddEdges(std::vector* metas, size_t slot_id) { "Given slot id is out of range of adj_edges outter size, " "adj_edges is designed to has the same size of grad " "inputs's slot num.")); - for (const auto& meta : *metas) { + + for (size_t i = 0; i < metas->size(); i++) { + const auto& meta = (*metas)[i]; // adj_edges has as same rank as fwd inputs, and record it's output rank // from // its pre-ops if (meta && !meta->StopGradient()) { auto node = meta->GetMutableGradNode(); - if (node && node.get()) { - VLOG(6) << "Add Edges for slot: " << slot_id - << " which is: " << meta->GetMutableGradNode()->name(); - adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), - meta->OutRankInfo()); - } else { + if (!node || !node.get()) { meta->SetGradNode(std::make_shared(meta)); - VLOG(6) << "Add Edges for slot: " << slot_id - << " which is: " << meta->GetMutableGradNode()->name(); - adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), - meta->OutRankInfo()); } + + adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), + meta->OutRankInfo()); } } } @@ -73,130 +74,205 @@ void GradNodeBase::AddEdges(AutogradMeta* meta, size_t slot_id) { "Given slot id is out of range of adj_edges outter size, " "adj_edges is designed to has the same size of grad " "inputs's slot num.")); + if (meta && !meta->StopGradient()) { auto node = meta->GetMutableGradNode(); - if (node && node.get()) { - VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " - << this->name() << " to " << meta->GetMutableGradNode()->name(); - adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), - meta->OutRankInfo()); - } else { + if (!node || !node.get()) { meta->SetGradNode(std::make_shared(meta)); - VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " - << this->name() << " to " << meta->GetMutableGradNode()->name(); - adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), - meta->OutRankInfo()); } + VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " + << this->name() << " to " << meta->GetMutableGradNode()->name(); + + adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), + meta->OutRankInfo()); } } -const std::vector& GradNodeBase::InputMeta() const { +const std::vector>& GradNodeBase::InputMeta() const { return bwd_in_meta_; } -const std::vector& GradNodeBase::OutputMeta() const { +const std::vector>& GradNodeBase::OutputMeta() const { return bwd_out_meta_; } -void GradNodeBase::SetGradInMeta(std::vector* fwd_out, +void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out, size_t slot_rank) { - size_t slot_size = fwd_out->size(); + auto* fwd_out_meta = egr::EagerUtils::nullable_autograd_meta(fwd_out); PADDLE_ENFORCE_LE( slot_rank, (bwd_in_meta_.size() - 1), paddle::platform::errors::InvalidArgument( "Slot Rank should less equal than bwd_in_meta_ size, since " "bwd_in_meta_ is designed to hold as same num as backward " "inputs.")); - auto& meta = bwd_in_meta_.at(slot_rank); - PADDLE_ENFORCE_EQ(meta.IsInitialized(), false, - paddle::platform::errors::PreconditionNotMet( - "Bwd_in_meta should only be init once, addition " - "initialization for it is forbidden. 
If you got this " - "error, it indicates bugs in framework.")); - // Init stop gradient vector before use to avoid push back - meta.Init(slot_size); - for (size_t i = 0; i < slot_size; i++) { - PADDLE_ENFORCE_NOT_NULL((*fwd_out)[i], - paddle::platform::errors::PreconditionNotMet( - "Bwd_in_meta should only be called while " - "autograd_meta is not null. If you got this " - "error, it indicates bugs in framework.")); - if ((*fwd_out)[i]->StopGradient()) { - // Set Stop Gradient only when its true or non-initialized autograd_meta, - // since all default value is false. - meta.SetStopGradient(i, (*fwd_out)[i]->StopGradient()); + auto& metas = bwd_in_meta_.at(slot_rank); + if (metas.size() == 0) { + metas.resize(1); + } + + auto& meta = metas[0]; + meta.SetStopGradient(fwd_out_meta->StopGradient()); + + // Record TensorMeta + if (phi::DenseTensor::classof(fwd_out.impl().get())) { + // Only Copy Meta + phi::DenseTensor* dense_tensor = + static_cast(fwd_out.impl().get()); + + PADDLE_ENFORCE_NE( + dense_tensor->meta().dtype, phi::DataType::UNDEFINED, + paddle::platform::errors::Fatal( + "Attempting to copy DenseTensorMeta with phi::DataType::UNDEFINED," + "which is illegal.")); + meta.SetTensorMeta(dense_tensor->meta()); + + if (paddle::framework::IsComplexType( + paddle::framework::TransToProtoVarType(dense_tensor->type()))) { + need_complex_to_real_ = true; } + } else { + VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with " + "non-DenseTensor argument."; } } -void GradNodeBase::SetGradInMeta(AutogradMeta* fwd_out, size_t slot_rank) { +void GradNodeBase::SetGradInMeta( + const std::vector& fwd_out, + size_t slot_rank) { + size_t slot_size = fwd_out.size(); PADDLE_ENFORCE_LE( slot_rank, (bwd_in_meta_.size() - 1), paddle::platform::errors::InvalidArgument( "Slot Rank should less equal than bwd_in_meta_ size, since " "bwd_in_meta_ is designed to hold as same num as backward " "inputs.")); - auto& meta = bwd_in_meta_.at(slot_rank); - PADDLE_ENFORCE_EQ(meta.IsInitialized(), false, - paddle::platform::errors::PreconditionNotMet( - "Bwd_in_meta should only be init once, Additional " - "initialization for it is forbidden. If you got this " - "error, it indicates bugs in framework.")); + auto& metas = bwd_in_meta_.at(slot_rank); // Init stop gradient vector before use to avoid push back - VLOG(7) << "Init bwd_in_meta_ with slot rank: " << slot_rank; - meta.Init(1); - meta.SetStopGradient(0, fwd_out->StopGradient()); + if (metas.size() < slot_size) { + VLOG(7) << "Init bwd_in_meta_ with slot rank: " << slot_rank; + metas.resize(slot_size); + } + for (size_t i = 0; i < slot_size; i++) { + auto& meta = metas[i]; + const auto& fwd_out_tensor = fwd_out[i]; + auto* fwd_out_meta = + egr::EagerUtils::nullable_autograd_meta(fwd_out_tensor); + PADDLE_ENFORCE_NOT_NULL(fwd_out_meta, + paddle::platform::errors::PreconditionNotMet( + "Bwd_in_meta should only be called while " + "autograd_meta is not null. If you got this " + "error, it indicates bugs in framework.")); + if (fwd_out_meta->StopGradient()) { + // Set Stop Gradient only when its true or non-initialized autograd_meta, + // since all default value is false. 
+ meta.SetStopGradient(fwd_out_meta->StopGradient()); + } + + // Record TensorMeta + if (phi::DenseTensor::classof(fwd_out_tensor.impl().get())) { + // Only Copy Meta + phi::DenseTensor* dense_tensor = + static_cast(fwd_out_tensor.impl().get()); + + PADDLE_ENFORCE_NE( + dense_tensor->meta().dtype, phi::DataType::UNDEFINED, + paddle::platform::errors::Fatal("Attempting to copy DenseTensorMeta " + "with phi::DataType::UNDEFINED," + "which is illegal.")); + meta.SetTensorMeta(dense_tensor->meta()); + if (paddle::framework::IsComplexType( + paddle::framework::TransToProtoVarType(dense_tensor->type()))) { + need_complex_to_real_ = true; + } + } else { + VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta " + "with non-DenseTensor argument."; + } + } } -void GradNodeBase::SetGradOutMeta(std::vector* fwd_in, +void GradNodeBase::SetGradOutMeta(const paddle::experimental::Tensor& fwd_in, size_t slot_rank) { - size_t slot_size = fwd_in->size(); + auto* fwd_in_meta = egr::EagerUtils::nullable_autograd_meta(fwd_in); PADDLE_ENFORCE_LE( - slot_rank, (bwd_out_meta_.size() - 1), + (slot_rank + 1), bwd_out_meta_.size(), paddle::platform::errors::InvalidArgument( "Slot Rank should less equal than bwd_out_meta_ size, " "since bwd_out_meta_ is designed to hold as same num as " "backward outputs.")); - auto& meta = bwd_out_meta_.at(slot_rank); - PADDLE_ENFORCE_EQ(meta.IsInitialized(), false, - paddle::platform::errors::PreconditionNotMet( - "Bwd_out_meta should only be init once. Additional " - "initialization for it is forbidden. If you got this " - "error, it indicates bugs in framework.")); + auto& metas = bwd_out_meta_.at(slot_rank); // Init stop gradient vector before use to avoid push back - meta.Init(slot_size); - for (size_t i = 0; i < slot_size; i++) { - if (!(*fwd_in)[i]) { - meta.SetStopGradient(i, true); - continue; - } - if ((*fwd_in)[i]->StopGradient()) { - // Set Stop Gradient only when its true or non-initialized autograd_meta, - // since all default value is false. - meta.SetStopGradient(i, (*fwd_in)[i]->StopGradient()); + if (metas.size() == 0) { + metas.resize(1); + } + auto& meta = metas[0]; + if (fwd_in_meta) { + meta.SetStopGradient(fwd_in_meta->StopGradient()); + } else { + meta.SetStopGradient(true); + } + + // Record TensorMeta + if (fwd_in.impl() && fwd_in.impl().get()) { + if (phi::DenseTensor::classof(fwd_in.impl().get())) { + // Only Copy Meta + phi::DenseTensor* dense_tensor = + static_cast(fwd_in.impl().get()); + PADDLE_ENFORCE_NE( + dense_tensor->meta().dtype, phi::DataType::UNDEFINED, + paddle::platform::errors::Fatal("Attempting to copy DenseTensorMeta " + "with phi::DataType::UNDEFINED," + "which is illegal.")); + meta.SetTensorMeta(dense_tensor->meta()); } + } else { + VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with " + "non-DenseTensor argument."; } } -void GradNodeBase::SetGradOutMeta(AutogradMeta* fwd_in, size_t slot_rank) { +void GradNodeBase::SetGradOutMeta( + const std::vector& fwd_in, size_t slot_rank) { + size_t slot_size = fwd_in.size(); PADDLE_ENFORCE_LE( - (slot_rank + 1), bwd_out_meta_.size(), + slot_rank, (bwd_out_meta_.size() - 1), paddle::platform::errors::InvalidArgument( "Slot Rank should less equal than bwd_out_meta_ size, " "since bwd_out_meta_ is designed to hold as same num as " "backward outputs.")); - auto& meta = bwd_out_meta_.at(slot_rank); - PADDLE_ENFORCE_EQ(meta.IsInitialized(), false, - paddle::platform::errors::PreconditionNotMet( - "Bwd_out_meta should only be init once. 
Additional " - "initialization for it is forbidden. If you got this " - "error, it indicates bugs in framework.")); + auto& metas = bwd_out_meta_.at(slot_rank); // Init stop gradient vector before use to avoid push back - meta.Init(1); - if (fwd_in) { - meta.SetStopGradient(0, fwd_in->StopGradient()); - } else { - meta.SetStopGradient(0, true); + if (metas.size() < slot_size) { + metas.resize(slot_size); + } + for (size_t i = 0; i < slot_size; i++) { + const auto& fwd_in_tensor = fwd_in[i]; + auto& meta = metas[i]; + auto* fwd_in_meta = egr::EagerUtils::nullable_autograd_meta(fwd_in_tensor); + if (fwd_in_meta) { + // Set Stop Gradient only when its true or non-initialized autograd_meta, + // since all default value is false. + meta.SetStopGradient(fwd_in_meta->StopGradient()); + } + + // Record TensorMeta + if (fwd_in_tensor.impl() && fwd_in_tensor.impl().get()) { + if (phi::DenseTensor::classof(fwd_in_tensor.impl().get())) { + // Only Copy Meta + phi::DenseTensor* dense_tensor = + static_cast(fwd_in_tensor.impl().get()); + + PADDLE_ENFORCE_NE(dense_tensor->meta().dtype, phi::DataType::UNDEFINED, + paddle::platform::errors::Fatal( + "Attempting to copy DenseTensorMeta with " + "phi::DataType::UNDEFINED," + "which is illegal.")); + meta.SetTensorMeta(dense_tensor->meta()); + } + } else { + VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta " + "with non-DenseTensor argument."; + } } } @@ -207,12 +283,8 @@ void GradNodeBase::SetDefaultGradInOutMeta() { "meta setter, other size of inputs and outputs should " "create with Setter and Getters")); // Default stop_gradient is false and slot id is 0, slot size is 1; - bwd_out_meta_[0].Init(1); - bwd_in_meta_[0].Init(1); -} - -const std::vector>& GradNodeBase::GetEdges() const { - return adj_edges_; + bwd_out_meta_[0].resize(1); + bwd_in_meta_[0].resize(1); } int64_t GradNodeBase::RegisterGradientHook( @@ -222,6 +294,10 @@ int64_t GradNodeBase::RegisterGradientHook( return next_hook_id_++; } +const std::vector>& GradNodeBase::GetEdges() const { + return adj_edges_; +} + std::vector> GradNodeBase::ApplyGradientHooks( const std::vector>& tensors) { @@ -270,4 +346,45 @@ GradNodeBase::ApplyGradientHooks( return outs; } +void GradNodeBase::HandleComplexGradToRealGrad( + std::vector>* out_grads) { + for (size_t slot_id = 0; slot_id < out_grads->size(); slot_id++) { + const std::vector& slot_out_grads = + (*out_grads)[slot_id]; + for (size_t rank_id = 0; rank_id < slot_out_grads.size(); rank_id++) { + const GradSlotMeta& slot_meta = bwd_out_meta_[slot_id][rank_id]; + + PADDLE_ENFORCE( + slot_meta.HasTensorMeta() > 0, + paddle::platform::errors::Fatal( + "We require TensorMeta in GradInputMeta() to obtain forward data " + "types." 
+ "However, no TensorMeta is detected in bwd_out_meta_.")); + + auto fwd_data_type = paddle::framework::TransToProtoVarType( + slot_meta.GetTensorMeta().dtype); + const paddle::experimental::Tensor& grad = slot_out_grads[rank_id]; + + if (paddle::framework::IsComplexType(fwd_data_type)) continue; + + // Only Handle Complex To Real for DenseTensor for now + if (phi::DenseTensor::classof(grad.impl().get())) { + phi::DenseTensor* grad_dense_tensor = + static_cast(grad.impl().get()); + + auto curr_data_type = + paddle::framework::TransToProtoVarType(grad_dense_tensor->type()); + if (!paddle::framework::IsComplexType(curr_data_type)) continue; + + // Convert Complex GradOut to Real + auto out = std::make_shared(); + paddle::framework::TransComplexToReal(fwd_data_type, curr_data_type, + *grad_dense_tensor, out.get()); + + (*out_grads)[slot_id][rank_id].set_impl(out); + } + } + } +} + } // namespace egr diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index 16513f05e0777a8e57f54c925d68867dda656612..4b21a193ee021f06538e1a11bbffb898376739a7 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -57,21 +57,28 @@ class AutogradMeta; class GradSlotMeta { public: GradSlotMeta() = default; - void Init(size_t size) { - size_ = static_cast(size); - stop_gradient_.resize(size, false); + bool IsStopGradient() const { return stop_gradient_; } + void SetStopGradient(bool stop_gradient = true) { + stop_gradient_ = stop_gradient; } - bool IsInitialized() const { return size_ != -1; } - bool IsStopGradient(size_t rank) const { return stop_gradient_[rank]; } - int Size() const { return size_; } - void SetStopGradient(size_t rank, bool stop_gradient = true) { - stop_gradient_.at(rank) = stop_gradient; + void SetTensorMeta(const phi::DenseTensorMeta& meta) { + meta_ = std::make_shared(meta); + } + bool HasTensorMeta() const { return meta_ && meta_.get(); } + const phi::DenseTensorMeta& GetTensorMeta() const { + if (!HasTensorMeta()) { + PADDLE_THROW(paddle::platform::errors::Fatal( + "meta_ of GradSlotMeta has not been initialized yet." + "You're expected to check Edge availability with HasTensorMeta()" + "before calling GetTensorMeta() interface.")); + } + return *meta_.get(); } private: - int size_{-1}; - std::vector stop_gradient_{false}; + bool stop_gradient_{false}; + std::shared_ptr meta_ = nullptr; }; class GradNodeBase { @@ -95,8 +102,12 @@ class GradNodeBase { * is better choice to fit this format. * **/ virtual std::vector> operator()( - const std::vector>& grads) = 0; + const std::vector>& grads, + bool create_graph = false) = 0; + + virtual void ClearTensorWrappers() = 0; + virtual bool IsTensorWrappersCleared() = 0; /** * AddEdges is designed to set input tensors' backward Node as current * node's Edges. @@ -108,25 +119,30 @@ class GradNodeBase { void AddEdges(std::vector* metas, size_t slot_id); void AddEdges(AutogradMeta* meta, size_t slot_id); - /** - * GetEdges is designed to get all edges of current node**/ - const std::vector>& GetEdges() const; + // adj_edges were moved inside OutputMeta(), so no available direct access + // from GradNodeBase. 
+ // To access Edges, get GradSlotMeta by calling OutputMeta(), then use + // slot_meta.GetEdge() /** * Get Input Meta of current Grad node**/ - const std::vector& InputMeta() const; + const std::vector>& InputMeta() const; /** * Get Output Meta of current Grad node**/ - const std::vector& OutputMeta() const; + const std::vector>& OutputMeta() const; /** * Set bwd ins and outs info with forward vars * **/ - void SetGradInMeta(std::vector* fwd_out, size_t slot_rank); - void SetGradInMeta(AutogradMeta* fwd_out, size_t slot_rank); + void SetGradInMeta(const std::vector& fwd_out, + size_t slot_rank); + void SetGradInMeta(const paddle::experimental::Tensor& fwd_out, + size_t slot_rank); - void SetGradOutMeta(std::vector* fwd_in, size_t slot_rank); - void SetGradOutMeta(AutogradMeta* fwd_in, size_t slot_rank); + void SetGradOutMeta(const std::vector& fwd_in, + size_t slot_rank); + void SetGradOutMeta(const paddle::experimental::Tensor& fwd_in, + size_t slot_rank); /** * Default setters for Grad in/out meta this should be used for same special @@ -158,11 +174,21 @@ class GradNodeBase { std::vector> ApplyGradientHooks( const std::vector>& tensors); + /** + * Handle Complex - Real Type Promotion + * **/ + void HandleComplexGradToRealGrad( + std::vector>* out_grads); + bool NeedComplexToRealConversion() { return need_complex_to_real_; } + virtual std::string name() { return "GradNodeBase"; } - private: - // TODO(jiabin): Use SmallVector instead after merge PR from develop + /** + * GetEdges is designed to get all edges of current node**/ + const std::vector>& GetEdges() const; + private: + // TODO(zhanlve): Merge adj_edges_ into GradOutMeta // Edges recorded the backward related node info, which indicate all edges // linked // by this Grad Node. @@ -170,10 +196,10 @@ class GradNodeBase { std::vector> adj_edges_; // bwd_out_meta_ is used to record Grad output info for backward - std::vector bwd_out_meta_; + std::vector> bwd_out_meta_; // bwd_in_meta_ used to record Grad input info for backward - std::vector bwd_in_meta_; + std::vector> bwd_in_meta_; // Gradient Hooks // Customer may register a list of hooks which will be called in order during // backward @@ -184,6 +210,8 @@ class GradNodeBase { /* hook */ std::shared_ptr>> gradient_hooks_; + // We handle complex to real conversion only if any complex GradIn is involved + bool need_complex_to_real_ = false; int64_t next_hook_id_{0}; }; diff --git a/paddle/fluid/eager/grad_tensor_holder.cc b/paddle/fluid/eager/grad_tensor_holder.cc index 69fc7df2f1420382735cf59fbe85f7e2207d0f77..163d25e85ce8c085087331c6e3273075aed5e5f4 100644 --- a/paddle/fluid/eager/grad_tensor_holder.cc +++ b/paddle/fluid/eager/grad_tensor_holder.cc @@ -21,6 +21,11 @@ namespace egr { +void GradTensorHolder::SetBufferSlotRankZeros(size_t slot_id, size_t rank) { + buffer_[slot_id][rank] = + paddle::experimental::zeros_like(buffer_[slot_id][rank]); +} + void GradTensorHolder::add(size_t slot_id, size_t rank, const paddle::experimental::Tensor& t, bool fill_one) { diff --git a/paddle/fluid/eager/grad_tensor_holder.h b/paddle/fluid/eager/grad_tensor_holder.h index d66a81fe8285980bad4159d5414985dc9c744549..8c00f9161b629f7a3f093a1225d3d5b0b9bcca8b 100644 --- a/paddle/fluid/eager/grad_tensor_holder.h +++ b/paddle/fluid/eager/grad_tensor_holder.h @@ -26,12 +26,13 @@ namespace egr { * GradTensorHolder should have as same format as forward output **/ class GradTensorHolder { public: - explicit GradTensorHolder(const std::vector& meta) { - VLOG(7) << "Init GradTensorHolder with meta size: " << 
meta.size(); - buffer_.resize(meta.size()); + explicit GradTensorHolder( + const std::vector>& metas) { + VLOG(7) << "Init GradTensorHolder with meta size: " << metas.size(); + buffer_.resize(metas.size()); for (size_t i = 0; i < buffer_.size(); i++) { - VLOG(7) << "Init GradTensorHolder with meta rank: " << meta[i].Size(); - buffer_[i].resize(meta[i].Size()); + VLOG(7) << "Init GradTensorHolder with meta rank: " << metas[i].size(); + buffer_[i].resize(metas[i].size()); } } @@ -56,6 +57,8 @@ class GradTensorHolder { return buffer_; } + void SetBufferSlotRankZeros(size_t slot_id, size_t rank); + private: std::vector> buffer_; }; diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index 31aaa93c41643f565836c536d7001c01d2a0826d..8da27f3bb8a13a759bd12737746ce6add4b1aaa5 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -36,6 +36,15 @@ class TensorWrapper { explicit TensorWrapper(const paddle::experimental::Tensor& tensor, bool full_reserved = false, bool no_need_buffer = false) { + // set inplace_version_snapshot_ according to tensor's current inplace + // version. + if (tensor.impl() && phi::DenseTensor::classof(tensor.impl().get())) { + phi::DenseTensor* dense_tensor = + static_cast(tensor.impl().get()); + auto& inplace_version_counter = dense_tensor->InplaceVersionCounter(); + inplace_version_snapshot_ = inplace_version_counter.CurrentVersion(); + } + /** * Normally, we should fully reserved all non-output or non-leaf fwd tensor * here. And for fwd output tensor, we should not reserve its autogradmeta, @@ -49,6 +58,7 @@ class TensorWrapper { } // shallow copy tensor_impl here + no_need_buffer_ = no_need_buffer; if (no_need_buffer) { if (phi::DenseTensor::classof(tensor.impl().get())) { // Only Copy Meta @@ -86,6 +96,7 @@ class TensorWrapper { // if it's full_reserved just return the full copy of tensor if (full_reserved_) { + check_inplace_version(); return intermidiate_tensor_; } else { std::shared_ptr new_grad_node = grad_node; @@ -94,13 +105,52 @@ class TensorWrapper { intermidiate_tensor_.set_autograd_meta( std::static_pointer_cast( p_ab_autograd_meta)); + check_inplace_version(); return intermidiate_tensor_; } } + void check_inplace_version() { + if (no_need_buffer_) { + VLOG(6) << "There's no need to check inplace_version because " + "no_need_buffer_ is true."; + return; + } + if (intermidiate_tensor_.impl() && + phi::DenseTensor::classof(intermidiate_tensor_.impl().get())) { + phi::DenseTensor* dense_tensor = + static_cast(intermidiate_tensor_.impl().get()); + auto& inplace_version_counter = dense_tensor->InplaceVersionCounter(); + + uint32_t current_inplace_version = + inplace_version_counter.CurrentVersion(); + PADDLE_ENFORCE_EQ( + current_inplace_version, inplace_version_snapshot_, + paddle::platform::errors::PermissionDenied( + "Tensor '%s' used in gradient computation has been " + "modified by an inplace operation. " + "Its version is %d but the expected version is %d. 
" + "Please fix your code to void calling an inplace operator " + "after using the Tensor which will used in gradient " + "computation.", + intermidiate_tensor_.name(), current_inplace_version, + inplace_version_snapshot_)); + VLOG(6) << " The inplace_version_snapshot_ of Tensor '" + << intermidiate_tensor_.name() << "' is [ " + << inplace_version_snapshot_ << " ]"; + VLOG(6) << " The current_inplace_version of Tensor '" + << intermidiate_tensor_.name() << "' is [ " + << current_inplace_version << " ]"; + } + } + + void clear() { intermidiate_tensor_.reset(); } + private: bool full_reserved_ = false; + bool no_need_buffer_ = false; std::pair out_rank_info_; paddle::experimental::Tensor intermidiate_tensor_; + uint32_t inplace_version_snapshot_ = 0; }; } // namespace egr diff --git a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc index 1683f4ed5fbe5e4b014e9b369e0231d149c187f1..c8b2d22dcf95139db47704be86a6f64554f7c0ba 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc @@ -17,6 +17,14 @@ #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/phi/api/lib/utils/allocator.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy_sr, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy_sr, GPU, ALL_LAYOUT); +#endif namespace eager_test { using AbstractAutogradMeta = paddle::experimental::AbstractAutogradMeta; @@ -151,5 +159,50 @@ TEST(EagerVariable, Constructor) { CHECK_EQ(dt3_tmp_ptr[1], 10.0f); t4.reset(); CHECK(t4.defined() == false); + + VLOG(6) << "Check Tensor Copy_"; + std::vector rows = {1, 2}; + std::vector dims = {2}; + paddle::experimental::Tensor t7(std::make_shared(rows, 2)); + std::dynamic_pointer_cast(t7.impl()) + ->mutable_value() + ->Resize(phi::make_ddim(dims)); + auto* dt7_tmp_ptr = std::dynamic_pointer_cast(t7.impl()) + ->mutable_value() + ->mutable_data(paddle::platform::CPUPlace()); + dt7_tmp_ptr[0] = 6.0f; + dt7_tmp_ptr[1] = 11.0f; + + paddle::experimental::Tensor t8; + paddle::experimental::Tensor t5; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + paddle::experimental::Tensor t6; + paddle::experimental::Tensor t9; + VLOG(6) << "Check Tensor Copy_ Selected Rows"; + t8.copy_(t7, paddle::platform::CUDAPlace(0), true); + t9.copy_(t8, paddle::platform::CPUPlace(), true); + auto* dt9_tmp_ptr = std::dynamic_pointer_cast(t9.impl()) + ->value() + .data(); + CHECK_EQ(dt9_tmp_ptr[0], 6.0f); + CHECK_EQ(dt9_tmp_ptr[1], 11.0f); + CHECK_EQ(std::dynamic_pointer_cast(t9.impl())->height(), + 2); + + VLOG(6) << "Check Tensor Copy_ Dense Tensor"; + t5.copy_(t3, paddle::platform::CUDAPlace(0), true); + t6.copy_(t5, paddle::platform::CPUPlace(), true); + auto* dt6_tmp_ptr = + std::dynamic_pointer_cast(t6.impl())->data(); + CHECK_EQ(dt6_tmp_ptr[0], 5.0f); + CHECK_EQ(dt6_tmp_ptr[1], 10.0f); +#else + t5.copy_(t3, paddle::platform::CPUPlace(), true); + auto* dt5_tmp_ptr = + std::dynamic_pointer_cast(t5.impl())->data(); + CHECK_EQ(dt5_tmp_ptr[0], 5.0f); + CHECK_EQ(dt5_tmp_ptr[1], 10.0f); +#endif + VLOG(6) << "Finish"; } diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc index 
e3db309c4016a512c5379fb352beb4af690a271e..d592b5ccf66ffc8532214a72612e9308b7e51fe5 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + #include "glog/logging.h" #include "gtest/gtest.h" @@ -23,14 +24,9 @@ TEST(GradNodeInfo, GradSlotMeta) { auto grad_slot = egr::GradSlotMeta(); - CHECK(grad_slot.IsInitialized() == false); - VLOG(6) << "Init GradSlotMeta"; - grad_slot.Init(2); - CHECK(grad_slot.IsInitialized() == true); VLOG(6) << "Set SetStopGradient"; - grad_slot.SetStopGradient(0); - CHECK(grad_slot.IsStopGradient(0) == true); - CHECK_EQ(grad_slot.Size(), 2); + grad_slot.SetStopGradient(); + CHECK(grad_slot.IsStopGradient() == true); } void TestGradNodeBase(bool is_remove_gradient_hook) { @@ -56,18 +52,22 @@ void TestGradNodeBase(bool is_remove_gradient_hook) { ->data()[0], 6.0f); VLOG(6) << "Test Add Edges"; - egr::Edge edge0(grad_test_node1, 1, 2); - auto auto_grad0 = std::make_shared(edge0); + egr::Edge tmp_edge0(grad_test_node1, 1, 2); + auto auto_grad0 = std::make_shared(tmp_edge0); auto_grad0->SetStopGradient(false); - egr::Edge edge1(grad_test_node1, 3, 4); - auto auto_grad1 = std::make_shared(edge1); + + egr::Edge tmp_edge1(grad_test_node1, 3, 4); + auto auto_grad1 = std::make_shared(tmp_edge1); + et1.set_autograd_meta(auto_grad1); auto_grad1->SetStopGradient(false); grad_test_node0->AddEdges(auto_grad0.get(), 0); + CHECK_EQ(grad_test_node0->GetEdges()[0][0].GetEdgeRankInfo().first, size_t(1)); CHECK_EQ(grad_test_node0->GetEdges()[0][0].GetEdgeRankInfo().second, size_t(2)); std::vector metas = {auto_grad1.get()}; + grad_test_node0->AddEdges(&metas, 1); CHECK_EQ(grad_test_node0->GetEdges()[1][0].GetEdgeRankInfo().first, size_t(3)); @@ -76,22 +76,30 @@ void TestGradNodeBase(bool is_remove_gradient_hook) { VLOG(6) << "Test Set Meta and Get Meta"; auto_grad1->SetStopGradient(true); - grad_test_node0->SetGradInMeta(&metas, 0); - grad_test_node0->SetGradInMeta(auto_grad1.get(), 1); - grad_test_node0->SetGradOutMeta(&metas, 0); - grad_test_node0->SetGradOutMeta(auto_grad1.get(), 1); - CHECK_EQ(grad_test_node0->InputMeta()[0].Size(), 1); - CHECK_EQ(grad_test_node0->InputMeta()[1].Size(), 1); - CHECK(grad_test_node0->OutputMeta()[0].IsStopGradient(0)); - CHECK(grad_test_node0->OutputMeta()[1].IsStopGradient(0)); + grad_test_node0->SetGradInMeta(et1, 0); + grad_test_node0->SetGradInMeta({et1}, 1); + grad_test_node0->SetGradOutMeta(et1, 0); + grad_test_node0->SetGradOutMeta({et1}, 1); + CHECK_EQ(grad_test_node0->InputMeta()[0].size(), size_t(1)); + CHECK_EQ(grad_test_node0->InputMeta()[1].size(), size_t(1)); + CHECK_EQ(grad_test_node0->InputMeta()[0][0].GetTensorMeta().dtype, + meta.dtype); + CHECK_EQ(grad_test_node0->InputMeta()[1][0].GetTensorMeta().dtype, + meta.dtype); + CHECK(grad_test_node0->OutputMeta()[0][0].IsStopGradient()); + CHECK(grad_test_node0->OutputMeta()[1][0].IsStopGradient()); + CHECK_EQ(grad_test_node0->OutputMeta()[0][0].GetTensorMeta().dtype, + meta.dtype); + CHECK_EQ(grad_test_node0->OutputMeta()[1][0].GetTensorMeta().dtype, + meta.dtype); VLOG(6) << "Test Default Set Meta and Get Meta"; auto grad_test_node2 = std::make_shared( /* val */ 5.0, /* in_num */ 1, /* out_num */ 1); grad_test_node2->SetDefaultGradInOutMeta(); - 
CHECK(grad_test_node2->OutputMeta()[0].IsInitialized()); - CHECK(grad_test_node2->OutputMeta()[0].IsStopGradient(0) == false); - CHECK_EQ(grad_test_node2->OutputMeta()[0].Size(), 1); + CHECK_GT(grad_test_node2->OutputMeta()[0].size(), size_t(0)); + CHECK(grad_test_node2->OutputMeta()[0][0].IsStopGradient() == false); + CHECK_EQ(grad_test_node2->OutputMeta()[0].size(), size_t(1)); VLOG(6) << "Test Gradient Hook"; auto gradient_hook = []( @@ -135,7 +143,17 @@ TEST(GradNodeInfo, GradNodeBase) { } TEST(GradNodeInfo, Edge) { + phi::DenseTensorMeta meta = + phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({1, 1})); + std::shared_ptr dt = std::make_shared( + std::make_unique( + paddle::platform::CPUPlace()) + .get(), + meta); + paddle::experimental::Tensor et1(dt); + auto grad_test_node0 = std::make_shared(5, 2, 2); + auto auto_grad1 = std::make_shared(); VLOG(6) << "Test Construct Edge"; egr::Edge edge0 = egr::Edge(); CHECK(edge0.IsInitialized() == false); @@ -145,13 +163,12 @@ TEST(GradNodeInfo, Edge) { egr::Edge(grad_test_node0, std::make_pair(size_t(1), size_t(0))); VLOG(6) << "Test Set Edge's Grad Node"; auto* grad_node = edge1.GetGradNode(); + et1.set_autograd_meta(auto_grad1); + grad_node->SetGradInMeta(et1, 0); + CHECK_EQ(grad_node->InputMeta().size(), size_t(2)); - auto mt_grad_node = edge1.GetMutableGradNode(); - auto auto_grad1 = std::make_shared(); std::vector metas = {auto_grad1.get()}; - // Uninitialized AutogradMeta indicates - mt_grad_node->SetGradInMeta(&metas, 0); - CHECK(grad_node->InputMeta()[0].IsStopGradient(0) == true); + CHECK(grad_node->InputMeta()[0][0].IsStopGradient() == true); VLOG(6) << "Test Get/Set Edge Rank Info"; CHECK_EQ(edge2.GetEdgeRankInfo().first, size_t(1)); CHECK_EQ(edge2.GetEdgeRankInfo().second, size_t(0)); diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h index 535c93ac53b1751d9634476e47f32dc0cbe22708..0b167203735d65683b0f978fa34fe7f457aae4f2 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h @@ -32,8 +32,8 @@ class GradTestNode : public egr::GradNodeBase { GradTestNode() : GradNodeBase() { val_ = 1.0; } std::string name() override { return "GradTestNode"; } std::vector> operator()( - const std::vector>& grads) - override { + const std::vector>& grads, + bool create_graph = false) override { val_ = std::dynamic_pointer_cast(grads[0][0].impl()) ->data()[0]; phi::DenseTensorMeta meta = @@ -49,6 +49,11 @@ class GradTestNode : public egr::GradNodeBase { std::vector> res = {{et1}}; return res; } + void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + bool IsTensorWrappersCleared() override { + VLOG(6) << "Do nothing here now"; + return false; + } float val_; }; } // namespace eager_test diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc index 384fdcd6f97c4b318341db68cdd88b644d42d22a..645eac06ddda519bba952abb460571c9667c6d4a 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc @@ -30,8 +30,7 @@ PD_DECLARE_KERNEL(full_like, CPU, ALL_LAYOUT); using namespace egr; // NOLINT TEST(GradTensorHolder, Constructor) { - GradSlotMeta slot_meta; - slot_meta.Init(1); + std::vector slot_meta(1); GradTensorHolder grad_tensor_holder = 
GradTensorHolder({slot_meta}); GradTensorHolder grad_tensor_holder2 = GradTensorHolder(grad_tensor_holder); @@ -72,8 +71,7 @@ TEST(GradTensorHolder, Interfaces) { paddle::experimental::Tensor et1 = paddle::experimental::Tensor(dt1); // Constructor empty GradTensorHolder - GradSlotMeta slot_meta; - slot_meta.Init(1); + std::vector slot_meta(1); GradTensorHolder grad_tensor_holder = GradTensorHolder({slot_meta, slot_meta}); @@ -138,8 +136,7 @@ TEST(GradTensorHolder, SelectedRowsMergeAdd) { paddle::experimental::Tensor t2(sr2); // Constructor empty GradTensorHolder - GradSlotMeta slot_meta; - slot_meta.Init(1); + std::vector slot_meta(1); GradTensorHolder grad_tensor_holder = GradTensorHolder({slot_meta, slot_meta}); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc index af365322e606ebfaecb7233751cacc6aa1aac423..056c7102f663b93d215e494908d9c95be832068c 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc @@ -40,6 +40,8 @@ PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum_grad, CPU, ALL_LAYOUT); using namespace egr; // NOLINT using namespace egr_utils_api; // NOLINT @@ -80,6 +82,47 @@ TEST(Benchmark, EagerScaleCPU) { } } +TEST(Benchmark, EagerMatmulCPU) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + for (const std::string& mode : {"Accuracy", "Performance"}) { + paddle::framework::DDim ddimX = phi::make_ddim({2, 2}); + paddle::experimental::Tensor X = CreateTensorWithValue( + ddimX, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0, true); + RetainGradForTensor(X); + + paddle::framework::DDim ddimY = phi::make_ddim({2, 2}); + paddle::experimental::Tensor Y = CreateTensorWithValue( + ddimY, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 2.0, true); + RetainGradForTensor(Y); + + if (mode == "Accuracy") { + benchmark_eager_matmul(X, Y, true /* accuracy_check */); + + } else if (mode == "Performance") { + auto t_start = std::chrono::high_resolution_clock::now(); +#ifdef WITH_GPERFTOOLS + ProfilerStart("eager_matmul_cpu.out"); +#endif + benchmark_eager_matmul(X, Y); + +#ifdef WITH_GPERFTOOLS + ProfilerStop(); +#endif + auto t_end = std::chrono::high_resolution_clock::now(); + double elapsed_time_ms = + std::chrono::duration(t_end - t_start).count(); + std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl; + + } else { + PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode")); + } + } +} + TEST(Benchmark, EagerIntermediateMatmulCPU) { // Prepare Device Contexts eager_test::InitEnv(paddle::platform::CPUPlace()); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc index 5b75f1242e69bc5b37dd97467b7c55bfc6bc3871..5e790389819f53b250db8797c7a8b3466818abfb 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc @@ -44,6 +44,8 @@ PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add_grad, GPU, 
ALL_LAYOUT); +PD_DECLARE_KERNEL(sum, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT); TEST(Benchmark, EagerScaleCUDA) { eager_test::InitEnv(paddle::platform::CUDAPlace()); @@ -82,6 +84,50 @@ TEST(Benchmark, EagerScaleCUDA) { } } +TEST(Benchmark, EagerMatmulCUDA) { + paddle::platform::CUDAPlace place; + eager_test::InitEnv(place); + + for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { + paddle::framework::DDim ddimX = phi::make_ddim({2, 2}); + paddle::experimental::Tensor X = CreateTensorWithValue( + ddimX, paddle::platform::CUDAPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0, true); + RetainGradForTensor(X); + + paddle::framework::DDim ddimY = phi::make_ddim({2, 2}); + paddle::experimental::Tensor Y = CreateTensorWithValue( + ddimY, paddle::platform::CUDAPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 2.0, true); + RetainGradForTensor(Y); + + if (mode == "Accuracy") { + benchmark_eager_matmul(X, Y, true /* accuracy_check */); + + } else if (mode == "WarmUp") { + benchmark_eager_matmul(X, Y); + + } else if (mode == "Performance") { + auto t_start = std::chrono::high_resolution_clock::now(); +#ifdef WITH_GPERFTOOLS + ProfilerStart("eager_matmul_cuda.out"); +#endif + benchmark_eager_matmul(X, Y); + +#ifdef WITH_GPERFTOOLS + ProfilerStop(); +#endif + auto t_end = std::chrono::high_resolution_clock::now(); + double elapsed_time_ms = + std::chrono::duration(t_end - t_start).count(); + std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl; + + } else { + PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode")); + } + } +} + TEST(Benchmark, EagerIntermediateMatmulCUDA) { paddle::platform::CUDAPlace place; eager_test::InitEnv(place); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc index a9d297c1c64f7b64373237a0500802a5c883aedd..b4b47a85f66662347d5e087cd4391979fb6c4250 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc @@ -41,6 +41,8 @@ PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum_grad, CPU, ALL_LAYOUT); namespace paddle { namespace imperative { diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc index bd9eaa09ca9a406da943c8a0b0f37b674d5ea3c2..a3e393b039425e506066b485bc8a8688bff20d96 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc @@ -43,6 +43,8 @@ PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT); namespace paddle { namespace imperative { diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc index 96126fa5466aace442dfb742f9902539916b853e..c8fb6050e9d450d598ea722ac74da924e8857f0e 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc @@ 
-28,6 +28,7 @@ #include "paddle/fluid/eager/utils.h" // Eager Generated +#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" // Fluid @@ -36,7 +37,7 @@ #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/memory/memcpy.h" -static size_t max_num_benchmark_runs = 5000; +static size_t max_num_benchmark_runs = 4000; namespace egr { @@ -57,7 +58,7 @@ void benchmark_eager_scale(const paddle::experimental::Tensor& tensor, } std::vector target_tensors = {input_tensor}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); if (accuracy_check) { // Examine Forward Grad (w.r.t max_num_runs = 10) @@ -67,6 +68,29 @@ void benchmark_eager_scale(const paddle::experimental::Tensor& tensor, } } +void benchmark_eager_matmul(const paddle::experimental::Tensor& X, + const paddle::experimental::Tensor& Y, + bool accuracy_check) { + paddle::experimental::Tensor input_tensor0 = X; + + size_t max_num_runs = accuracy_check ? 2 : max_num_benchmark_runs; + for (size_t i = 0; i < max_num_runs; i++) { + input_tensor0 = + matmul_final_state_dygraph_function(input_tensor0, Y, false, false); + } + + std::vector target_tensors = {input_tensor0}; + Backward(target_tensors, {}); + + if (accuracy_check) { + // Examine Forward Grad (w.r.t max_num_runs = 2) + eager_test::CompareTensorWithValue(input_tensor0, 16); + // Examine Backward Grad (w.r.t max_num_runs = 2) + eager_test::CompareGradTensorWithValue(X, 16); + eager_test::CompareGradTensorWithValue(Y, 16); + } +} + /* ----------------------------------- */ /* ---- Eager Intermediate Matmul ---- */ /* ----------------------------------- */ @@ -82,7 +106,7 @@ void benchmark_eager_intermediate_matmul(const paddle::experimental::Tensor& X, } std::vector target_tensors = {input_tensor0}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); if (accuracy_check) { // Examine Forward Grad (w.r.t max_num_runs = 2) @@ -113,7 +137,7 @@ void benchmark_eager_intermediate_mlp( reduce_sum_dygraph_function(input0, {{"reduce_all", true}}); std::vector target_tensors = {Out}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); if (accuracy_check) { std::unordered_map result = diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h index 0086b51b57e152c6da935eacba8d93c0d6ab1a71..86bf13707ed40b0c37ccb54695cca3d165768cb6 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h @@ -51,15 +51,10 @@ void benchmark_eager_scale(const paddle::experimental::Tensor& tensor, bool accuracy_check = false); /* ---- Eager MatMul ---- */ -/* -void benchmark_eager_matmul(const paddle::experimental::Tensor& X, const -paddle::experimental::Tensor& Y, +void benchmark_eager_matmul(const paddle::experimental::Tensor& X, + const paddle::experimental::Tensor& Y, bool accuracy_check = false); -void benchmark_eager_mlp(const paddle::experimental::Tensor& X, - const std::vector& Ws, - const std::vector& Bs, - bool accuracy_check = false); -*/ + void benchmark_eager_intermediate_matmul(const paddle::experimental::Tensor& X, const paddle::experimental::Tensor& Y, bool accuracy_check = false); diff --git a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt index 
c65ad4641cf2206cc0f97d91f1fb24e50b7b63cd..52dba6b9218c7be8a29ae1aff619facd25a6f3b6 100644 --- a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt +++ b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt @@ -5,6 +5,7 @@ cc_test(test_egr_task_backward SRCS backward_test.cc DEPS ${eager_deps} ${fluid_ cc_test(test_egr_task_hook SRCS hook_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) cc_test(test_egr_task_cross_batch SRCS cross_batch_accumulation_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) cc_test(test_egr_task_fwd_bwd_joint SRCS fwd_bwd_joint_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) +cc_test(test_egr_task_grad SRCS grad_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) cc_test(test_egr_task_hook_intermidiate SRCS hook_test_intermidiate.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} dygraph_node) diff --git a/paddle/fluid/eager/tests/task_tests/backward_test.cc b/paddle/fluid/eager/tests/task_tests/backward_test.cc index 0c894ed267fcdd08d44d4df08bfaf0554874aebf..87f8f6eca1f88fe9a54583ee19586dd75c7e231e 100644 --- a/paddle/fluid/eager/tests/task_tests/backward_test.cc +++ b/paddle/fluid/eager/tests/task_tests/backward_test.cc @@ -33,6 +33,7 @@ #include "paddle/phi/core/kernel_registry.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); namespace egr { @@ -79,7 +80,7 @@ TEST(Backward, SingleNodeEmptyGrad) { } std::vector outs = {target_tensor}; // Run Backward - RunBackward(outs, {}); + Backward(outs, {}); // Check Output Value eager_test::CompareGradTensorWithValue(leaf_tensor, 5.0); @@ -138,7 +139,7 @@ TEST(Backward, SingleNodeCustomGrad) { } // Run Backward - RunBackward(target_tensors, grad_tensors); + Backward(target_tensors, grad_tensors); // Check Output Value eager_test::CompareGradTensorWithValue(leaf_tensor, 50.0); @@ -211,7 +212,7 @@ TEST(Backward, LinearNodes) { } // Use Empty Grad Tensor - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); // Check Output Value eager_test::CompareGradTensorWithValue(leaf_tensor, 50.0); @@ -315,7 +316,7 @@ TEST(Backward, WithAccumulation) { node2_ptr->AddEdges(&res2, 0); } - RunBackward(target_tensors, grad_tensors); + Backward(target_tensors, grad_tensors); eager_test::CompareGradTensorWithValue(leaf_tensor, 2500.0); } diff --git a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc index 36594f1aac8cdb131bb77f1396dca19a0c2e8cc0..8b0759c17ed3712079e8954df60e35afaaf02a9e 100644 --- a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc +++ b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc @@ -71,12 +71,12 @@ TEST(CrossBatchAccumulation, SingleScaleNode) { std::vector res = {meta}; scale_node_ptr->AddEdges(&res, 0); - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(target_tensor, 1.0); eager_test::CompareGradTensorWithValue(leaf_tensor, 5.0); - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(target_tensor, 1.0); eager_test::CompareGradTensorWithValue(leaf_tensor, 10.0); diff --git a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc index f7fa642ea8dd17d20816e74c9bfb4cd92b184b4a..882695e98d109e09340223e21322a02d1b48c6ea 100644 --- 
a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc +++ b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc @@ -86,7 +86,7 @@ TEST(FwdBwdJoint, SingleNode) { std::vector outs = {out}; // 4. Run Backward - RunBackward(outs, {}); + Backward(outs, {}); VLOG(7) << "Target Grad is: " << std::static_pointer_cast( @@ -137,7 +137,7 @@ TEST(FwdBwdJoint, LinearNodes) { std::vector outs = {out1}; // 4. Run Backward - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 10.0); @@ -203,7 +203,7 @@ TEST(FwdBwdJoint, BranchedNodes) { // 4. Run Backward std::vector outs = {out1, out2}; - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 30.0); @@ -260,7 +260,7 @@ TEST(FwdBwdJoint, GradientHook) { // 4. Run Backward std::vector outs = {out1, out2}; - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad // leaf grad @@ -318,13 +318,13 @@ TEST(FwdBwdJoint, CrossBatchAccumulation) { // 4. Run Backward std::vector outs = {out1, out2}; - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 30.0); // Cross Batch Accumulation - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 60.0); @@ -356,7 +356,7 @@ TEST(FwdBwdJoint, SingleNodeCUDA) { std::vector outs = {out}; // 4. Run Backward - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 2.0); @@ -412,7 +412,7 @@ TEST(FwdBwdJoint, BranchedNodesCUDA) { // TODO(jiabin): fix this with add functor // 4. Run Backward std::vector outs = {out1, out2}; - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 30.0); diff --git a/paddle/fluid/eager/tests/task_tests/generated_test.cc b/paddle/fluid/eager/tests/task_tests/generated_test.cc index 2a5ad53204a6201149bec0b3dac0fa3baf441f2e..49e517dc9b3f3271ef26dfbece46f799ef805c57 100644 --- a/paddle/fluid/eager/tests/task_tests/generated_test.cc +++ b/paddle/fluid/eager/tests/task_tests/generated_test.cc @@ -57,7 +57,7 @@ TEST(Generated, Sigmoid) { std::vector target_tensors = {output_tensor}; VLOG(6) << "Runing Backward"; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); VLOG(6) << "Finish Backward"; eager_test::CompareGradTensorWithValue(tensor, 0.25); @@ -89,7 +89,7 @@ TEST(Generated, Matmul_v2) { eager_test::CompareTensorWithValue(output_tensor, 96); std::vector target_tensors = {output_tensor}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 2.0 * 20); eager_test::CompareGradTensorWithValue(Y, 3.0 * 4); @@ -120,7 +120,7 @@ TEST(Generated, ElementwiseAdd) { eager_test::CompareTensorWithValue(output_tensor, 5); std::vector target_tensors = {output_tensor}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 1.0); eager_test::CompareGradTensorWithValue(Y, 1.0); @@ -128,6 +128,6 @@ TEST(Generated, ElementwiseAdd) { } // namespace egr -USE_OP(sigmoid); +USE_OP_ITSELF(sigmoid); USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(matmul_v2); diff --git a/paddle/fluid/eager/tests/task_tests/grad_test.cc b/paddle/fluid/eager/tests/task_tests/grad_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6b03799c48659c579938df6efc0f7cf57bbc0bec --- /dev/null +++ 
b/paddle/fluid/eager/tests/task_tests/grad_test.cc @@ -0,0 +1,339 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h" +#include "paddle/fluid/eager/api/utils/tensor_utils.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/backward.h" +#include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/tests/test_utils.h" + +#include "paddle/fluid/eager/api/all.h" + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); +namespace egr { + +TEST(Grad, SingleNodeEmptyGrad) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + // Prepare Inputs + paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + + // Create Target Tensor (output) + paddle::experimental::Tensor output_tensor = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + + // Create input tensor + const paddle::experimental::Tensor leaf_tensor = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/); + + { + // Create Scale Node + auto node0_ptr = std::make_shared(1, 1); + node0_ptr->SetAttributes_scale(5.0 /*scale*/); + + // Set grad in/out meta + node0_ptr->SetDefaultGradInOutMeta(); + + // Output_tensor set GradNode、OutRank、StopGradient propertis + AutogradMeta* auto_grad_meta = EagerUtils::autograd_meta(&output_tensor); + auto_grad_meta->SetGradNode( + std::dynamic_pointer_cast(node0_ptr)); + auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta->SetStopGradient(false); + + // Get autograd_meta from input tensor + AutogradMeta* auto_grad_meta1 = + EagerUtils::unsafe_autograd_meta(leaf_tensor); + + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = + std::make_shared(auto_grad_meta1); + + // input tensor set GradNode、OutRank、StopGradient propertis + auto_grad_meta1->SetGradNode( + std::dynamic_pointer_cast(acc_node_ptr)); + auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta1->SetStopGradient(false); + + // grad_node Add Edges + std::vector res = {auto_grad_meta1}; + node0_ptr->AddEdges(&res, 0); + } + std::vector outs = {output_tensor}; + + // Run Grad + auto result = Grad(outs, {leaf_tensor}, {}); + // Check Output Value + eager_test::CompareTensorWithValue(result[0], 5.0); +} + +TEST(Grad, SingleNodeCustomGrad) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + // Prepare Inputs + std::vector 
target_tensors; + paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + + // Create Target Tensor + paddle::experimental::Tensor tensor = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + target_tensors.emplace_back(std::move(tensor)); + + std::vector grad_tensors; + // Create Grad Tensor + paddle::experimental::Tensor grad_tensor = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/); + grad_tensors.emplace_back(std::move(grad_tensor)); + + paddle::experimental::Tensor leaf_tensor = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/); + + { + // Create Scale Node + auto node0_ptr = std::make_shared(1, 1); + node0_ptr->SetAttributes_scale(5.0 /*scale*/); + + // Set grad in/out meta + node0_ptr->SetDefaultGradInOutMeta(); + + // Connect Tensor and Node via AutoGradMeta + AutogradMeta* auto_grad_meta = + EagerUtils::autograd_meta(&(target_tensors[0])); + auto_grad_meta->SetGradNode( + std::dynamic_pointer_cast(node0_ptr)); + auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta->SetStopGradient(false); + + AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = + std::make_shared(auto_grad_meta1); + + auto_grad_meta1->SetGradNode( + std::dynamic_pointer_cast(acc_node_ptr)); + auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta1->SetStopGradient(false); + std::vector res = {auto_grad_meta1}; + node0_ptr->AddEdges(&res, 0); + } + + auto result = Grad(target_tensors, {leaf_tensor}, grad_tensors); + + // Check Output Value + eager_test::CompareTensorWithValue(result[0], 50.0); +} + +/* +Node1 + | +Node0 + | + { } // empty grad tensor +*/ +TEST(Grad, LinearNodes) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + // Prepare Target Tensor + std::vector target_tensors; + paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + + // Create Target Tensor + paddle::experimental::Tensor tensor = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + target_tensors.emplace_back(std::move(tensor)); + + paddle::experimental::Tensor leaf_tensor = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/); + { + // Create Node0 + auto node0_ptr = std::make_shared(1, 1); + node0_ptr->SetAttributes_scale(5.0 /*scale*/); + + // Set grad in/out meta for node0 + node0_ptr->SetDefaultGradInOutMeta(); + + // Create Node1 + auto node1_ptr = std::make_shared(1, 1); + node1_ptr->SetAttributes_scale(10.0 /*scale*/); + + // Set grad in/out meta for node1 + node1_ptr->SetDefaultGradInOutMeta(); + + // Connect Input Tensor and Node0 via AutoGradMeta + AutogradMeta* auto_grad_meta = + EagerUtils::autograd_meta(&(target_tensors[0])); + auto_grad_meta->SetGradNode( + std::dynamic_pointer_cast(node0_ptr)); + auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta->SetStopGradient(false); + // Connect Node0 -> Node1 via Edge + auto meta0 = egr::AutogradMeta(); + meta0.SetStopGradient(false); + 
meta0.SetSingleOutRankWithSlot(0, 0); + meta0.SetGradNode(node1_ptr); + std::vector res0 = {&meta0}; + node0_ptr->AddEdges(&res0, 0); + + AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = + std::make_shared(auto_grad_meta1); + + auto_grad_meta1->SetGradNode( + std::dynamic_pointer_cast(acc_node_ptr)); + auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + + auto_grad_meta1->SetStopGradient(false); + std::vector res1 = {auto_grad_meta1}; + node1_ptr->AddEdges(&res1, 0); + } + + // Use Empty Grad Tensor + auto result = Grad(target_tensors, {leaf_tensor}, {}); + + // Check Output Value + eager_test::CompareTensorWithValue(result[0], 50.0); +} + +/* + Node2 + | | +Node0 Node1 + | | + in0 in1 +*/ +TEST(Grad, WithAccumulation) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + // Prepare Inputs + paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + + // Create Target Tensor + std::vector target_tensors; + paddle::experimental::Tensor tensor0 = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + paddle::experimental::Tensor tensor1 = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + target_tensors.emplace_back(std::move(tensor0)); + target_tensors.emplace_back(std::move(tensor1)); + + // Create Grad Tensor + std::vector grad_tensors; + paddle::experimental::Tensor grad_tensor0 = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 5.0 /*value*/, false /*is_leaf*/); + paddle::experimental::Tensor grad_tensor1 = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/); + grad_tensors.emplace_back(std::move(grad_tensor0)); + grad_tensors.emplace_back(std::move(grad_tensor1)); + + paddle::experimental::Tensor leaf_tensor; + { + // Create Node0 + auto node0_ptr = std::make_shared(1, 1); + node0_ptr->SetAttributes_scale(5.0 /*scale*/); + node0_ptr->SetDefaultGradInOutMeta(); + + // Create Node1 + auto node1_ptr = std::make_shared(1, 1); + node1_ptr->SetAttributes_scale(10.0 /*scale*/); + node1_ptr->SetDefaultGradInOutMeta(); + // Create Node2 + auto node2_ptr = std::make_shared(1, 1); + node2_ptr->SetAttributes_scale(20.0 /*scale*/); + node2_ptr->SetDefaultGradInOutMeta(); + // Connect Inp0 and Node0 via AutoGradMeta + AutogradMeta* auto_grad_meta0 = + EagerUtils::autograd_meta(&(target_tensors[0])); + auto_grad_meta0->SetGradNode( + std::dynamic_pointer_cast(node0_ptr)); + auto_grad_meta0->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta0->SetStopGradient(false); + // Connect Inp1 and Node1 via AutoGradMeta + AutogradMeta* auto_grad_meta1 = + EagerUtils::autograd_meta(&(target_tensors[1])); + auto_grad_meta1->SetGradNode( + std::dynamic_pointer_cast(node1_ptr)); + auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta1->SetStopGradient(false); + + // Connect Node0 -> Node2 via Edge + auto meta0 = egr::AutogradMeta(); + meta0.SetStopGradient(false); + meta0.SetSingleOutRankWithSlot(0, 0); + meta0.SetGradNode(node2_ptr); + std::vector res0 = {&meta0}; + node0_ptr->AddEdges(&res0, 0); + + // Connect Node1 -> Node2 via Edge + auto meta1 = 
egr::AutogradMeta(); + meta1.SetStopGradient(false); + meta1.SetSingleOutRankWithSlot(0, 0); + meta1.SetGradNode(node2_ptr); + std::vector res1 = {&meta1}; + node1_ptr->AddEdges(&res1, 0); + + AutogradMeta* auto_grad_meta2 = EagerUtils::autograd_meta(&leaf_tensor); + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = + std::make_shared(auto_grad_meta2); + + auto_grad_meta2->SetGradNode( + std::dynamic_pointer_cast(acc_node_ptr)); + auto_grad_meta2->SetSingleOutRankWithSlot(0, 0); + + auto_grad_meta2->SetStopGradient(false); + std::vector res2 = {auto_grad_meta2}; + node2_ptr->AddEdges(&res2, 0); + } + + auto result = Grad(target_tensors, {leaf_tensor}, grad_tensors); + + eager_test::CompareTensorWithValue(result[0], 2500.0); +} + +} // namespace egr diff --git a/paddle/fluid/eager/tests/task_tests/hook_test.cc b/paddle/fluid/eager/tests/task_tests/hook_test.cc index d546df4ed087a99a28096a5336fab3826991534a..2c53fc89f650e36f1435c7e1e805453fe7822cf2 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test.cc @@ -132,7 +132,7 @@ TEST(RetainGrad, HookBeforeRetainGrad) { leaf_tensor); // result: 4.0*5.0 + 3.0 = 23.0 } - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(target_tensor, 4.0); eager_test::CompareGradTensorWithValue(leaf_tensor, 23.0); @@ -199,7 +199,7 @@ TEST(RetainGrad, HookAfterRetainGrad) { leaf_tensor, std::make_shared(hook_function)); } - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(target_tensor, 1.0); eager_test::CompareGradTensorWithValue(leaf_tensor, 23.0); } diff --git a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc index 56813c498d2410caa452da7a334c393b230c65bf..b86865e2d126fbfc0b00495a6e3208932ac6de39 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc @@ -108,7 +108,7 @@ void test_sigmoid(bool is_remove_gradient_hook) { } VLOG(6) << "Runing Backward"; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); VLOG(6) << "Finish Backward"; eager_test::CompareGradTensorWithValue( @@ -166,7 +166,7 @@ void test_elementwiseAdd(bool is_remove_gradient_hook) { grad_node_tmp->RemoveGradientHook(hook_id); } - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 1.0); eager_test::CompareGradTensorWithValue( @@ -224,7 +224,7 @@ void test_matmul(bool is_remove_gradient_hook) { grad_node_tmp->RemoveGradientHook(hook_id); } - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 2.0 * 20); eager_test::CompareGradTensorWithValue( @@ -255,6 +255,6 @@ TEST(Hook_intermidiate, Matmul_v2) { } } // namespace egr -USE_OP(sigmoid); +USE_OP_ITSELF(sigmoid); USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(matmul_v2); diff --git a/paddle/fluid/eager/to_static/run_program_op_func.h b/paddle/fluid/eager/to_static/run_program_op_func.h index 6f8bccd64e45f015a5c1aed44fbfdfc6f68660f1..277319bc700b652855576db248463b424846e2e9 100644 --- a/paddle/fluid/eager/to_static/run_program_op_func.h +++ b/paddle/fluid/eager/to_static/run_program_op_func.h @@ -57,6 +57,7 @@ inline void run_program_dygraph_function( auto grad_node = std::make_shared(1, 2); grad_node->SetFwdOutNames(out_names); + grad_node->SetOut(out); // Set Attributes 
grad_node->SetAttrMap(attrs); // Set TensorWrappers @@ -65,10 +66,10 @@ inline void run_program_dygraph_function( grad_node->SetStepScope(step_scope); // Set Grad out rank as same as fwd input and set stop gradient to bwd - grad_node->SetGradOutMeta(&p_autograd_x, /*slot id*/ 0); - grad_node->SetGradOutMeta(&p_autograd_params, /*slot id*/ 1); + grad_node->SetGradOutMeta(x, /*slot id*/ 0); + grad_node->SetGradOutMeta(params, /*slot id*/ 1); - grad_node->SetGradInMeta(&p_autograd_outs, 0); + grad_node->SetGradInMeta(deref_out, 0); // Set Next Edges grad_node->AddEdges(&p_autograd_x, /*slot id*/ 0); grad_node->AddEdges(&p_autograd_params, /*slot id*/ 1); diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index ae5d86664a346fd8a1d877f9e1dd74f687302595..4eaa64d3ac659ca0ec76083b70855d8b6b241556 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -260,9 +260,9 @@ inline void RunProgramAPI( } VLOG(2) << "The number of sub scopes after forward: " << out_scope_vec->front()->kids().size(); - // #ifdef PADDLE_WITH_MKLDNN - // if (FLAGS_use_mkldnn) paddle::platform::DontClearMKLDNNCache(place); - // #endif +#ifdef PADDLE_WITH_MKLDNN + if (FLAGS_use_mkldnn) paddle::platform::DontClearMKLDNNCache(place); +#endif } inline void RunProgramGradAPI( @@ -357,7 +357,7 @@ inline void RunProgramGradAPI( details::ShareTensorsFromScope(params_grad, *global_block, &scope); // Step5. drop current scope - // global_inner_scope->DeleteScope(&scope); + global_inner_scope->DeleteScope(&scope); VLOG(2) << "The number of sub scopes after backward: " << global_inner_scope->kids().size(); } @@ -370,8 +370,8 @@ class GradNodeRunProgram : public egr::GradNodeBase { ~GradNodeRunProgram() override = default; // Functor: perform backward computations virtual std::vector> operator()( - const std::vector> &grads) - override { + const std::vector> &grads, + bool create_graph) override { VLOG(3) << "Running Eager Backward Node: GradNodeRunProgram"; PADDLE_ENFORCE_EQ( grads.size(), 1, @@ -400,6 +400,10 @@ class GradNodeRunProgram : public egr::GradNodeBase { paddle::platform::errors::InvalidArgument( "The grads[0].size() and fwd_out_names_.size() should be equal.")); for (size_t i = 0; i < fwd_out_names_.size(); ++i) { + auto &out_grad = egr::EagerUtils::unsafe_autograd_meta(*out_[i])->Grad(); + const_cast(out_grad).set_impl( + grads[0][i].impl()); + const_cast(grads[0][i]) .set_name(fwd_out_names_[i] + "@GRAD"); } @@ -411,6 +415,12 @@ class GradNodeRunProgram : public egr::GradNodeBase { // return {x_grad, details::DereferenceTensors(params_grad_ptr)}; } + void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + bool IsTensorWrappersCleared() override { + VLOG(6) << "Do nothing here now"; + return false; + } + // SetAttrMap void SetAttrMap(const paddle::framework::AttributeMap &attrs) { attrs_ = attrs; @@ -432,6 +442,10 @@ class GradNodeRunProgram : public egr::GradNodeBase { fwd_out_names_ = out_names; } + void SetOut(const std::vector &out) { + out_ = out; + } + protected: void ConstructGradTensors( const std::vector &fwd_tensors, @@ -440,7 +454,11 @@ class GradNodeRunProgram : public egr::GradNodeBase { // such as: name, tensor type(DenseTensor or SelectedRows). 
VLOG(3) << "fwd_tensors.size(): " << fwd_tensors.size(); for (auto &fwd_t : fwd_tensors) { - grad_tensors->emplace_back(fwd_t.impl()); + if (phi::DenseTensor::classof(fwd_t.impl().get())) { + grad_tensors->emplace_back(std::make_shared()); + } else if (phi::SelectedRows::classof(fwd_t.impl().get())) { + grad_tensors->emplace_back(std::make_shared()); + } auto &grad_t = grad_tensors->back(); grad_t.set_name(fwd_t.name() + "@GRAD"); } @@ -462,6 +480,7 @@ class GradNodeRunProgram : public egr::GradNodeBase { std::vector step_scope_; std::vector fwd_out_names_; + std::vector out_; // Attribute Map paddle::framework::AttributeMap attrs_; diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 8a57d2694535e9c27e88416468fe5a67ce020b43..048087903a47c1699a7d7f32199c313146bd37ab 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -212,6 +212,27 @@ std::vector> EagerUtils::CreateVars( return res; } +void EagerUtils::ModifyInplaceInput( + const std::shared_ptr& inplace_variable, + paddle::experimental::Tensor* inplace_tensor) { + // Only modify the meta information of the inplace tensor, because + // EagerVariable cannot modify Tensor's meta information after inplace + // op (such as ``reshape``) is executed. + PADDLE_ENFORCE_NOT_NULL(inplace_tensor, + paddle::platform::errors::Fatal( + "Inplace Tensor is null and cannot be modified. " + "We are tring to Modify Inplace Input from its " + "shared_ptr, this error may indicate the inplace " + " input is nullptr")); + if (phi::DenseTensor::classof(inplace_variable->GetTensorBase().get())) { + phi::DenseTensor* variable_dense_tensor = + static_cast(inplace_variable->GetTensorBase().get()); + phi::DenseTensor* tensor_dense_tensor = + static_cast(inplace_tensor->impl().get()); + tensor_dense_tensor->set_meta(variable_dense_tensor->meta()); + } +} + std::vector EagerUtils::GetOutputs( const std::vector>& outs) { std::vector res; diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h index fa5735e6f32a0ca7762b9ba94cce26ac8ac567dd..fbd080ef70e25408abcb979360610ad08d752f96 100644 --- a/paddle/fluid/eager/utils.h +++ b/paddle/fluid/eager/utils.h @@ -14,6 +14,7 @@ #pragma once +#include "paddle/fluid/eager/api/utils/tensor_utils.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" @@ -144,6 +145,19 @@ class EagerUtils { iter.apply(std::forward(args)...); } + static void CheckInplace(const paddle::experimental::Tensor& target, + const AutogradMeta* autograd_meta, + bool require_any_grad) { + if (require_any_grad && autograd_meta) { + PADDLE_ENFORCE_EQ(!autograd_meta->StopGradient() && + egr::egr_utils_api::IsLeafTensor(target), + false, paddle::platform::errors::InvalidArgument( + "Leaf Var (%s) that doesn't stop gradient " + "can't use inplace strategy.", + target.name())); + } + } + // TensorWrapper Utils static paddle::experimental::Tensor RecoverTensorWrapper( TensorWrapper* tw, const std::shared_ptr& grad_node); @@ -171,6 +185,9 @@ class EagerUtils { static std::vector> CreateVars( const size_t num); // Construct Tensor From var + static void ModifyInplaceInput( + const std::shared_ptr& inplace_variable, + paddle::experimental::Tensor* inplace_tensor); static std::vector GetOutputs( const std::vector>& outs); static paddle::experimental::Tensor GetOutput( diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 
aa92a3b2226c1fca1fa7326e76ef29b0b38cd8d6..5dc3d9e89c557e86f5af821446b82ad691ad5c95 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -440,6 +440,7 @@ message(STATUS "branch: ${PADDLE_BRANCH}") configure_file(commit.h.in commit.h) cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper phi_tensor op_meta_info phi_api) + #cc_binary(test_executor SRCS test_executor.cc DEPS executor op_registry ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ) #cc_binary(new_executor SRCS new_exec_test.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler) diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index b9e3bee25f6b5377dde7b525138643964fd8366a..478e39b99dcc9935306603a48810d46ba792d3c3 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -25,6 +25,7 @@ limitations under the License. */ #include #include +#include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/op_meta_info_helper.h" @@ -946,15 +947,16 @@ void RegisterOperatorWithMetaInfoMap( ////////////////////// User APIs /////////////////////// // load op api -void LoadOpMetaInfoAndRegisterOp(const std::string& dso_name) { +const std::unordered_map>& +LoadOpMetaInfoAndRegisterOp(const std::string& dso_name) { void* handle = paddle::platform::dynload::GetOpDsoHandle(dso_name); VLOG(3) << "load custom_op lib: " << dso_name; typedef OpMetaInfoMap& get_op_meta_info_map_t(); auto* get_op_meta_info_map = detail::DynLoad(handle, "PD_GetOpMetaInfoMap"); auto& op_meta_info_map = get_op_meta_info_map(); - RegisterOperatorWithMetaInfoMap(op_meta_info_map, handle); + return op_meta_info_map.GetMap(); } } // namespace framework diff --git a/paddle/fluid/framework/custom_operator.h b/paddle/fluid/framework/custom_operator.h index 4310b564371822d0238a55b9091f524d8d419966..fef1e82a14fe6e03de40c8376f922f87f64564f8 100644 --- a/paddle/fluid/framework/custom_operator.h +++ b/paddle/fluid/framework/custom_operator.h @@ -20,9 +20,9 @@ limitations under the License. */ namespace paddle { namespace framework { - // Load custom op api: register op after user compiled -void LoadOpMetaInfoAndRegisterOp(const std::string& dso_name); +const std::unordered_map>& +LoadOpMetaInfoAndRegisterOp(const std::string& dso_name); // Register custom op api: register op directly void RegisterOperatorWithMetaInfoMap( @@ -31,6 +31,5 @@ void RegisterOperatorWithMetaInfoMap( // Interface for selective register custom op. void RegisterOperatorWithMetaInfo(const std::vector& op_meta_infos, void* dso_handle = nullptr); - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc index 1a4f283f511da4300d26e764907998ad647eeebf..589d09bf81c1d95795cd80ed22581e52156ae417 100644 --- a/paddle/fluid/framework/data_device_transform.cc +++ b/paddle/fluid/framework/data_device_transform.cc @@ -34,6 +34,14 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place, return; } + // NOTE(hqp): Special case for CPU->MLU, avoid stream sync. 
+ if (platform::is_cpu_place(in.place()) && platform::is_mlu_place(dst_place)) { + paddle::framework::TensorCopy( + in, dst_place, *platform::DeviceContextPool::Instance().Get(dst_place), + out); + return; + } + // NOTE(yy): TransDataDevice should wait for computation of input. if (!platform::is_cuda_pinned_place(in.place())) { platform::DeviceContextPool::Instance().Get(in.place())->Wait(); diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 48850d4624a14c32d30e4562b322115127823c6b..f951b5d0f507039decfc3b4d0081f17cc9f8f50e 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -174,10 +174,11 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, bool force_disable_gc, bool keep_kid_scopes) { platform::RecordBlock b(block_id); if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc); + auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc); #ifdef PADDLE_WITH_MKLDNN platform::AttachPointerHashToMKLDNNKey(this, place_); + platform::RegisterModelLayout(ctx->ops_, place_); #endif - auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc); RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars, keep_kid_scopes); } diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index 17346f5fd939324e6c2d709fb09be2cb65669429..2b8b4b3ff9573f601f8da3092c18433a49a93869 100644 --- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -10,8 +10,9 @@ IF(WITH_GPU) nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h mem_pool.h DEPS ${HETERPS_DEPS}) nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm) nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm) - nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm) + nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm table) nv_test(test_graph_comm SRCS test_graph.cu DEPS graph_gpu_ps) + nv_test(test_cpu_graph_sample SRCS test_cpu_graph_sample.cu DEPS graph_gpu_ps) ENDIF() IF(WITH_ROCM) hip_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context) diff --git a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h new file mode 100644 index 0000000000000000000000000000000000000000..235f7a226ad17649960d1e72d7907e8013e406fe --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h @@ -0,0 +1,120 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#ifdef PADDLE_WITH_HETERPS +namespace paddle { +namespace framework { +struct GpuPsGraphNode { + int64_t node_id; + int neighbor_size, neighbor_offset; + // this node's neighbor is stored on [neighbor_offset,neighbor_offset + + // neighbor_size) of int64_t *neighbor_list; +}; + +struct GpuPsCommGraph { + int64_t *neighbor_list; + GpuPsGraphNode *node_list; + int neighbor_size, node_size; + // the size of neighbor array and graph_node_list array + GpuPsCommGraph() + : neighbor_list(NULL), node_list(NULL), neighbor_size(0), node_size(0) {} + GpuPsCommGraph(int64_t *neighbor_list_, GpuPsGraphNode *node_list_, + int neighbor_size_, int node_size_) + : neighbor_list(neighbor_list_), + node_list(node_list_), + neighbor_size(neighbor_size_), + node_size(node_size_) {} +}; + +/* +suppose we have a graph like this + +0----3-----5----7 + \ |\ |\ + 17 8 9 1 2 + +we save the nodes in arbitrary order, +in this example,the order is +[0,5,1,2,7,3,8,9,17] +let us name this array u_id; +we record each node's neighbors: +0:3,17 +5:3,7 +1:7 +2:7 +7:1,2,5 +3:0,5,8,9 +8:3 +9:3 +17:0 + +by concatenating each node's neighbor_list in the order we save the node id. +we get [3,17,3,7,7,7,1,2,5,0,5,8,9,3,3,0] +this is the neighbor_list of GpuPsCommGraph +given this neighbor_list and the order to save node id, +we know, +node 0's neighbors are in the range [0,1] of neighbor_list +node 5's neighbors are in the range [2,3] of neighbor_list +node 1's neighbors are in the range [4,4] of neighbor_list +node 2:[5,5] +node 7:[6,6] +node 3:[9,12] +node 8:[13,13] +node 9:[14,14] +node 17:[15,15] +... +by the above information, +we generate a node_list:GpuPsGraphNode *graph_node_list in GpuPsCommGraph +of size 9, +where node_list[i].id = u_id[i] +then we have: +node_list[0]-> node_id:0, neighbor_size:2, neighbor_offset:0 +node_list[1]-> node_id:5, neighbor_size:2, neighbor_offset:2 +node_list[2]-> node_id:1, neighbor_size:1, neighbor_offset:4 +node_list[3]-> node_id:2, neighbor_size:1, neighbor_offset:5 +node_list[4]-> node_id:7, neighbor_size:3, neighbor_offset:6 +node_list[5]-> node_id:3, neighbor_size:4, neighbor_offset:9 +node_list[6]-> node_id:8, neighbor_size:1, neighbor_offset:13 +node_list[7]-> node_id:9, neighbor_size:1, neighbor_offset:14 +node_list[8]-> node_id:17, neighbor_size:1, neighbor_offset:15 +*/ +struct NeighborSampleResult { + int64_t *val; + int *actual_sample_size, sample_size, key_size; + NeighborSampleResult(int _sample_size, int _key_size) + : sample_size(_sample_size), key_size(_key_size) { + actual_sample_size = NULL; + val = NULL; + }; + ~NeighborSampleResult() { + if (val != NULL) cudaFree(val); + if (actual_sample_size != NULL) cudaFree(actual_sample_size); + } +}; + +struct NodeQueryResult { + int64_t *val; + int actual_sample_size; + NodeQueryResult() { + val = NULL; + actual_sample_size = 0; + }; + ~NodeQueryResult() { + if (val != NULL) cudaFree(val); + } +}; +} +}; +#endif diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h index a6508bf96c00f835da4aee79503f16fa5451e794..b8f9f0bfec9b2a0bf6b6fb1e122e40b3eaa90fa8 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h @@ -14,114 +14,25 @@ #pragma once #include "heter_comm.h" +#include "paddle/fluid/distributed/ps/table/common_graph_table.h" +#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h" #include "paddle/fluid/platform/enforce.h" #ifdef 
PADDLE_WITH_HETERPS namespace paddle { namespace framework { -struct GpuPsGraphNode { - int64_t node_id; - int neighbor_size, neighbor_offset; - // this node's neighbor is stored on [neighbor_offset,neighbor_offset + - // neighbor_size) of int64_t *neighbor_list; -}; - -struct GpuPsCommGraph { - int64_t *neighbor_list; - GpuPsGraphNode *node_list; - int neighbor_size, node_size; - // the size of neighbor array and graph_node_list array - GpuPsCommGraph() - : neighbor_list(NULL), node_list(NULL), neighbor_size(0), node_size(0) {} - GpuPsCommGraph(int64_t *neighbor_list_, GpuPsGraphNode *node_list_, - int neighbor_size_, int node_size_) - : neighbor_list(neighbor_list_), - node_list(node_list_), - neighbor_size(neighbor_size_), - node_size(node_size_) {} -}; - -/* -suppose we have a graph like this -0----3-----5----7 - \ |\ |\ - 17 8 9 1 2 - -we save the nodes in arbitrary order, -in this example,the order is -[0,5,1,2,7,3,8,9,17] -let us name this array u_id; -we record each node's neighbors: -0:3,17 -5:3,7 -1:7 -2:7 -7:1,2,5 -3:0,5,8,9 -8:3 -9:3 -17:0 - -by concatenating each node's neighbor_list in the order we save the node id. -we get [3,17,3,7,7,7,1,2,5,0,5,8,9,3,3,0] -this is the neighbor_list of GpuPsCommGraph -given this neighbor_list and the order to save node id, -we know, -node 0's neighbors are in the range [0,1] of neighbor_list -node 5's neighbors are in the range [2,3] of neighbor_list -node 1's neighbors are in the range [4,4] of neighbor_list -node 2:[5,5] -node 7:[6,6] -node 3:[9,12] -node 8:[13,13] -node 9:[14,14] -node 17:[15,15] -... -by the above information, -we generate a node_list:GpuPsGraphNode *graph_node_list in GpuPsCommGraph -of size 9, -where node_list[i].id = u_id[i] -then we have: -node_list[0]-> node_id:0, neighbor_size:2, neighbor_offset:0 -node_list[1]-> node_id:5, neighbor_size:2, neighbor_offset:2 -node_list[2]-> node_id:1, neighbor_size:1, neighbor_offset:4 -node_list[3]-> node_id:2, neighbor_size:1, neighbor_offset:5 -node_list[4]-> node_id:7, neighbor_size:3, neighbor_offset:6 -node_list[5]-> node_id:3, neighbor_size:4, neighbor_offset:9 -node_list[6]-> node_id:8, neighbor_size:1, neighbor_offset:13 -node_list[7]-> node_id:9, neighbor_size:1, neighbor_offset:14 -node_list[8]-> node_id:17, neighbor_size:1, neighbor_offset:15 -*/ -struct NeighborSampleResult { - int64_t *val; - int *actual_sample_size, sample_size, key_size; - NeighborSampleResult(int _sample_size, int _key_size) - : sample_size(_sample_size), key_size(_key_size) { - actual_sample_size = NULL; - val = NULL; - }; - ~NeighborSampleResult() { - if (val != NULL) cudaFree(val); - if (actual_sample_size != NULL) cudaFree(actual_sample_size); - } -}; - -struct NodeQueryResult { - int64_t *val; - int actual_sample_size; - NodeQueryResult() { - val = NULL; - actual_sample_size = 0; - }; - ~NodeQueryResult() { - if (val != NULL) cudaFree(val); - } -}; class GpuPsGraphTable : public HeterComm { public: GpuPsGraphTable(std::shared_ptr resource) : HeterComm(1, resource) { load_factor_ = 0.25; + rw_lock.reset(new pthread_rwlock_t()); + cpu_table_status = -1; + } + ~GpuPsGraphTable() { + if (cpu_table_status != -1) { + end_graph_sampling(); + } } void build_graph_from_cpu(std::vector &cpu_node_list); NodeQueryResult *graph_node_sample(int gpu_id, int sample_size); @@ -134,9 +45,19 @@ class GpuPsGraphTable : public HeterComm { int *h_right, int64_t *src_sample_res, int *actual_sample_size); + int init_cpu_table(const paddle::distributed::GraphParameter &graph); + int load(const std::string &path, 
const std::string ¶m); + virtual int32_t end_graph_sampling() { + return cpu_graph_table->end_graph_sampling(); + } private: std::vector gpu_graph_list; + std::shared_ptr cpu_graph_table; + std::shared_ptr rw_lock; + mutable std::mutex mutex_; + std::condition_variable cv_; + int cpu_table_status; }; } }; diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h index 839c7e5468c6c6938c6b4cda3dd879c7366e7d6e..16a6857ae96eecaaa06b92b9912387f22612f53e 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h @@ -14,6 +14,7 @@ #pragma once #ifdef PADDLE_WITH_HETERPS +//#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" namespace paddle { namespace framework { /* @@ -45,6 +46,33 @@ __global__ void neighbor_sample_example(GpuPsCommGraph graph, int* index, } } +int GpuPsGraphTable::init_cpu_table( + const paddle::distributed::GraphParameter& graph) { + cpu_graph_table.reset(new paddle::distributed::GraphTable); + cpu_table_status = cpu_graph_table->initialize(graph); + if (cpu_table_status != 0) return cpu_table_status; + std::function&)> callback = + [this](std::vector& res) { + pthread_rwlock_wrlock(this->rw_lock.get()); + this->clear_graph_info(); + this->build_graph_from_cpu(res); + pthread_rwlock_unlock(this->rw_lock.get()); + cv_.notify_one(); + }; + cpu_graph_table->set_graph_sample_callback(callback); + return cpu_table_status; +} + +int GpuPsGraphTable::load(const std::string& path, const std::string& param) { + int status = cpu_graph_table->load(path, param); + if (status != 0) { + return status; + } + std::unique_lock lock(mutex_); + cpu_graph_table->start_graph_sampling(); + cv_.wait(lock); + return 0; +} /* comment 1 @@ -68,6 +96,7 @@ __global__ void neighbor_sample_example(GpuPsCommGraph graph, int* index, that's what fill_dvals does. */ + void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu( int gpu_id, int gpu_num, int sample_size, int* h_left, int* h_right, int64_t* src_sample_res, int* actual_sample_size) { @@ -258,7 +287,7 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, auto d_shard_keys = memory::Alloc(place, len * sizeof(int64_t)); int64_t* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); - auto d_shard_vals = memory::Alloc(place, len * sizeof(int64_t)); + auto d_shard_vals = memory::Alloc(place, sample_size * len * sizeof(int64_t)); int64_t* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); auto d_shard_actual_sample_size = memory::Alloc(place, len * sizeof(int)); int* d_shard_actual_sample_size_ptr = diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 2cf702969f99a02cd2b89d69c94f42b265d46135..f85ed330dc8ea4eb4199b6ab006ac54be1b30b0d 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once #ifdef PADDLE_WITH_HETERPS +//#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" #include namespace paddle { diff --git a/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu b/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu new file mode 100644 index 0000000000000000000000000000000000000000..8c7ea10b26565a4181230f6150272babd315105f --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu @@ -0,0 +1,108 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" +#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" +#include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" +#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" +#include "paddle/fluid/platform/cuda_device_guard.h" + +using namespace paddle::framework; +void prepare_file(char file_name[], std::vector data) { + std::ofstream ofile; + ofile.open(file_name); + for (auto x : data) { + ofile << x << std::endl; + } + + ofile.close(); +} +char edge_file_name[] = "edges.txt"; +TEST(TEST_FLEET, graph_sample) { + std::vector edges; + int gpu_count = 3; + std::vector dev_ids; + dev_ids.push_back(0); + dev_ids.push_back(1); + dev_ids.push_back(2); + + std::shared_ptr resource = + std::make_shared(dev_ids); + resource->enable_p2p(); + GpuPsGraphTable g(resource); + int node_count = 10; + std::vector> neighbors(node_count); + int ind = 0; + int64_t node_id = 0; + // std::vector graph_list(gpu_count); + while (ind < node_count) { + int neighbor_size = ind + 1; + while (neighbor_size--) { + edges.push_back(std::to_string(ind) + "\t" + std::to_string(node_id) + + "\t1.0"); + node_id++; + } + ind++; + } + /* + gpu 0: + 0,3,6,9 + gpu 1: + 1,4,7 + gpu 2: + 2,5,8 + + query(2,6) returns nodes [6,9,1,4,7,2] + */ + ::paddle::distributed::GraphParameter table_proto; + table_proto.set_gpups_mode(true); + table_proto.set_gpups_mode_shard_num(127); + table_proto.set_gpu_num(3); + table_proto.set_gpups_graph_sample_class("BasicBfsGraphSampler"); + table_proto.set_gpups_graph_sample_args("5,5,1,1"); + prepare_file(edge_file_name, edges); + g.init_cpu_table(table_proto); + g.load(std::string(edge_file_name), std::string("e>")); + /* + node x's neighbor list = [(1+x)*x/2,(1+x)*x/2 + 1,.....,(1+x)*x/2 + x] + so node 6's neighbors are [21,22...,27] + node 7's neighbors are [28,29,..35] + node 0's neighbors are [0] + query([7,0,6],sample_size=3) should return [28,29,30,0,x,x,21,22,23] + 6 --index-->2 + 0 --index--->0 + 7 --index-->2 + */ + int64_t cpu_key[3] = {7, 0, 6}; + void *key; + cudaMalloc((void **)&key, 3 * sizeof(int64_t)); + cudaMemcpy(key, cpu_key, 3 * sizeof(int64_t), cudaMemcpyHostToDevice); + auto neighbor_sample_res = g.graph_neighbor_sample(0, (int64_t *)key, 3, 3); + int64_t *res = new int64_t[9]; + 
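The lookup that graph_neighbor_sample ultimately performs rests on the CSR-style layout documented in gpu_graph_node.h: each node_list entry carries a neighbor_size and a neighbor_offset into one flat neighbor_list. The host-only sketch below rebuilds the worked example from that comment block with plain std::vector stand-ins for the device arrays; it is illustrative only and does not use the real GpuPsCommGraph or CUDA allocations.

#include <cstdint>
#include <iostream>
#include <vector>

// Host-only mirror of the node layout described in gpu_graph_node.h.
struct Node {
  int64_t node_id;
  int neighbor_size;
  int neighbor_offset;
};

int main() {
  // Nodes stored in the order [0,5,1,2,7,3,8,9,17]; neighbors concatenated
  // in that same order, exactly as in the comment block.
  std::vector<int64_t> neighbor_list = {3, 17, 3, 7, 7, 7, 1, 2, 5,
                                        0, 5, 8, 9, 3, 3, 0};
  std::vector<Node> node_list = {
      {0, 2, 0}, {5, 2, 2}, {1, 1, 4},  {2, 1, 5},  {7, 3, 6},
      {3, 4, 9}, {8, 1, 13}, {9, 1, 14}, {17, 1, 15}};

  // A node's neighbors are just an offset + length into the flat array.
  for (const auto& n : node_list) {
    std::cout << "node " << n.node_id << ":";
    for (int i = 0; i < n.neighbor_size; ++i) {
      std::cout << " " << neighbor_list[n.neighbor_offset + i];
    }
    std::cout << "\n";
  }
  return 0;
}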
cudaMemcpy(res, neighbor_sample_res->val, 72, cudaMemcpyDeviceToHost); + std::sort(res, res + 3); + std::sort(res + 6, res + 9); + int64_t expected_sample_val[] = {28, 29, 30, 0, -1, -1, 21, 22, 23}; + for (int i = 0; i < 9; i++) { + if (expected_sample_val[i] != -1) { + ASSERT_EQ(res[i], expected_sample_val[i]); + } + } + delete[] res; + delete neighbor_sample_res; +} diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 31a30f72e3aa6120efbaa158b1d1786dda155145..432e57107e84d9399c76f21343fe9ef5dd879473 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -148,7 +148,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { t.join(); } timeline.Pause(); - VLOG(1) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; + VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; } else { CHECK(data_set_name.find("MultiSlotDataset") != std::string::npos); VLOG(0) << "ps_gpu_wrapper use MultiSlotDataset"; @@ -182,7 +182,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { t.join(); } timeline.Pause(); - VLOG(1) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; + VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; } timeline.Start(); @@ -300,7 +300,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { int32_t cnt = 0; while (true) { auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( - reinterpret_cast(local_ptr[i].data()), this->table_id_, + i, reinterpret_cast(local_ptr[i].data()), this->table_id_, local_keys[i].data(), key_size); bool flag = true; @@ -378,8 +378,8 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { int32_t cnt = 0; while (true) { auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( - reinterpret_cast(local_dim_ptr[i][j].data()), this->table_id_, - local_dim_keys[i][j].data(), key_size); + i, reinterpret_cast(local_dim_ptr[i][j].data()), + this->table_id_, local_dim_keys[i][j].data(), key_size); bool flag = true; tt.wait(); @@ -431,7 +431,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { t.join(); } timeline.Pause(); - VLOG(1) << "pull sparse from CpuPS into GpuPS cost " << timeline.ElapsedSec() + VLOG(0) << "pull sparse from CpuPS into GpuPS cost " << timeline.ElapsedSec() << " seconds."; if (multi_node_) { auto gloo_wrapper = paddle::framework::GlooWrapper::GetInstance(); @@ -603,7 +603,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { t.join(); } timeline.Pause(); - VLOG(1) << "GpuPs prepare for build hbm cost " << timeline.ElapsedSec() + VLOG(0) << "GpuPs prepare for build hbm cost " << timeline.ElapsedSec() << " seconds."; } @@ -746,7 +746,7 @@ void PSGPUWrapper::BeginPass() { "[BeginPass] after build_task, current task is not null.")); } - VLOG(1) << "BeginPass end, cost time: " << timer.ElapsedSec() << "s"; + VLOG(0) << "BeginPass end, cost time: " << timer.ElapsedSec() << "s"; } void PSGPUWrapper::EndPass() { @@ -769,7 +769,7 @@ void PSGPUWrapper::EndPass() { current_task_ = nullptr; gpu_free_channel_->Put(current_task_); timer.Pause(); - VLOG(1) << "EndPass end, cost time: " << timer.ElapsedSec() << "s"; + VLOG(0) << "EndPass end, cost time: " << timer.ElapsedSec() << "s"; } void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index 
29c7f5d0ce73cbf1af18e6f5869d59d2200917ad..2babecc6ddf933e19b9d704ee7515f56f7431839 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -78,6 +78,11 @@ class InferShapeArgumentMappingContext : public phi::ArgumentMappingContext { return var_types[0] == proto::VarType::SELECTED_ROWS; } + bool IsDenseTensorVectorInput(const std::string& name) const override { + auto var_types = ctx_.GetInputsVarType(name); + return var_types[0] == proto::VarType::LOD_TENSOR_ARRAY; + } + bool IsDenseTensorOutput(const std::string& name) const override { auto var_types = ctx_.GetOutputsVarType(name); return var_types[0] == proto::VarType::LOD_TENSOR; @@ -125,9 +130,14 @@ class CompatMetaTensor : public phi::MetaTensor { return var->Get().dims(); } else if (var->IsType()) { return var->Get().dims(); + } else if (var->IsType()) { + // use tensor array size as dims + auto& tensor_array = var->Get(); + return phi::make_ddim({static_cast(tensor_array.size())}); } else { PADDLE_THROW(platform::errors::Unimplemented( - "Currently, only can get dims from DenseTensor or SelectedRows.")); + "Currently, only can get dims from DenseTensor or SelectedRows or " + "DenseTensorArray.")); } } else { auto* var = BOOST_GET_CONST(VarDesc*, var_); @@ -144,6 +154,10 @@ class CompatMetaTensor : public phi::MetaTensor { return var->Get().dtype(); } else if (var->IsType()) { return var->Get().dtype(); + } else if (var->IsType()) { + // NOTE(chenweihang): do nothing + // Unsupported get dtype from LoDTensorArray now + return phi::DataType::UNDEFINED; } else { PADDLE_THROW(platform::errors::Unimplemented( "Currently, only can get dtype from DenseTensor or SelectedRows.")); @@ -157,7 +171,19 @@ class CompatMetaTensor : public phi::MetaTensor { DataLayout layout() const override { if (is_runtime_) { auto* var = BOOST_GET_CONST(Variable*, var_); - return var->Get().layout(); + if (var->IsType()) { + return var->Get().layout(); + } else if (var->IsType()) { + return var->Get().layout(); + } else if (var->IsType()) { + // NOTE(chenweihang): do nothing + // Unsupported get layout from LoDTensorArray now + return phi::DataLayout::UNDEFINED; + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Currently, only can get layout from DenseTensor or " + "SelectedRows.")); + } } else { // NOTE(chenweihang): do nothing // Unsupported get layout for VarDesc now @@ -174,6 +200,16 @@ class CompatMetaTensor : public phi::MetaTensor { } else if (var->IsType()) { auto* tensor = var->GetMutable()->mutable_value(); phi::DenseTensorUtils::GetMutableMeta(tensor)->dims = dims; + } else if (var->IsType()) { + auto* tensor_array = var->GetMutable(); + // Note: Here I want enforce `tensor_array->size() == 0UL`, because + // inplace using on LoDTensorArray is dangerous, but the unittest + // `test_list` contains this behavior + PADDLE_ENFORCE_EQ(dims.size(), 1UL, + platform::errors::InvalidArgument( + "LoDTensorArray can only have one dimension.")); + // only set the array size for LoDTensorArray input + tensor_array->resize(dims[0]); } else { PADDLE_THROW(platform::errors::Unimplemented( "Currently, only can set dims from DenseTensor or SelectedRows.")); @@ -193,6 +229,9 @@ class CompatMetaTensor : public phi::MetaTensor { } else if (var->IsType()) { auto* tensor = var->GetMutable()->mutable_value(); phi::DenseTensorUtils::GetMutableMeta(tensor)->dtype = dtype; + } else if (var->IsType()) { + // NOTE(chenweihang): do nothing + // Unsupported set dtype for LoDTensorArray now } else { 
PADDLE_THROW(platform::errors::Unimplemented( "Currently, only can set dtype from DenseTensor or SelectedRows.")); @@ -206,10 +245,20 @@ class CompatMetaTensor : public phi::MetaTensor { void set_layout(DataLayout layout) override { if (is_runtime_) { auto* var = BOOST_GET(Variable*, var_); - LoDTensor* tensor = var->GetMutable(); - phi::DenseTensorUtils::GetMutableMeta( - static_cast(tensor)) - ->layout = layout; + if (var->IsType()) { + auto* tensor = var->GetMutable(); + phi::DenseTensorUtils::GetMutableMeta(tensor)->layout = layout; + } else if (var->IsType()) { + auto* tensor = var->GetMutable()->mutable_value(); + phi::DenseTensorUtils::GetMutableMeta(tensor)->layout = layout; + } else if (var->IsType()) { + // NOTE(chenweihang): do nothing + // Unsupported set dtype for LoDTensorArray now + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Currently, only can set layout from DenseTensor or " + "SelectedRows.")); + } } else { // NOTE(chenweihang): do nothing // Unsupported set layout for VarDesc now @@ -249,13 +298,11 @@ class CompatMetaTensor : public phi::MetaTensor { } void share_meta(const MetaTensor& meta_tensor) override { + share_dims(meta_tensor); set_dtype(meta_tensor.dtype()); - // VarDesc doesn't contains layout, so we cannot share layout - // set_layout(meta_tensor.layout()); - - // special case 1: share lod of LoDTensor + set_layout(meta_tensor.layout()); + // special case: share lod of LoDTensor share_lod(meta_tensor); - share_dims(meta_tensor); } private: @@ -297,7 +344,8 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, VLOG(3) << "BuildInferMetaContext: op kernel signature - " << signature; // 2. build infermeta context - phi::InferMetaContext infer_meta_context(ctx->IsRuntime()); + phi::InferMetaContext infer_meta_context( + {ctx->IsRuntime(), ctx->IsRunMKLDNNKernel()}); auto& input_names = std::get<0>(signature.args); auto& attr_names = std::get<1>(signature.args); @@ -441,6 +489,51 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, attr_name, infershape_input.size())); } } + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector))) { + auto& attr = attr_reader.GetAttr(attr_name); + if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); + } 
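The four branches above differ only in the element type handed to BOOST_GET_CONST before the values are re-wrapped as scalars. A minimal sketch of how such branches can collapse into a single visitor is below; std::variant and the local Scalar struct are stand-ins assumed for illustration, not the framework's Attribute or phi::Scalar.

#include <cstdint>
#include <variant>
#include <vector>

// Stand-ins for this sketch only: the attribute is a variant over the
// supported vector types, and Scalar can hold any arithmetic element.
using Attribute = std::variant<std::vector<int32_t>, std::vector<int64_t>,
                               std::vector<float>, std::vector<double>>;
struct Scalar {
  double value;
  template <typename T>
  Scalar(T v) : value(static_cast<double>(v)) {}
};

// One generic visitor replaces the four near-identical branches.
std::vector<Scalar> ToScalarList(const Attribute& attr) {
  return std::visit(
      [](const auto& vec) {
        std::vector<Scalar> out;
        out.reserve(vec.size());
        for (const auto& v : vec) out.emplace_back(v);
        return out;
      },
      attr);
}

int main() {
  Attribute attr = std::vector<int64_t>{2, 3, 5};
  return ToScalarList(attr).size() == 3 ? 0 : 1;
}

The patch itself keeps the explicit branches, which avoids touching the attribute variant's definition; the visitor form is only a note on how the repetition could be reduced.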
else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector when " + "construct InferMetaContext.", + attr_names[i])); + } } else if (ctx->HasAttr(attr_name)) { // Emplace Back Attr according to the type of attr. auto& attr = attr_reader.GetAttr(attr_name); @@ -499,8 +592,22 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, "Unsupported attribute type is received when call " "InferShapeFunctor.")); } - } else { - // do nothing + } else if (ctx->HasInput(attr_name)) { + // convert from data + if (attr_defs[i].type_index == std::type_index(typeid(int32_t))) { + if (ctx->IsRuntime()) { + const auto& infershape_inputs = ctx->GetInputVarPtrs(attr_name); + auto var_temp = BOOST_GET_CONST(Variable*, infershape_inputs[i]); + auto val = experimental::MakePhiScalarFromVar(*var_temp); + int32_t val_int = val.template to(); + infer_meta_context.EmplaceBackAttr(val_int); + } else { + infer_meta_context.EmplaceBackAttr(-1); + } + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Get value from variable only support int yet")); + } } } diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index a1f2d6edca6a2db5d5bb4c8cf896c492f20ed2da..7aaaef712a6e9186058b579d1c69b0cfb201d899 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -97,6 +97,7 @@ pass_library(layer_norm_fuse_pass inference) pass_library(add_support_int8_pass inference) pass_library(matmul_scale_fuse_pass inference) pass_library(gpu_cpu_map_matmul_to_mul_pass inference) +pass_library(mixed_precision_configure_pass inference) pass_library(generate_pass DEPS pass_desc_proto) target_link_libraries(generate_pass pass_desc_proto) @@ -126,6 +127,7 @@ if(WITH_MKLDNN) pass_library(interpolate_mkldnn_pass inference DIR mkldnn) pass_library(softplus_activation_mkldnn_fuse_pass inference DIR mkldnn) pass_library(fc_act_mkldnn_fuse_pass inference DIR mkldnn) + pass_library(elt_act_mkldnn_fuse_pass inference DIR mkldnn) pass_library(cpu_quantize_placement_pass base DIR mkldnn) pass_library(cpu_quantize_pass inference DIR mkldnn) pass_library(cpu_quantize_squash_pass inference DIR mkldnn) diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 036fde8fac6d911f8a97dbc097fae7f9fdd2ab6f..f5f6f3ecb855cfa9acb6c2169f1fc43458578a2a 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -95,6 +95,7 @@ std::map> Graph::InitFromBlock( std::unordered_map> name_to_desc_block_id; + block_id_ = block.ID(); const BlockDesc *block_var_visible = █ while (block_var_visible != nullptr) { for (auto *var : block_var_visible->AllVars()) { diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 21e743e3587d80536b7bd4805298f22a99482217..10645f08dc3ba833c3a4ca75a1ac623ee2c1e8e9 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -230,6 +230,7 @@ class Graph { auto *x = AddNode(new ir::Node(var_desc, block_id == -1 ? 
block_id_ : block_id)); x->SetId(num_node_created_++); + x->SetGraphId(block_id_); return x; } @@ -245,6 +246,7 @@ class Graph { "The OpDesc used to create operator node is null.")); auto *x = AddNode(new ir::Node(op_desc)); x->SetId(num_node_created_++); + x->SetGraphId(block_id_); return x; } @@ -263,6 +265,7 @@ class Graph { num_node_created_); auto *x = AddNode(new ir::Node(name, ir::Node::Type::kVariable, block_id_)); x->SetId(num_node_created_++); + x->SetGraphId(block_id_); return x; } @@ -276,6 +279,7 @@ class Graph { } auto *x = AddNode(new ir::Node(name, type, block_id_)); x->SetId(num_node_created_++); + x->SetGraphId(block_id_); return x; } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index d7d866fa98bb5895e4f3175e227f7b3c2ce869b6..164a13d1560f4d0008c2bdb5a56d8ad6f875157b 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -918,6 +918,36 @@ PDNode *patterns::ConvActivation::operator()( return activation_out_var; } +PDNode *patterns::ElementwiseActivation::operator()( + paddle::framework::ir::PDNode *elementwise_a, + const std::string &elementwise_type, const std::string &activation_type) { + // Create Operators + elementwise_a->assert_is_op_input(elementwise_type, "X"); + auto *elementwise_op = + pattern->NewNode(elementwise_repr())->assert_is_op(elementwise_type); + auto *activation_op = + pattern->NewNode(activation_repr())->assert_is_op(activation_type); + // Create variables + auto *elementwise_b = pattern->NewNode(elementwise_b_repr()) + ->AsInput() + ->assert_is_op_input(elementwise_type, "Y"); + // intermediate variable, will be removed in the IR after fuse. + auto *elementwise_out_var = + pattern->NewNode(elementwise_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op(elementwise_type) + ->assert_is_op_input(activation_type); + // output + auto *activation_out_var = pattern->NewNode(activation_out_repr()) + ->AsOutput() + ->assert_is_op_output(activation_type); + + elementwise_op->LinksFrom({elementwise_a, elementwise_b}) + .LinksTo({elementwise_out_var}); + activation_op->LinksFrom({elementwise_out_var}).LinksTo({activation_out_var}); + return activation_out_var; +} + PDNode *patterns::SeqConvEltAddRelu::operator()( paddle::framework::ir::PDNode *seqconv_input) { // Create Operators @@ -2022,18 +2052,19 @@ PDNode *patterns::Pool::operator()() { return output_var; } -PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) { - auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr()) - ->assert_is_op("elementwise_add"); +PDNode *patterns::Elementwise::operator()(PDNode *x_var, PDNode *y_var, + const std::string elementwise_type) { + auto elementwise_op = + pattern->NewNode(elementwise_op_repr())->assert_is_op(elementwise_type); - x_var->AsInput()->assert_is_op_input("elementwise_add", "X"); - y_var->AsInput()->assert_is_op_input("elementwise_add", "Y"); - auto out_var = pattern->NewNode(elementwise_add_out_repr()) + x_var->AsInput()->assert_is_op_input(elementwise_type, "X"); + y_var->AsInput()->assert_is_op_input(elementwise_type, "Y"); + auto out_var = pattern->NewNode(elementwise_out_repr()) ->AsOutput() - ->assert_is_op_output("elementwise_add", "Out"); + ->assert_is_op_output(elementwise_type, "Out"); - elementwise_add_op->LinksFrom({x_var, y_var}); - elementwise_add_op->LinksTo({out_var}); + elementwise_op->LinksFrom({x_var, y_var}); + elementwise_op->LinksTo({out_var}); 
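Because patterns::Elementwise now takes the op type as a runtime argument, matching a different element-wise op only changes one string. The fragment below is a hypothetical variant of the matcher setup, mirroring the elementwise_add calls that appear in FuseConvAsX later in this patch but targeting elementwise_mul; it assumes the surrounding pass already provides pattern and name_scope and is not standalone code.

// Hypothetical conv + elementwise_mul matcher; every call mirrors the
// elementwise_add usage in FuseConvAsX, only the op-type string changes.
patterns::Conv conv_pattern{pattern, name_scope};
auto conv_output = conv_pattern();

patterns::Elementwise elementwise_pattern{pattern, name_scope};
elementwise_pattern(
    conv_output, pattern->NewNode(elementwise_pattern.elementwise_y_repr()),
    "elementwise_mul");
conv_output->AsIntermediate();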
return out_var; } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 0f21906d08d0e4fc8a54472ab40ceb08df9d1949..17c70ace301d39db6fcf14d01c11baab0dc7d403 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -487,6 +487,28 @@ struct ConvActivation : public PatternBase { PATTERN_DECL_NODE(activation_out); }; +// Elementwise with Activation +// op: elementwise + activation +// named nodes: +// elementwise_a, elementwise_b, +// elementwise_out, elementwise, +// activation_out, activation +struct ElementwiseActivation : public PatternBase { + ElementwiseActivation(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "elementwise_add_activation") {} + + PDNode* operator()(PDNode* elementwise_a, const std::string& elementwise_type, + const std::string& activation_type); + + // declare operator node's name + PATTERN_DECL_NODE(elementwise); + PATTERN_DECL_NODE(activation); + // declare variable node's name + PATTERN_DECL_NODE(elementwise_b); + PATTERN_DECL_NODE(elementwise_out); + PATTERN_DECL_NODE(activation_out); +}; + // SEQCONV with Elementwise_Add ReLU // op: seqconv + elementwise_add + relu // named nodes: @@ -994,20 +1016,20 @@ struct Pool : public PatternBase { PATTERN_DECL_NODE(pool_output); }; -// ElementwiseAdd used in residual connections. -// y_var is used and convolution output. -// The operator is removed, when residual -// connection fusion is on. -struct ElementwiseAdd : public PatternBase { - ElementwiseAdd(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "elementwise_add") {} +// Elementwise ops +// Forward pass for element-wise operators (add, mul) +// elementwise_mul_out is the result of the operator +struct Elementwise : public PatternBase { + Elementwise(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "elementwise") {} - PDNode* operator()(PDNode* x_var, PDNode* y_var); + PDNode* operator()(PDNode* x_var, PDNode* y_var, + const std::string elementwise_type); - PATTERN_DECL_NODE(elementwise_add_op); - PATTERN_DECL_NODE(elementwise_add_x); - PATTERN_DECL_NODE(elementwise_add_y); - PATTERN_DECL_NODE(elementwise_add_out); + PATTERN_DECL_NODE(elementwise_op); + PATTERN_DECL_NODE(elementwise_x); + PATTERN_DECL_NODE(elementwise_y); + PATTERN_DECL_NODE(elementwise_out); }; // Transpose op diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc index 1b2a62695fb135925d43a3341aaacdf956da8da3..9fc6de3c8c1725707edd9f3b9f8de87706c16cc9 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc @@ -73,8 +73,10 @@ static void ShareVarInfoToCinnLaunch( varinfo_maps.at(cinn_launch_op->GetScopeIdx()); // collect all MemOptVarInfos of external variables - // that would be eager deleted after the cinn_launch subgraph executed, - // and store them as attribute of the subgraph + // that were eager deleted after the cinn_launch subgraph executed, + // and we will delete them in advance among eager_deletion_ops + // inside cinn_launch subgraph, so store them as attribute of the subgraph + // to pass to the inner eager_deletion_ops. 
for (const auto& var_name : vars_to_delete) { auto it = src_varinfo_map.find(var_name); PADDLE_ENFORCE_NE(it, src_varinfo_map.end(), @@ -82,6 +84,8 @@ static void ShareVarInfoToCinnLaunch( "MemOptVarInfo of var[%s] not found", var_name)); dst_varinfo_map.emplace(var_name, it->second); } + // skip running of the followed eager_deletion_op + followed_eager_deletion_op->SetSkipRunning(true); } static void TakeVarInfoFromMainGraph( diff --git a/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc b/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..4aa59d9196b1b4d73fffa8f1b2a9bba08d6091be --- /dev/null +++ b/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc @@ -0,0 +1,149 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/mixed_precision_configure_pass.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { + +void MixedPrecisionConfigurePass::InsertCastOps( + Graph* graph, const StringSet& blacklist) const { + VLOG(3) << "Insert the cast op before and after the kernel that does not " + "supports fp16 precision"; + + auto update_cast_desc = [&]( + framework::OpDesc& desc, const std::string& x_name, + const std::string& out_name, const int in_dtype, const int out_dtype) { + desc.SetType("cast"); + desc.SetInput("X", {x_name}); + desc.SetOutput("Out", {out_name}); + desc.SetAttr("in_dtype", in_dtype); + desc.SetAttr("out_dtype", out_dtype); + desc.SetAttr("use_mkldnn", false); + desc.SetAttr("with_quant_attr", false); + desc.Flush(); + }; + + auto cast_input = [&](Graph* graph, Node* op_node, + const StringSet& cast_list) { + auto inlinks = op_node->inputs; + for (auto* pre_node : inlinks) { + if (pre_node->IsVar()) { + const auto is_persistable = pre_node->Var()->Persistable(); + const auto is_float = + pre_node->Var()->GetDataType() == proto::VarType::FP16 || + pre_node->Var()->GetDataType() == proto::VarType::FP32 || + pre_node->Var()->GetDataType() == proto::VarType::FP64; + if (!is_persistable && is_float) { + int suffix = 0; + for (auto* pre_node_input : pre_node->inputs) { + if (!pre_node_input->IsOp()) continue; + const auto& type = pre_node_input->Op()->Type(); + if (!cast_list.count(type) && type != "cast") { + std::string old_name = pre_node->Name(); + std::string new_name = + old_name + "_cast.tmp_" + std::to_string(suffix); + suffix++; + + framework::OpDesc new_op_desc(op_node->Op()->Block()); + // 4 for fp16, 5 for fp32 + update_cast_desc(new_op_desc, old_name, new_name, 4, 5); + auto* new_op = graph->CreateOpNode(&new_op_desc); + + VarDesc out_var(new_name); + out_var.SetPersistable(false); + auto* node_var = graph->CreateVarNode(&out_var); + + op_node->Op()->RenameInput(old_name, new_name); + IR_NODE_LINK_TO(pre_node, new_op); + IR_NODE_LINK_TO(new_op, 
node_var); + IR_NODE_LINK_TO(node_var, op_node); + } + } + } + } + } + }; + + auto cast_output = [&](Graph* graph, Node* op_node, + const StringSet& cast_list) { + auto outlinks = op_node->outputs; + for (auto* next_node : outlinks) { + if (next_node->IsVar()) { + const auto is_persistable = next_node->Var()->Persistable(); + const auto is_float = + next_node->Var()->GetDataType() == proto::VarType::FP16 || + next_node->Var()->GetDataType() == proto::VarType::FP32 || + next_node->Var()->GetDataType() == proto::VarType::FP64; + if (!is_persistable && is_float) { + int suffix = 0; + for (auto* next_node_output : next_node->outputs) { + if (!next_node_output->IsOp()) continue; + + const auto& type = next_node_output->Op()->Type(); + if (!cast_list.count(type) && type != "cast") { + std::string old_name = next_node->Name(); + std::string new_name = + old_name + "_cast.tmp_" + std::to_string(suffix); + suffix++; + + framework::OpDesc new_op_desc(op_node->Op()->Block()); + // 4 for fp16, 5 for fp32 + update_cast_desc(new_op_desc, old_name, new_name, 5, 4); + auto* new_op = graph->CreateOpNode(&new_op_desc); + + VarDesc out_var(new_name); + out_var.SetPersistable(false); + auto* node_var = graph->CreateVarNode(&out_var); + + next_node_output->Op()->RenameInput(old_name, new_name); + IR_NODE_LINK_TO(next_node, new_op); + IR_NODE_LINK_TO(new_op, node_var); + IR_NODE_LINK_TO(node_var, next_node_output); + } + } + } + } + } + }; + + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp() || op_node->Op()->Type() == "feed" || + op_node->Op()->Type() == "fetch") + continue; + + const auto& type = op_node->Op()->Type(); + if (blacklist.count(type)) { + cast_input(graph, op_node, blacklist); + cast_output(graph, op_node, blacklist); + } + } +} + +void MixedPrecisionConfigurePass::ApplyImpl(Graph* graph) const { + const auto blacklist = + Get>("gpu_fp16_disabled_op_types"); + InsertCastOps(graph, blacklist); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(mixed_precision_configure_pass, + paddle::framework::ir::MixedPrecisionConfigurePass); diff --git a/paddle/fluid/framework/ir/mixed_precision_configure_pass.h b/paddle/fluid/framework/ir/mixed_precision_configure_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..fc5a612ecb833d2a5117a2dab58747d21226df8d --- /dev/null +++ b/paddle/fluid/framework/ir/mixed_precision_configure_pass.h @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
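Taken together, cast_input and cast_output bracket every blacklisted (fp16-unsupported) op with casts: inputs are converted with in_dtype 4 and out_dtype 5 (fp16 to fp32, per the comments above), and outputs get the reverse pair so downstream consumers keep reading fp16. The sketch below models that rewrite on a toy op list; the Op struct, WrapWithCasts, and the softmax example are assumptions for illustration, not the pass's actual data structures.

#include <iostream>
#include <string>
#include <vector>

// Toy op description: type plus named input/output variables.
struct Op {
  std::string type;
  std::vector<std::string> inputs, outputs;
};

// Wrap one blacklisted op with fp16->fp32 casts on its inputs and
// fp32->fp16 casts on its outputs (dtype codes: 4 = fp16, 5 = fp32).
std::vector<Op> WrapWithCasts(const Op& op) {
  std::vector<Op> result;
  Op rewritten = op;
  int suffix = 0;
  for (auto& in : rewritten.inputs) {
    std::string casted = in + "_cast.tmp_" + std::to_string(suffix++);
    result.push_back({"cast(in_dtype=4, out_dtype=5)", {in}, {casted}});
    in = casted;  // the blacklisted op now reads the fp32 copy
  }
  result.push_back(rewritten);
  suffix = 0;
  for (auto& out : rewritten.outputs) {
    // consumers of `out` would be rewired to read the fp16 cast result
    result.push_back({"cast(in_dtype=5, out_dtype=4)", {out},
                      {out + "_cast.tmp_" + std::to_string(suffix++)}});
  }
  return result;
}

int main() {
  Op softmax{"softmax", {"x"}, {"y"}};  // pretend softmax is on the fp16 blacklist
  for (const auto& op : WrapWithCasts(softmax)) {
    std::cout << op.type << " : " << op.inputs[0] << " -> " << op.outputs[0] << "\n";
  }
  return 0;
}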
+ +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +using StringSet = std::unordered_set; + +class MixedPrecisionConfigurePass : public FusePassBase { + public: + MixedPrecisionConfigurePass() = default; + virtual ~MixedPrecisionConfigurePass() {} + + protected: + void ApplyImpl(Graph* graph) const override; + + private: + void InsertCastOps(Graph* graph, const StringSet& blacklist) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc index c537d05738529dcb885e86cbcabf4405fd75270b..fc2758c27345032c1ad0831b4ee0016fa84b3f5c 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc @@ -22,6 +22,7 @@ #include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/string/pretty_log.h" namespace paddle { namespace framework { @@ -117,7 +118,7 @@ ResidualConnectionMKLDNNFusePass::ResidualConnectionMKLDNNFusePass() { .IsType>() .End() .AddAttr("data_format") - .IsStringIn({"NCHW", "AnyLayout"}) + .IsStringIn({"NHWC", "NCHW", "AnyLayout"}) .End(); AddOpCompat(OpCompat("elementwise_add")) @@ -135,226 +136,138 @@ ResidualConnectionMKLDNNFusePass::ResidualConnectionMKLDNNFusePass() { .End(); } -ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::IdentityFuseHandle( - const ResidualConnectionMKLDNNFusePass::CanFuseFunc& can_fuse_func, - const ResidualConnectionMKLDNNFusePass::IdentityConvFunc& - get_node_from_conv_op, - const ResidualConnectionMKLDNNFusePass::IdentityElementwiseAddFunc& - get_node_from_elementwise_add_op, - const ResidualConnectionMKLDNNFusePass* pass) - : fusion_stats{std::make_shared(0)}, - can_fuse_func{can_fuse_func}, - get_node_from_conv_op{get_node_from_conv_op}, - get_node_from_elementwise_add_op{get_node_from_elementwise_add_op}, - pass_{pass} {} - -void ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::operator()( - const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { - Node* conv_op; - Node* conv_input; - Node* conv_filter; - Node* conv_output; - - Node* elementwise_add_op; - Node* elementwise_add_identity; - Node* elementwise_add_out; - - std::tie(conv_op, conv_input, conv_filter, conv_output) = - get_node_from_conv_op(subgraph); - std::tie(elementwise_add_op, elementwise_add_identity, elementwise_add_out) = - get_node_from_elementwise_add_op(subgraph); - - if (!can_fuse_func(conv_op, elementwise_add_op)) return; - - if (!IsReachable(graph, elementwise_add_identity, conv_output)) return; - - if (HasFusedActivation(conv_op)) return; - - if (!pass_->IsCompat(subgraph, graph)) { - LOG(WARNING) - << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; - return; - } +GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX( + const std::string& name_scope, + const GraphWithStats& graph_with_stats) const { + GraphPatternDetector gpd; + auto pattern = gpd.mutable_pattern(); - conv_op->Op()->SetInput("ResidualData", {elementwise_add_identity->Name()}); - conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); - conv_op->Op()->SetAttr("fuse_residual_connection", true); + patterns::Conv conv_pattern{pattern, name_scope}; + auto conv_output = conv_pattern(); - GraphSafeRemoveNodes(graph, 
{conv_output, elementwise_add_op}); + patterns::Elementwise elementwise_pattern{pattern, name_scope}; + elementwise_pattern( + conv_output, pattern->NewNode(elementwise_pattern.elementwise_y_repr()), + "elementwise_add"); + conv_output->AsIntermediate(); - IR_NODE_LINK_TO(elementwise_add_identity, conv_op); - IR_NODE_LINK_TO(conv_op, elementwise_add_out); + int found_conv_as_x_count = 0; - (*fusion_stats)++; -} + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern); -ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::ProjectionFuseHandle( - const ResidualConnectionMKLDNNFusePass::CanFuseFunc& can_fuse_func, - const ResidualConnectionMKLDNNFusePass::ProjectionConvFunc& - get_node_from_conv_x_op, - const ResidualConnectionMKLDNNFusePass::ProjectionConvFunc& - get_node_from_conv_y_op, - const ResidualConnectionMKLDNNFusePass::ProjectionElementwiseAddFunc& - get_node_from_elementwise_add_op, - const ResidualConnectionMKLDNNFusePass* pass) - : fusion_stats{std::make_shared(0)}, - can_fuse_func{can_fuse_func}, - get_node_from_conv_x_op{get_node_from_conv_x_op}, - get_node_from_conv_y_op{get_node_from_conv_y_op}, - get_node_from_elementwise_add_op{get_node_from_elementwise_add_op}, - pass_{pass} {} - -void ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::operator()( - const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { - Node* conv_x_op; - Node* conv_x_input; - Node* conv_x_filter; - Node* conv_x_output; - - Node* conv_y_op; - Node* conv_y_input; - Node* conv_y_filter; - Node* conv_y_output; - - Node* elementwise_add_op; - Node* elementwise_add_out; - - if (!pass_->IsCompat(subgraph, graph)) { - LOG(WARNING) - << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; - return; - } + GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_identity, elementwise_y, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_pattern); - std::tie(conv_x_op, conv_x_input, conv_x_filter, conv_x_output) = - get_node_from_conv_x_op(subgraph); - std::tie(conv_y_op, conv_y_input, conv_y_filter, conv_y_output) = - get_node_from_conv_y_op(subgraph); - std::tie(elementwise_add_op, elementwise_add_out) = - get_node_from_elementwise_add_op(subgraph); - - if (!can_fuse_func(conv_x_op, elementwise_add_op)) return; - if (!can_fuse_func(conv_y_op, elementwise_add_op)) return; - - Node* projection_node; - Node* residual_conv_op; - Node* residual_conv_output; - - if (IsReachable(graph, conv_x_input, conv_y_output)) { - projection_node = conv_x_output; - residual_conv_op = conv_y_op; - residual_conv_output = conv_y_output; - } else if (IsReachable(graph, conv_y_input, conv_x_output)) { - projection_node = conv_y_output; - residual_conv_op = conv_x_op; - residual_conv_output = conv_x_output; - } else { - return; - } + if (FindFuseOption(*conv_op, *elementwise_op) != FUSE_MKLDNN) return; - if (HasFusedActivation(residual_conv_op)) return; + if (!IsReachable(g, elementwise_identity, conv_output)) return; - residual_conv_op->Op()->SetInput("ResidualData", {projection_node->Name()}); - residual_conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); + if (HasFusedActivation(conv_op)) 
return; - residual_conv_op->Op()->SetAttr("fuse_residual_connection", true); + if (!IsCompat(subgraph, g)) { + LOG(WARNING) + << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; + return; + } - GraphSafeRemoveNodes(graph, {residual_conv_output, elementwise_add_op}); + conv_op->Op()->SetInput("ResidualData", {elementwise_identity->Name()}); + conv_op->Op()->SetOutput("Output", {elementwise_out->Name()}); + conv_op->Op()->SetAttr("fuse_residual_connection", true); - IR_NODE_LINK_TO(projection_node, residual_conv_op); - IR_NODE_LINK_TO(residual_conv_op, elementwise_add_out); + GraphSafeRemoveNodes(g, {conv_output, elementwise_op}); - (*fusion_stats)++; -} + IR_NODE_LINK_TO(elementwise_identity, conv_op); + IR_NODE_LINK_TO(conv_op, elementwise_out); -std::tuple -ResidualConnectionMKLDNNFusePass::GetNodesFromConv( - const patterns::Conv& conv_pattern, - const GraphPatternDetector::subgraph_t& subgraph) const { - GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern); - GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern); - GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); - GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern); + found_conv_as_x_count++; + }; + + gpd(graph_with_stats.first, handler); + if (!Has("disable_logs") || !Get("disable_logs")) { + std::stringstream msg_ss; + msg_ss << "--- Fused " << found_conv_as_x_count + << " conv (as x) + elementwise_add patterns"; + paddle::string::PrettyLogDetail(msg_ss.str().c_str()); + } - return std::make_tuple(conv_op, conv_input, conv_filter, conv_output); + return std::make_pair(graph_with_stats.first, + found_conv_as_x_count + graph_with_stats.second); } -GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX( +GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY( const std::string& name_scope, const GraphWithStats& graph_with_stats) const { - ir::Graph* graph; - int stats; - - std::tie(graph, stats) = graph_with_stats; - GraphPatternDetector gpd; auto pattern = gpd.mutable_pattern(); patterns::Conv conv_pattern{pattern, name_scope}; auto conv_output = conv_pattern(); - patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope}; - elementwise_add_pattern( - conv_output, - pattern->NewNode(elementwise_add_pattern.elementwise_add_y_repr())); + patterns::Elementwise elementwise_pattern{pattern, name_scope}; + elementwise_pattern( + pattern->NewNode(elementwise_pattern.elementwise_x_repr()), conv_output, + "elementwise_add"); conv_output->AsIntermediate(); - auto get_node_from_elementwise_add = [&elementwise_add_pattern]( - const GraphPatternDetector::subgraph_t& subgraph) - -> std::tuple { - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_y, elementwise_add_y, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); - - return std::make_tuple(elementwise_add_op, elementwise_add_y, - elementwise_add_out); - }; - - return ExecuteHandleOnGraph( - &gpd, graph_with_stats, - [this, &conv_pattern](const GraphPatternDetector::subgraph_t& subgraph) { - return GetNodesFromConv(conv_pattern, subgraph); - }, - get_node_from_elementwise_add, this); -} + int found_conv_as_y_count = 0; -GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY( - const std::string& name_scope, - const GraphWithStats& graph_with_stats) const { - GraphPatternDetector gpd; - auto pattern = gpd.mutable_pattern(); + auto handler = [&](const 
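For context, the rewritten FuseConvAsX above drops the old IdentityFuseHandle functor in favor of a lambda handler registered with the GraphPatternDetector; the lambda captures a local fusion counter and the detector invokes it once per matched subgraph. A minimal standalone sketch of that callback-with-captured-counter pattern (toy Detector/Match types, not Paddle's GraphPatternDetector):

#include <functional>
#include <iostream>
#include <vector>

// Toy stand-ins for the pattern detector and one matched subgraph.
struct Match { int conv_id; int add_id; };

struct Detector {
  std::vector<Match> matches;
  // Like gpd(graph, handler): call the handler once per matched subgraph.
  void operator()(const std::function<void(const Match&)>& handler) const {
    for (const auto& m : matches) handler(m);
  }
};

int main() {
  Detector gpd{{{0, 1}, {2, 3}, {4, 5}}};

  int found_count = 0;  // captured by reference, like found_conv_as_x_count
  auto handler = [&](const Match& m) {
    if (m.conv_id < 0) return;  // early-return guards (IsCompat, IsReachable, ...)
    ++found_count;
  };

  gpd(handler);
  std::cout << "--- Fused " << found_count << " patterns\n";
}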
GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern); - patterns::Conv conv_pattern{pattern, name_scope}; - auto conv_output = conv_pattern(); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_x, elementwise_x, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_pattern); - patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope}; - elementwise_add_pattern( - pattern->NewNode(elementwise_add_pattern.elementwise_add_x_repr()), - conv_output); - conv_output->AsIntermediate(); + if (FindFuseOption(*conv_op, *elementwise_op) != FUSE_MKLDNN) return; + + if (!IsReachable(g, elementwise_x, conv_output)) return; + + if (HasFusedActivation(conv_op)) return; + + if (!IsCompat(subgraph, g)) { + LOG(WARNING) + << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; + return; + } + + conv_op->Op()->SetInput("ResidualData", {elementwise_x->Name()}); + conv_op->Op()->SetOutput("Output", {elementwise_out->Name()}); + conv_op->Op()->SetAttr("fuse_residual_connection", true); + + GraphSafeRemoveNodes(g, {conv_output, elementwise_op}); + + IR_NODE_LINK_TO(elementwise_x, conv_op); + IR_NODE_LINK_TO(conv_op, elementwise_out); + + found_conv_as_y_count++; + }; + + gpd(graph_with_stats.first, handler); + if (!Has("disable_logs") || !Get("disable_logs")) { + std::stringstream msg_ss; + msg_ss << "--- Fused " << found_conv_as_y_count + << " conv (as y) + elementwise_add patterns"; + paddle::string::PrettyLogDetail(msg_ss.str().c_str()); + } - auto get_node_from_elementwise_add = [&elementwise_add_pattern]( - const GraphPatternDetector::subgraph_t& subgraph) - -> std::tuple { - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); - - return std::make_tuple(elementwise_add_op, elementwise_add_x, - elementwise_add_out); - }; - - return ExecuteHandleOnGraph( - &gpd, graph_with_stats, - [this, &conv_pattern](const GraphPatternDetector::subgraph_t& subgraph) { - return GetNodesFromConv(conv_pattern, subgraph); - }, - get_node_from_elementwise_add, this); + return std::make_pair(graph_with_stats.first, + found_conv_as_y_count + graph_with_stats.second); } GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( @@ -369,44 +282,89 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( patterns::Conv conv_y_pattern{pattern, name_scope}; auto conv_y_output = conv_y_pattern(); - patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope}; - elementwise_add_pattern(conv_x_output, conv_y_output); + patterns::Elementwise elementwise_pattern{pattern, name_scope}; + elementwise_pattern(conv_x_output, conv_y_output, "elementwise_add"); conv_x_output->AsIntermediate(); conv_y_output->AsIntermediate(); - auto get_node_from_elementwise_add = [&elementwise_add_pattern]( - const GraphPatternDetector::subgraph_t& subgraph) - -> std::tuple { - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - 
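Both handlers refuse to fuse unless IsReachable(g, elementwise_input, conv_output) holds, i.e. the residual operand must lie on a path that reaches the convolution output, otherwise rewiring the graph would break its topology. A minimal reachability check over a toy adjacency-list graph (plain BFS; Paddle's IsReachable walks ir::Graph nodes instead):

#include <iostream>
#include <queue>
#include <vector>

// BFS reachability on a toy directed graph given as an adjacency list.
bool IsReachable(const std::vector<std::vector<int>>& adj, int from, int to) {
  std::vector<bool> visited(adj.size(), false);
  std::queue<int> q;
  q.push(from);
  visited[from] = true;
  while (!q.empty()) {
    int cur = q.front();
    q.pop();
    if (cur == to) return true;
    for (int next : adj[cur]) {
      if (!visited[next]) {
        visited[next] = true;
        q.push(next);
      }
    }
  }
  return false;
}

int main() {
  // 0 -> 1 -> 2, while 3 is disconnected.
  std::vector<std::vector<int>> adj = {{1}, {2}, {}, {}};
  std::cout << IsReachable(adj, 0, 2) << "\n";  // 1
  std::cout << IsReachable(adj, 0, 3) << "\n";  // 0
}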
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); - - return std::make_tuple(elementwise_add_op, elementwise_add_out); - }; - - return ExecuteHandleOnGraph( - &gpd, graph_with_stats, - [this, - &conv_x_pattern](const GraphPatternDetector::subgraph_t& subgraph) { - return GetNodesFromConv(conv_x_pattern, subgraph); - }, - [this, - &conv_y_pattern](const GraphPatternDetector::subgraph_t& subgraph) { - return GetNodesFromConv(conv_y_pattern, subgraph); - }, - get_node_from_elementwise_add, this); + int found_projection_conv_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(conv_x_op, conv_op, conv_x_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_x_input, conv_input, conv_x_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_x_filter, conv_filter, conv_x_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_x_output, conv_output, conv_x_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(conv_y_op, conv_op, conv_y_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_y_input, conv_input, conv_y_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_y_filter, conv_filter, conv_y_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_y_output, conv_output, conv_y_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_pattern); + + if (!IsCompat(subgraph, g)) { + LOG(WARNING) + << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; + return; + } + + if (FindFuseOption(*conv_x_op, *elementwise_op) != FUSE_MKLDNN) return; + if (FindFuseOption(*conv_y_op, *elementwise_op) != FUSE_MKLDNN) return; + + Node* projection_node; + Node* residual_conv_op; + Node* residual_conv_output; + if (IsReachable(g, conv_x_input, conv_y_output)) { + projection_node = conv_x_output; + residual_conv_op = conv_y_op; + residual_conv_output = conv_y_output; + } else if (IsReachable(g, conv_y_input, conv_x_output)) { + projection_node = conv_y_output; + residual_conv_op = conv_x_op; + residual_conv_output = conv_x_output; + } else { + return; + } + + if (HasFusedActivation(residual_conv_op)) return; + + residual_conv_op->Op()->SetInput("ResidualData", {projection_node->Name()}); + residual_conv_op->Op()->SetOutput("Output", {elementwise_out->Name()}); + + residual_conv_op->Op()->SetAttr("fuse_residual_connection", true); + + GraphSafeRemoveNodes(g, {residual_conv_output, elementwise_op}); + + IR_NODE_LINK_TO(projection_node, residual_conv_op); + IR_NODE_LINK_TO(residual_conv_op, elementwise_out); + + found_projection_conv_count++; + }; + + gpd(graph_with_stats.first, handler); + if (!Has("disable_logs") || !Get("disable_logs")) { + std::stringstream msg_ss; + msg_ss << "--- Fused " << found_projection_conv_count + << " projection conv (as y) + elementwise_add patterns"; + paddle::string::PrettyLogDetail(msg_ss.str().c_str()); + } + + return std::make_pair(graph_with_stats.first, + found_projection_conv_count + graph_with_stats.second); } -void ResidualConnectionMKLDNNFusePass::ApplyImpl(graph_ptr graph) const { +void ResidualConnectionMKLDNNFusePass::ApplyImpl(ir::Graph* graph) const { FusePassBase::Init(name_scope_, graph); - auto fused_graph_with_stats = FuseConvAsY( - name_scope_, - FuseConvAsX(name_scope_, - FuseProjectionConv(name_scope_, std::make_pair(graph, 0)))); + auto graph_with_stats = + FuseProjectionConv(name_scope_, std::make_pair(graph, 0)); + graph_with_stats = FuseConvAsX(name_scope_, graph_with_stats); + graph_with_stats = 
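The projection handler above rewires the surviving conv in place: the other branch's output becomes its ResidualData input, the elementwise output becomes its Output, fuse_residual_connection is set, and the now-dead intermediate nodes are removed. A standalone sketch of that in-place rewiring on a hypothetical op-description struct (ToyOpDesc is illustrative only, not Paddle's OpDesc):

#include <iostream>
#include <map>
#include <string>
#include <vector>

// Toy stand-in for an operator description: named input/output lists + attrs.
struct ToyOpDesc {
  std::map<std::string, std::vector<std::string>> inputs, outputs;
  std::map<std::string, bool> attrs;
};

int main() {
  ToyOpDesc conv;
  conv.inputs["Input"] = {"x"};
  conv.inputs["Filter"] = {"w"};
  conv.outputs["Output"] = {"conv_out"};

  // elementwise_add(conv_out, residual) -> add_out is folded into the conv:
  conv.inputs["ResidualData"] = {"residual"};  // absorb the second operand
  conv.outputs["Output"] = {"add_out"};        // conv now produces add's output
  conv.attrs["fuse_residual_connection"] = true;

  for (const auto& kv : conv.inputs)
    std::cout << kv.first << " -> " << kv.second[0] << "\n";
}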
FuseConvAsY(name_scope_, graph_with_stats); - LOG(INFO) << "Fused graph " << fused_graph_with_stats.second << "\n"; - AddStatis(fused_graph_with_stats.second); + AddStatis(graph_with_stats.second); } } // namespace ir } // namespace framework diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h index c83335da2f629c128fcf4819b2645ab1ef5eae42..c4351b382187d9062a059d013ddb237520645b6d 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h @@ -28,19 +28,9 @@ namespace paddle { namespace framework { namespace ir { -class Graph; -class GraphPatternDetector; -class Node; -namespace patterns { -struct Conv; -} // namespace patterns - -using graph_ptr = ir::Graph*; using GraphWithStats = std::pair; -void CorrectGraphEdges(Graph* graph, Node* from, Node* to); bool IsReachable(ir::Graph* graph, Node* from, Node* to); -paddle::optional HasBias(const Node& op, const std::string& bias_name); class ResidualConnectionMKLDNNFusePass : public FusePassBase { private: @@ -52,91 +42,13 @@ class ResidualConnectionMKLDNNFusePass : public FusePassBase { const std::string& name_scope, const GraphWithStats& graph_with_stats) const; - template - using GetNodeFunc = - std::function; - using IdentityConvFunc = GetNodeFunc>; - using IdentityElementwiseAddFunc = - GetNodeFunc>; - - using ProjectionConvFunc = IdentityConvFunc; - using ProjectionElementwiseAddFunc = GetNodeFunc>; - - using CanFuseFunc = std::function; - - std::tuple GetNodesFromConv( - const patterns::Conv& conv_pattern, - const GraphPatternDetector::subgraph_t& subgraph) const; - - std::tuple GetNodesFromProjectionConv( - const patterns::Conv& conv_pattern, - const GraphPatternDetector::subgraph_t& subgraph) const; - - template - GraphWithStats ExecuteHandleOnGraph(GraphPatternDetector* gpd, - const GraphWithStats& graph_with_stats, - OpFuncs&&... 
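The new ApplyImpl runs the three fusion stages sequentially and threads a (graph, fused-count) pair through them, so AddStatis receives the total across FuseProjectionConv, FuseConvAsX and FuseConvAsY. A minimal sketch of that accumulation style with toy stage functions (ToyGraph and FuseStage are illustrative names, not Paddle APIs):

#include <iostream>
#include <string>
#include <utility>

struct ToyGraph { std::string name; };             // stand-in for ir::Graph*
using GraphWithStats = std::pair<ToyGraph*, int>;  // same shape as in the pass

// Each stage fuses some patterns and adds its count to the running total.
GraphWithStats FuseStage(const GraphWithStats& in, int fused_here) {
  return {in.first, in.second + fused_here};
}

int main() {
  ToyGraph g{"main"};
  GraphWithStats stats = FuseStage({&g, 0}, 2);  // e.g. projection conv
  stats = FuseStage(stats, 3);                   // conv as x
  stats = FuseStage(stats, 1);                   // conv as y
  std::cout << "total fused: " << stats.second << "\n";  // 6
}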
op_funcs) const { - ir::Graph* graph; - int stats; - - std::tie(graph, stats) = graph_with_stats; - - auto can_fuse = [this](Node* op1, Node* op2) -> bool { - return this->FindFuseOption(*op1, *op2) == FUSE_MKLDNN; - }; - auto fuse_handle = HandleType{can_fuse, std::forward(op_funcs)...}; - - (*gpd)(graph, fuse_handle); - - return std::make_pair(graph, stats + fuse_handle.get_stats()); - } - - struct IdentityFuseHandle { - IdentityFuseHandle( - const CanFuseFunc& can_fuse_func, - const IdentityConvFunc& get_node_from_conv_op, - const IdentityElementwiseAddFunc& get_node_from_elementwise_add_op, - const ResidualConnectionMKLDNNFusePass* pass); - - void operator()(const GraphPatternDetector::subgraph_t& subgraph, - Graph* graph); - int get_stats() const { return *fusion_stats; } - - private: - std::shared_ptr fusion_stats; - CanFuseFunc can_fuse_func; - IdentityConvFunc get_node_from_conv_op; - IdentityElementwiseAddFunc get_node_from_elementwise_add_op; - const ResidualConnectionMKLDNNFusePass* pass_; - }; - - struct ProjectionFuseHandle { - ProjectionFuseHandle( - const CanFuseFunc& can_fuse_func, - const ProjectionConvFunc& get_node_from_conv_x_op, - const ProjectionConvFunc& get_node_from_conv_y_op, - const ProjectionElementwiseAddFunc& get_node_from_elementwise_add_op, - const ResidualConnectionMKLDNNFusePass* pass); - - void operator()(const GraphPatternDetector::subgraph_t& subgraph, - Graph* graph); - int get_stats() const { return *fusion_stats; } - - private: - std::shared_ptr fusion_stats; - CanFuseFunc can_fuse_func; - ProjectionConvFunc get_node_from_conv_x_op; - ProjectionConvFunc get_node_from_conv_y_op; - ProjectionElementwiseAddFunc get_node_from_elementwise_add_op; - const ResidualConnectionMKLDNNFusePass* pass_; - }; - public: ResidualConnectionMKLDNNFusePass(); virtual ~ResidualConnectionMKLDNNFusePass() {} protected: - void ApplyImpl(graph_ptr graph) const; + void ApplyImpl(ir::Graph* graph) const; + static bool HasFusedActivation(Node* conv_node) { return !(conv_node->Op() ->GetAttrIfExists("fuse_activation") diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index 371482b5343d638f005aa8e0700680b6ac00d6ec..f4358fb243f20bc9b024ef6b02768773fa995f45 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -807,74 +807,74 @@ void CPUQuantizePass::QuantizeMatmul(Graph* graph) const { PrettyLogDetail("--- quantized %d matmul ops", quantize_matmul_count); } -void CPUQuantizePass::QuantizeElementwiseAdd(Graph* graph) const { +void CPUQuantizePass::QuantizeElementwise( + Graph* graph, const std::string elementwise_type) const { GraphPatternDetector gpd; auto pattern = gpd.mutable_pattern(); - patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope_}; + patterns::Elementwise elementwise_pattern{pattern, name_scope_}; - elementwise_add_pattern( - pattern->NewNode(elementwise_add_pattern.elementwise_add_x_repr()), - pattern->NewNode(elementwise_add_pattern.elementwise_add_y_repr())); + elementwise_pattern( + pattern->NewNode(elementwise_pattern.elementwise_x_repr()), + pattern->NewNode(elementwise_pattern.elementwise_y_repr()), + elementwise_type); - int quantize_elementwise_add_count = 0; + int quantize_elementwise_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - VLOG(4) << "Quantize elementwise_add op"; - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - 
elementwise_add_pattern); + VLOG(4) << "Quantize " + elementwise_type + " op"; + GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op, + elementwise_pattern); // skip if should not be quantized - if (!platform::HasOpINT8DataType(elementwise_add_op->Op())) { - LogQuantizationDisabled(elementwise_add_op); + if (!platform::HasOpINT8DataType(elementwise_op->Op())) { + LogQuantizationDisabled(elementwise_op); return; } - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_y, elementwise_add_y, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_x, elementwise_x, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_y, elementwise_y, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_pattern); if (!AreScalesPresentForNodes( - {elementwise_add_x, elementwise_add_y, elementwise_add_out})) { - LogCannotQuantizeOp(elementwise_add_op, + {elementwise_x, elementwise_y, elementwise_out})) { + LogCannotQuantizeOp(elementwise_op, "No scale available for the operator"); return; } bool is_x_unsigned{false}, is_y_unsigned{false}; - auto input_x_scale = - GetScaleValueForNode(elementwise_add_x, &is_x_unsigned); - auto input_y_scale = - GetScaleValueForNode(elementwise_add_y, &is_y_unsigned); + auto input_x_scale = GetScaleValueForNode(elementwise_x, &is_x_unsigned); + auto input_y_scale = GetScaleValueForNode(elementwise_y, &is_y_unsigned); // TODO(sfraczek): add support for different signness if (is_x_unsigned != is_y_unsigned) { - LogCannotQuantizeOp(elementwise_add_op, - "ElementwiseAdd inputs must be of the same type."); + LogCannotQuantizeOp(elementwise_op, + "Elementwise inputs must be of the same type."); return; } - QuantizeInput(g, elementwise_add_op, elementwise_add_x, "X", input_x_scale, + QuantizeInput(g, elementwise_op, elementwise_x, "X", input_x_scale, is_x_unsigned, "Scale_x"); - QuantizeInput(g, elementwise_add_op, elementwise_add_y, "Y", input_y_scale, + QuantizeInput(g, elementwise_op, elementwise_y, "Y", input_y_scale, is_y_unsigned, "Scale_y"); bool is_output_unsigned{false}; auto output_scale = - GetScaleValueForNode(elementwise_add_out, &is_output_unsigned); + GetScaleValueForNode(elementwise_out, &is_output_unsigned); - DequantizeOutput(g, elementwise_add_op, elementwise_add_out, "Out", - output_scale, is_output_unsigned, "Scale_out"); + DequantizeOutput(g, elementwise_op, elementwise_out, "Out", output_scale, + is_output_unsigned, "Scale_out"); - ++quantize_elementwise_add_count; + ++quantize_elementwise_count; }; gpd(graph, handler); - AddStatis(quantize_elementwise_add_count); + AddStatis(quantize_elementwise_count); - PrettyLogDetail("--- quantized %d elementwise_add ops", - quantize_elementwise_add_count); + PrettyLogDetail("--- quantized %d %s ops", quantize_elementwise_count, + elementwise_type); } void CPUQuantizePass::QuantizeFusionGru(Graph* graph) const { @@ -1146,7 +1146,8 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { QuantizeFc(graph); QuantizeReshape(graph); QuantizeMatmul(graph); - QuantizeElementwiseAdd(graph); + QuantizeElementwise(graph, "elementwise_add"); + QuantizeElementwise(graph, "elementwise_mul"); QuantizeFusionGru(graph); QuantizeMultiGru(graph); QuantizeFusionLSTM(graph); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h 
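QuantizeElementwise requires scales for both inputs and the output and insists that both inputs share signedness; with a scale s, INT8 quantization maps x to round(x*s) clamped to [-128, 127] for signed data or [0, 255] for unsigned. A small standalone sketch of that scale arithmetic (illustrative only; the pass inserts quantize/dequantize ops into the graph rather than computing values):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

// Quantize a float with a given scale; the clamp range depends on signedness.
int32_t Quantize(float x, float scale, bool is_unsigned) {
  float lo = is_unsigned ? 0.f : -128.f;
  float hi = is_unsigned ? 255.f : 127.f;
  return static_cast<int32_t>(std::round(std::min(std::max(x * scale, lo), hi)));
}

float Dequantize(int32_t q, float scale) { return static_cast<float>(q) / scale; }

int main() {
  float scale = 127.f;                       // e.g. S8_MAX for data in [-1, 1]
  int32_t q = Quantize(0.5f, scale, false);  // signed path
  std::cout << q << " -> " << Dequantize(q, scale) << "\n";  // 64 -> ~0.504
}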
b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h index 412c4e40a01d50b73f72076f3a0424081d633247..3a286264e41ffe1c329ba3971d777ce4fbc05b5e 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h @@ -57,7 +57,8 @@ class CPUQuantizePass : public FusePassBase { void QuantizeTranspose(Graph* graph) const; void QuantizeReshape(Graph* graph) const; void QuantizeMatmul(Graph* graph) const; - void QuantizeElementwiseAdd(Graph* graph) const; + void QuantizeElementwise(Graph* graph, + const std::string elementwise_type) const; void QuantizeFusionGru(Graph* graph) const; void QuantizeMultiGru(Graph* graph) const; void QuantizeFusionLSTM(Graph* graph) const; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc index 889417b78c8641060b8ad89219749d8400558c6a..22000865948d629a5933ad0319e41dab71433fff 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -90,7 +90,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetAttr("Scale_x", 1.0f); op->SetAttr("Scale_y", 1.0f); op->SetAttr("Scale_out", 1.0f); - } else if (type == "elementwise_add") { + } else if (type == "elementwise_add" || type == "elementwise_mul") { op->SetInput("X", {inputs[0]}); if (inputs.size() > 1) op->SetInput("Y", {inputs[1]}); op->SetOutput("Out", {outputs[0]}); @@ -167,7 +167,8 @@ void CheckScales(const OpDesc* op, float scale, float shift) { scale); scale_names.push_back("Scale_in"); scale_names.push_back("Scale_out"); - } else if (type == "matmul" || type == "elementwise_add") { + } else if (type == "matmul" || type == "elementwise_add" || + type == "elementwise_mul") { scale_names.push_back("Scale_x"); scale_names.push_back("Scale_y"); scale_names.push_back("Scale_out"); @@ -546,46 +547,77 @@ TEST(CpuQuantizePass, matmul_not_quantized) { expected_operators, added_nodes, 1.0f); } -static const std::initializer_list variable_names_elementwise_add = - {"a", "b", "c", "d", "e", "f"}; +static const std::initializer_list variable_names_elementwise = { + "a", "b", "c", "d", "e", "f"}; -ProgramDesc BuildProgramDescElementwiseAdd() { +ProgramDesc BuildProgramDescElementwise(const std::string elementwise_type, + const std::string elementwise_name) { ProgramDesc prog; - for (auto& v : variable_names_elementwise_add) { + for (auto& v : variable_names_elementwise) { prog.MutableBlock(0)->Var(v); } SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true); SetOp(&prog, "dequantize", "Dequantize2", {"c"}, {"d"}, true); - SetOp(&prog, "elementwise_add", "ElementwiseAdd", {"b", "d"}, {"e"}, true, + SetOp(&prog, elementwise_type, elementwise_name, {"b", "d"}, {"e"}, true, "int8"); SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, "float32"); return prog; } -TEST(CpuQuantizePass, elementwise_add) { +void TestElementwise(const std::string elementwise_type, + const std::string elementwise_name) { // 2 Quant + 2 IN + 1 DeQuant + 1 OUT int added_nodes = 6; std::unordered_map expected_operators = { - {"elementwise_add", 1}, {"quantize", 2}, {"dequantize", 3}}; - MainTest(BuildProgramDescElementwiseAdd(), variable_names_elementwise_add, - expected_operators, added_nodes, SCALE * S8_MAX); + {elementwise_type, 1}, {"quantize", 2}, {"dequantize", 3}}; + MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name), + variable_names_elementwise, 
expected_operators, added_nodes, + SCALE * S8_MAX); } -TEST(CpuQuantizePass, elementwise_add_output_scale_missing) { +void TestElementwiseOutputScaleMissing(const std::string elementwise_type, + const std::string elementwise_name) { int added_nodes = 0; std::unordered_map expected_operators = { - {"elementwise_add", 1}, {"quantize", 0}, {"dequantize", 2}}; - MainTest(BuildProgramDescElementwiseAdd(), variable_names_elementwise_add, - expected_operators, added_nodes, 1.f, 1.f, "e"); + {elementwise_type, 1}, {"quantize", 0}, {"dequantize", 2}}; + MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name), + variable_names_elementwise, expected_operators, added_nodes, 1.f, + 1.f, "e"); } -TEST(CpuQuantizePass, elementwise_add_unsigned_and_signed_input) { +void TestElementwiseUnsignedAndSignedInput(const std::string elementwise_type, + const std::string elementwise_name) { int added_nodes = 0; std::unordered_map expected_operators = { - {"elementwise_add", 1}, {"quantize", 0}, {"dequantize", 2}}; - MainTest(BuildProgramDescElementwiseAdd(), variable_names_elementwise_add, - expected_operators, added_nodes, 1.f, 1.f, "", "b"); + {elementwise_type, 1}, {"quantize", 0}, {"dequantize", 2}}; + MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name), + variable_names_elementwise, expected_operators, added_nodes, 1.f, + 1.f, "", "b"); +} + +TEST(CpuQuantizePass, elementwise_add) { + TestElementwise("elementwise_add", "ElementwiseAdd"); +} + +TEST(CpuQuantizePass, elementwise_add_output_scale_missing) { + TestElementwiseOutputScaleMissing("elementwise_add", "ElementwiseAdd"); +} + +TEST(CpuQuantizePass, elementwise_add_unsigned_and_signed_input) { + TestElementwiseUnsignedAndSignedInput("elementwise_add", "ElementwiseAdd"); +} + +TEST(CpuQuantizePass, elementwise_mul) { + TestElementwise("elementwise_mul", "ElementwiseMul"); +} + +TEST(CpuQuantizePass, elementwise_mul_output_scale_missing) { + TestElementwiseOutputScaleMissing("elementwise_mul", "ElementwiseMul"); +} + +TEST(CpuQuantizePass, elementwise_mul_unsigned_and_signed_input) { + TestElementwiseUnsignedAndSignedInput("elementwise_mul", "ElementwiseMul"); } const std::vector churn_out_vars(ProgramDesc* prog, diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc index 5f74b61ee86aad10880f3a67d8250026a6e9ac18..3b883dac9782af8350b3e22d2954e21789a1a120 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc @@ -26,10 +26,10 @@ void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Marks operators which are to be quantized."; std::unordered_set supported_op_types = std::unordered_set( - {"concat", "conv2d", "depthwise_conv2d", "elementwise_add", "fc", - "matmul", "nearest_interp", "nearest_interp_v2", "pool2d", - "prior_box", "reshape2", "transpose2", "fusion_gru", "fusion_lstm", - "multi_gru", "slice"}); + {"concat", "conv2d", "depthwise_conv2d", "elementwise_add", + "elementwise_mul", "fc", "matmul", "nearest_interp", + "nearest_interp_v2", "pool2d", "prior_box", "reshape2", "transpose2", + "fusion_gru", "fusion_lstm", "multi_gru", "slice"}); const auto& excluded_ids_list = Get>("quantize_excluded_op_ids"); const auto& op_types_list = diff --git a/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc new file mode 100644 index 
0000000000000000000000000000000000000000..b7f7a8071d21413f45d86e98b8649a3aaba5d2f5 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc @@ -0,0 +1,145 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace framework { +namespace ir { + +using string::PrettyLogDetail; + +void ElementwiseActivationOneDNNPass::ApplyImpl(Graph *graph) const { + std::vector act_types = { + "relu", "tanh", "leaky_relu", "swish", "hardswish", "sqrt", + "abs", "clip", "gelu", "relu6", "sigmoid"}; + std::vector elt_types = {"elementwise_add", "elementwise_sub", + "elementwise_mul"}; + + for (const auto &elt_type : elt_types) + for (const auto &act_type : act_types) { + std::unordered_map attr_map; + + if (act_type == "swish") + attr_map.emplace("beta", "activation_alpha"); + else if (act_type == "relu6") + attr_map.emplace("threshold", "activation_alpha"); + else if (act_type == "clip") { + attr_map.emplace("min", "activation_alpha"); + attr_map.emplace("max", "activation_beta"); + } else { + attr_map.emplace("alpha", "activation_alpha"); + attr_map.emplace("beta", "activation_beta"); + } + FuseElementwiseAct(graph, elt_type, act_type, attr_map); + } +} + +void ElementwiseActivationOneDNNPass::FuseElementwiseAct( + Graph *graph, const std::string &elt_type, const std::string &act_type, + const std::unordered_map &attr_map) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + FusePassBase::Init("elementwise_act", graph); + + GraphPatternDetector gpd; + auto *elementwise_input = gpd.mutable_pattern() + ->NewNode(elt_type + "_act/elementwise_input") + ->AsInput() + ->assert_is_op_input(elt_type, "X"); + patterns::ElementwiseActivation elementwise_act_pattern(gpd.mutable_pattern(), + elt_type + "_act"); + elementwise_act_pattern(elementwise_input, elt_type, act_type); + + int found_elementwise_activation_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "Fuse " << elt_type << " with activation op."; + // Elementwise output + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_act_pattern); + // ACT output + GET_IR_NODE_FROM_SUBGRAPH(activation_out, activation_out, + elementwise_act_pattern); + // ops + GET_IR_NODE_FROM_SUBGRAPH(elementwise, elementwise, + elementwise_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(activation, activation, elementwise_act_pattern); + + auto *elementwise_op = elementwise->Op(); + + if (elementwise_op->HasAttr("use_mkldnn")) { + const std::string wo_elt_type = + "The " + elt_type; // Workaround for PP error message checking. 
+ PADDLE_ENFORCE_EQ( + BOOST_GET_CONST(bool, elementwise_op->GetAttr("use_mkldnn")), true, + platform::errors::PreconditionNotMet( + wo_elt_type + "+Act fusion may happen only when oneDNN library " + "is used.")); + } + + auto *activation_op = activation->Op(); + for (const auto &attr : attr_map) { + if (activation_op->HasAttr(attr.first)) { + elementwise_op->SetAttr(attr.second, + activation_op->GetAttr(attr.first)); + } + } + + if (act_type == "gelu" && activation_op->HasAttr("approximate") && + BOOST_GET_CONST(bool, activation_op->GetAttr("approximate"))) + elementwise_op->SetAttr("activation_type", std::string("gelu_tanh")); + else + elementwise_op->SetAttr("activation_type", act_type); + + elementwise_op->SetOutput("Out", {activation_out->Name()}); + + IR_OP_VAR_LINK(elementwise, activation_out); + GraphSafeRemoveNodes(g, {activation, elementwise_out}); + found_elementwise_activation_count++; + }; + + gpd(graph, handler); + AddStatis(found_elementwise_activation_count); + PrettyLogDetail("--- fused %d %s with %s activation", + found_elementwise_activation_count, elt_type, act_type); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(elt_act_mkldnn_fuse_pass, + paddle::framework::ir::ElementwiseActivationOneDNNPass); +REGISTER_PASS_CAPABILITY(elt_act_mkldnn_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("elementwise_add", 1) + .LE("elementwise_sub", 1) + .LE("elementwise_mul", 1) + .LE("relu", 0) + .LE("tanh", 0) + .LE("leaky_relu", 1) + .LE("swish", 0) + .LE("hard_swish", 0) + .LE("sqrt", 0) + .LE("abs", 0) + .LE("clip", 1) + .LE("gelu", 0) + .LE("relu6", 0) + .LE("sigmoid", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..b8b7d06a828508e9773301bfc602e01f9354eac4 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * \brief Fuse the Elementwise and activation operators into single + * OneDNN's Elementwise with post-op. 
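FuseElementwiseAct copies activation attributes onto the elementwise op under oneDNN's post-op names: swish's beta and relu6's threshold become activation_alpha, clip's min/max become activation_alpha/activation_beta, and gelu with approximate=true is recorded as gelu_tanh. A standalone sketch that mirrors the attr_map construction (plain std:: containers only; BuildAttrMap and PostOpName are illustrative helper names, not Paddle APIs):

#include <iostream>
#include <string>
#include <unordered_map>

// Map original activation attribute names to oneDNN post-op attribute names.
std::unordered_map<std::string, std::string> BuildAttrMap(const std::string& act) {
  std::unordered_map<std::string, std::string> m;
  if (act == "swish") {
    m.emplace("beta", "activation_alpha");
  } else if (act == "relu6") {
    m.emplace("threshold", "activation_alpha");
  } else if (act == "clip") {
    m.emplace("min", "activation_alpha");
    m.emplace("max", "activation_beta");
  } else {
    m.emplace("alpha", "activation_alpha");
    m.emplace("beta", "activation_beta");
  }
  return m;
}

// gelu is special-cased: the tanh approximation becomes a distinct post-op.
std::string PostOpName(const std::string& act, bool gelu_approximate) {
  return (act == "gelu" && gelu_approximate) ? "gelu_tanh" : act;
}

int main() {
  for (const auto& kv : BuildAttrMap("clip"))
    std::cout << kv.first << " -> " << kv.second << "\n";
  std::cout << PostOpName("gelu", true) << "\n";  // gelu_tanh
}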
+ */ +class ElementwiseActivationOneDNNPass : public FusePassBase { + public: + virtual ~ElementwiseActivationOneDNNPass() {} + + protected: + void ApplyImpl(Graph *graph) const override; + + void FuseElementwiseAct( + Graph *graph, const std::string &elt_types, const std::string &act_types, + const std::unordered_map &attr_map) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc index 11190309814e7c75777a6cddd7e4d24bfc7ba9e6..bf2cf58f970addf1dac9f4871ba4abe09c3c7b38 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc @@ -32,8 +32,9 @@ USE_OP(conv2d_transpose); USE_OP_DEVICE_KERNEL(conv2d_transpose, MKLDNN); USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); -USE_OP(gelu); +USE_OP_ITSELF(gelu); USE_OP_DEVICE_KERNEL(gelu, MKLDNN); +PD_DECLARE_ARG_MAPPING_FN(gelu); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc index d578ada0db00fed85f7b4f25f1483169c72c2c0b..7df957b2c0eca64bacd1b48065f37ddffec1770a 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc @@ -18,6 +18,7 @@ #include #include + #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/op_registry.h" @@ -25,12 +26,13 @@ USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); -USE_OP(leaky_relu); +USE_OP_ITSELF(leaky_relu); USE_OP_DEVICE_KERNEL(leaky_relu, MKLDNN); -USE_OP(gelu); +USE_OP_ITSELF(gelu); USE_OP_ITSELF(relu); -USE_OP(tanh); +USE_OP_ITSELF(tanh); USE_OP_DEVICE_KERNEL(tanh, MKLDNN); +PD_DECLARE_ARG_MAPPING_FN(gelu); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index 7e61d6ae4248b3f41fd950fcf80e0306bd0971bb..8c51c278d4872bd5b0b019223fb0e778df390732 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -125,6 +125,7 @@ class Node { // Only use this for auto parallel. // A node does not have original desc if the return is zero. uint64_t OriginalDescId() const { return original_desc_id_; } + int GraphId() const { return graph_id_; } bool IsOp() const { return type_ == Type::kOperation; } bool IsVar() const { return type_ == Type::kVariable; } @@ -246,10 +247,12 @@ class Node { // Store the original id of var desc or op desc. // Only use this for auto parallel. uint64_t original_desc_id_{0}; + int graph_id_{-1}; private: // ID can only set by a Graph. void SetId(int id) { id_ = id; } + void SetGraphId(int graph_id) { graph_id_ = graph_id; } // desc_order can only set by a Graph when constructing a Graph from a // BlockDesc. 
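The node.h change above adds a per-node graph id next to the existing node id and, like SetId, keeps SetGraphId private so only a Graph can assign it. A minimal sketch of that "only the owner may set the id" idiom using a friend class (ToyNode/ToyGraph are toy types, not ir::Node/ir::Graph):

#include <iostream>
#include <memory>
#include <vector>

class ToyGraph;  // forward declaration for the friend relationship

class ToyNode {
 public:
  int Id() const { return id_; }
  int GraphId() const { return graph_id_; }

 private:
  friend class ToyGraph;  // only the graph may assign ids
  void SetId(int id) { id_ = id; }
  void SetGraphId(int graph_id) { graph_id_ = graph_id; }

  int id_{-1};
  int graph_id_{-1};
};

class ToyGraph {
 public:
  explicit ToyGraph(int graph_id) : graph_id_(graph_id) {}
  ToyNode* CreateNode() {
    nodes_.push_back(std::make_unique<ToyNode>());
    nodes_.back()->SetId(static_cast<int>(nodes_.size()) - 1);
    nodes_.back()->SetGraphId(graph_id_);
    return nodes_.back().get();
  }

 private:
  int graph_id_;
  std::vector<std::unique_ptr<ToyNode>> nodes_;
};

int main() {
  ToyGraph g(7);
  ToyNode* n = g.CreateNode();
  std::cout << n->Id() << " in graph " << n->GraphId() << "\n";  // 0 in graph 7
}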
diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index ece48158586404cfe0c956fd66a20dc0db1ce96e..f30d1ea1b83dde65bdb703d511d0246fe4886113 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -41,6 +41,7 @@ void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc, void NaiveExecutor::Run() { #ifdef PADDLE_WITH_MKLDNN platform::AttachPointerHashToMKLDNNKey(this, place_); + platform::RegisterModelLayout(ops_, place_); #endif platform::ScopedFlushDenormal flush; for (auto &op : ops_) { diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index 219aae71127ed8963b4bfe4e8ee5e7259dbf7d02..7fe1852f7396cb8cebe4b83f4cc80a8023421351 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -31,14 +31,14 @@ USE_OP(slice); USE_OP(concat); USE_OP(matmul); USE_OP_ITSELF(elementwise_add); -USE_OP(sigmoid); -USE_OP(tanh); +USE_OP_ITSELF(sigmoid); +USE_OP_ITSELF(tanh); USE_OP(elementwise_mul); USE_OP(softmax_with_cross_entropy); USE_OP_ITSELF(reduce_mean); USE_OP_ITSELF(reduce_sum); USE_OP_ITSELF(reduce_sum_grad); -USE_OP(reduce_mean_grad); +USE_OP_ITSELF(reduce_mean_grad); USE_OP_ITSELF(reshape2_grad); USE_OP(softmax_with_cross_entropy_grad); USE_OP_ITSELF(elementwise_add_grad); @@ -47,8 +47,8 @@ USE_OP(square); USE_OP(transpose2_grad); USE_OP(concat_grad); USE_OP_ITSELF(elementwise_mul_grad); -USE_OP(sigmoid_grad); -USE_OP(tanh_grad); +USE_OP_ITSELF(sigmoid_grad); +USE_OP_ITSELF(tanh_grad); USE_OP(sum); USE_OP(slice_grad); USE_OP(lookup_table_grad); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index f8e30c1ee294ecf692e2992b6123232ba1c8bd7d..42fbeb5d29ce4ac3a1498704b1fff88570c9c092 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -628,10 +628,12 @@ std::vector ExecutionContext::MultiOutput( bool OpSupportGPU(const std::string& op_type) { // check in new Function kernel first + bool has_phi_kernel = false; auto& kernel_factory = phi::KernelFactory::Instance(); auto kernel_key_map = kernel_factory.SelectKernelMap(phi::TransToPhiKernelName(op_type)); for (auto& kernel : kernel_key_map) { + has_phi_kernel = true; if (platform::is_gpu_place(phi::TransToPhiPlace(kernel.first.backend()))) { return true; } @@ -639,12 +641,19 @@ bool OpSupportGPU(const std::string& op_type) { auto& all_kernels = OperatorWithKernel::AllOpKernels(); auto it = all_kernels.find(op_type); - if (it == all_kernels.end()) { - // All control operator must support GPU - return true; - } - for (auto& kern_pair : it->second) { - if (platform::is_gpu_place(kern_pair.first.place_)) { + if (it != all_kernels.end()) { + for (auto& kern_pair : it->second) { + if (platform::is_gpu_place(kern_pair.first.place_)) { + return true; + } + } + } else { + if (has_phi_kernel) { + // if has phi kernel, but not find phi gpu kernel and fluid gpu kernel, + // this op doesn't support GPU + return false; + } else { + // All control operator must support GPU return true; } } @@ -1456,7 +1465,8 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { kernel_iter = kernels.find(expected_kernel_key); } #endif -#ifdef PADDLE_WITH_XPU + +#if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) if (platform::is_xpu_place(expected_kernel_key.place_) && (kernel_iter == 
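The OpSupportGPU change just above distinguishes three cases: some phi or fluid kernel targets GPU (true); the op has kernels but none for GPU (false); or the op has no kernels at all, which is treated as a control op that must run everywhere (true). A standalone decision-table sketch of roughly that logic, with booleans standing in for the kernel-registry lookups:

#include <iostream>

// has_*_kernel: the op registered at least one kernel in that registry;
// *_gpu: at least one of those kernels targets a GPU place.
bool OpSupportGPU(bool has_phi_gpu, bool has_phi_kernel,
                  bool has_fluid_gpu, bool has_fluid_kernel) {
  if (has_phi_gpu || has_fluid_gpu) return true;         // found a GPU kernel
  if (has_phi_kernel || has_fluid_kernel) return false;  // kernels exist, none on GPU
  return true;  // no kernels at all: control ops must support every place
}

int main() {
  std::cout << OpSupportGPU(false, true, false, false) << "\n";   // 0: phi CPU only
  std::cout << OpSupportGPU(false, false, false, false) << "\n";  // 1: control op
}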
kernels.end() || !paddle::platform::is_xpu_support_op(type_, expected_kernel_key) || @@ -1470,17 +1480,36 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { #endif #ifdef PADDLE_WITH_XPU_KP - bool use_xpu_kp_kernel_rt = - FLAGS_run_kp_kernel && - paddle::platform::is_xpu_kp_support_op(type_, expected_kernel_key); - bool use_xpu_kp_kernel_debug = - paddle::platform::is_in_xpu_kpwhite_list(type_); - if (platform::is_xpu_place(expected_kernel_key.place_) && - (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug)) { - expected_kernel_key.library_type_ = LibraryType::kKP; - kernel_iter = kernels.find(expected_kernel_key); - VLOG(3) << "using XPU KP kernel: " << type_ - << ", using_kernel_key:" << expected_kernel_key; + if (paddle::platform::is_xpu_place(expected_kernel_key.place_)) { + bool use_xpu_kp_kernel_rt = + FLAGS_run_kp_kernel && + paddle::platform::is_xpu_kp_support_op(type_, expected_kernel_key); + bool use_xpu_kp_kernel_debug = + paddle::platform::is_in_xpu_kpwhite_list(type_); + if (use_xpu_kp_kernel_rt) { + VLOG(3) << "xpu_kp using rt mode "; + } + if (use_xpu_kp_kernel_debug) { + VLOG(3) << "xpu_kp using debug mode "; + } + bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug); + if (is_xpu_kp_support) { + expected_kernel_key.library_type_ = LibraryType::kKP; + kernel_iter = kernels.find(expected_kernel_key); + VLOG(3) << "using XPU KP kernel: " << type_ + << ", using_kernel_key:" << expected_kernel_key; + } + bool is_xpu_unsupport = + (!paddle::platform::is_xpu_support_op(type_, expected_kernel_key) || + paddle::platform::is_in_xpu_black_list(type_)); + if (!is_xpu_kp_support && + (kernel_iter == kernels.end() || is_xpu_unsupport)) { + VLOG(3) << "missing XPU kernel: " << type_ + << ", expected_kernel_key:" << expected_kernel_key + << ", fallbacking to CPU one!"; + expected_kernel_key.place_ = platform::CPUPlace(); + kernel_iter = kernels.find(expected_kernel_key); + } } #endif @@ -2083,16 +2112,25 @@ void OperatorWithKernel::BuildPhiKernelContext( auto* var = ins_vector[offset]; if (var->IsType()) { tensor_in = &(var->Get()); + pt_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in); } else if (var->IsType()) { tensor_in = &(var->Get()); + pt_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in); + } else if (var->IsType()) { + paddle::SmallVector tensor_vector; + auto& tensor_array = var->Get(); + for (auto& t : tensor_array) { + tensor_vector.emplace_back(&t); + } + pt_kernel_context->EmplaceBackInputsWithoutSetRange(tensor_vector); + end_idx += tensor_array.size() - 1; } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported input `%s` type when call pt kernel.", framework::ToTypeName(var->Type()))); } - - pt_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in); } + // Note: here cannot deal with vector input pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), i); } VLOG(4) << "Done inputs"; @@ -2120,22 +2158,33 @@ void OperatorWithKernel::BuildPhiKernelContext( for (size_t offset = 0; offset < outs_vector.size(); ++offset) { phi::TensorBase* tensor_out = nullptr; auto* var = outs_vector[offset]; - if (var) { if (var->template IsType()) { tensor_out = var->template GetMutable(); + pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); } else if (var->template IsType()) { tensor_out = var->template GetMutable(); + pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); + } else if (var->template IsType()) { + paddle::SmallVector tensor_vector; + auto* tensor_array = + 
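The reworked XPU branch in ChooseKernel first prefers a KP kernel when the op is in the KP support list (or in the KP whitelist in debug mode), and only then checks whether a usable plain XPU kernel exists, falling back to CPU when it does not. A standalone sketch of that selection order (plain enums and bools stand in for kernel keys and registries; ChooseXpuKernel is an illustrative name):

#include <iostream>

enum class Library { Plain, KP };
enum class Place { XPU, CPU };

struct KernelChoice { Place place; Library library; };

// Mirrors the order of the new logic: try KP first, then plain XPU,
// then the "fallbacking to CPU one!" path from the log message.
KernelChoice ChooseXpuKernel(bool run_kp_flag, bool kp_supports_op,
                             bool in_kp_whitelist, bool xpu_kernel_found,
                             bool op_supported_on_xpu) {
  bool use_kp = (run_kp_flag && kp_supports_op) || in_kp_whitelist;
  if (use_kp) return {Place::XPU, Library::KP};
  if (!xpu_kernel_found || !op_supported_on_xpu)
    return {Place::CPU, Library::Plain};
  return {Place::XPU, Library::Plain};
}

int main() {
  auto c = ChooseXpuKernel(true, false, false, false, true);
  std::cout << (c.place == Place::CPU ? "cpu" : "xpu") << "\n";  // cpu
}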
var->template GetMutable(); + // Note: If the input LoDTensorArray size is 0, the output + // LoDTensorArray is also 0 + for (auto& t : *tensor_array) { + tensor_vector.emplace_back(&t); + } + pt_kernel_context->EmplaceBackOutputsWithoutSetRange(tensor_vector); + end_idx += tensor_array->size() - 1; } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported output `%s` type when call pt kernel.", framework::ToTypeName(var->Type()))); } + } else { + pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); } - - pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); } - pt_kernel_context->AssignOutputRange(std::make_pair(start_idx, end_idx), i); } VLOG(4) << "Done outputs"; @@ -2250,42 +2299,67 @@ void OperatorWithKernel::BuildPhiKernelContext( } } else { // TODO(chenweihang): support other attrs later - auto& attr = Attrs().at(attr_names[i]); + auto attr_it = attrs_.find(attr_names[i]); if (attr_defs[i].type_index == std::type_index(typeid(int))) { - pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(int, attr)); + if (attr_it == attrs_.end()) { + auto in_it = ctx.inputs.find(attr_names[i]); + if (in_it != ctx.inputs.end()) { + // get data from input + auto val = experimental::MakePhiScalarFromVar(*(in_it->second[0])); + int32_t val_int = val.template to(); + pt_kernel_context->EmplaceBackAttr(val_int); + } else { + PADDLE_THROW(platform::errors::NotFound( + "can not find attribute `%s` both in attribute and input ", + attr_names[i])); + } + } else { + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(int, attr_it->second)); + } } else if (attr_defs[i].type_index == std::type_index(typeid(float))) { - pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(float, attr)); + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(float, attr_it->second)); } else if (attr_defs[i].type_index == std::type_index(typeid(bool))) { - pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(bool, attr)); + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(bool, attr_it->second)); } else if (attr_defs[i].type_index == std::type_index(typeid(int64_t))) { - pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(int64_t, attr)); + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(int64_t, attr_it->second)); } else if (attr_defs[i].type_index == std::type_index(typeid(std::string))) { - pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(std::string, attr)); + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(std::string, attr_it->second)); } else if (attr_defs[i].type_index == std::type_index(typeid(phi::DataType))) { auto data_type = paddle::framework::TransToPhiDataType( static_cast( - BOOST_GET_CONST(int, attr))); + BOOST_GET_CONST(int, attr_it->second))); pt_kernel_context->EmplaceBackAttr(data_type); } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { - if (std::type_index(attr.type()) == + if (std::type_index(attr_it->second.type()) == std::type_index(typeid(std::vector))) { pt_kernel_context->EmplaceBackAttr( - BOOST_GET_CONST(std::vector, attr)); - } else if (std::type_index(attr.type()) == + BOOST_GET_CONST(std::vector, attr_it->second)); + } else if (std::type_index(attr_it->second.type()) == std::type_index(typeid(std::vector))) { // Emplace Back Attr according to the type of Phi_Kernel args. 
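For LoDTensorArray arguments, BuildPhiKernelContext now emplaces every element of the array into the kernel context and widens the recorded index range by size - 1, so a single named argument can cover several dense tensors. A standalone sketch of that flatten-and-track-ranges bookkeeping with plain vectors (toy types; the real code uses paddle::SmallVector and phi::TensorBase*):

#include <iostream>
#include <string>
#include <utility>
#include <vector>

struct ToyTensor { std::string name; };

int main() {
  // Two logical inputs: a single tensor and a tensor array of three elements.
  ToyTensor x{"x"};
  std::vector<ToyTensor> array{{"a0"}, {"a1"}, {"a2"}};

  std::vector<const ToyTensor*> ctx_inputs;       // flattened argument list
  std::vector<std::pair<size_t, size_t>> ranges;  // [start, end) per argument

  size_t start = ctx_inputs.size();
  ctx_inputs.push_back(&x);
  ranges.emplace_back(start, start + 1);

  start = ctx_inputs.size();
  size_t end = start + 1;
  for (const auto& t : array) ctx_inputs.push_back(&t);
  end += array.size() - 1;  // like end_idx += tensor_array.size() - 1
  ranges.emplace_back(start, end);

  for (const auto& r : ranges)
    std::cout << "[" << r.first << ", " << r.second << ")\n";  // [0, 1) [1, 4)
}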
- const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); + const auto& vector_int_attr = + BOOST_GET_CONST(std::vector, attr_it->second); const std::vector vector_int64_attr(vector_int_attr.begin(), vector_int_attr.end()); pt_kernel_context->EmplaceBackAttr(vector_int64_attr); } } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { - const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); + const auto& vector_int_attr = + BOOST_GET_CONST(std::vector, attr_it->second); pt_kernel_context->EmplaceBackAttr(vector_int_attr); + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector))) { + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr_it->second)); } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported cast op attribute `%s` when construct " diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 1a1171f1dba4d794796ef1421fe386f60a0e587d..6f68c261d2b24dd66a70734d29d448e8927631e9 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -483,6 +483,10 @@ class ExecutionArgumentMappingContext : public phi::ArgumentMappingContext { return ctx_.InputVar(name)->IsType(); } + bool IsDenseTensorVectorInput(const std::string& name) const override { + return ctx_.InputVar(name)->IsType(); + } + bool IsDenseTensorOutput(const std::string& name) const override { return ctx_.OutputVar(name)->IsType(); } diff --git a/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc index 23cb653fef22ac966655e5650d20c128e2bd3cdd..7a7a7b2798f5920f89e15222959a935da9af2c25 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc @@ -45,8 +45,8 @@ Program CreateAddProgram() { NetBuilder builder("net_builder"); auto a = builder.CreateInput(Float(32), {M, N}); auto b = builder.CreateInput(Float(32), {M, N}); - auto c = builder.add(a, b); - auto d = builder.add(a, c); + auto c = builder.Add(a, b); + auto d = builder.Add(a, c); auto program = builder.Build(); return program; @@ -116,8 +116,8 @@ TEST(net_build, program_execute_fc) { auto w = builder.CreateInput(Float(32), {N, K}, "W"); // weight auto b = builder.CreateInput(Float(32), {N}, "B"); // bias - auto mul_out = builder.mul(a, w, 2, 1); - auto add_out = builder.add(mul_out, b); + auto mul_out = builder.Mul(a, w, 2, 1); + auto add_out = builder.Add(mul_out, b); auto program = builder.Build(); #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 10ceae62dccbbab9329b73e0f581b51508511194..5de861235461ff6670503f6372961bdcf0be5ec2 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -1224,8 +1224,12 @@ void TensorFromStream(std::istream& is, Tensor* tensor, proto::VarType::TensorDesc desc; { // int32_t size // proto buffer - int32_t size; + int32_t size = -1; is.read(reinterpret_cast(&size), sizeof(size)); + PADDLE_ENFORCE_EQ(is.good(), true, platform::errors::Unavailable( + "Cannot read tensor desc size")); + PADDLE_ENFORCE_GE(size, 0, platform::errors::InvalidArgument( + "Tensor desc size should >= 0")); std::unique_ptr buf(new char[size]); is.read(reinterpret_cast(buf.get()), size); PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index 149202468be6c6bec833f100adfd4100c520f8f3..7d60b7d26f3fbceaca9b19995ff2c5d29ad426b8 
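Integer attributes may now be supplied either through the attribute map or, when absent there, through an input variable converted with MakePhiScalarFromVar; only if both are missing does the op raise a NotFound error. A standalone sketch of that attribute-or-input resolution using plain maps (ResolveIntAttr and its parameters are hypothetical, not Paddle APIs):

#include <iostream>
#include <map>
#include <stdexcept>
#include <string>

// Resolve an int-valued attribute: prefer the attribute map, then fall back
// to a scalar supplied as an input, otherwise report that it is missing.
int ResolveIntAttr(const std::string& name,
                   const std::map<std::string, int>& attrs,
                   const std::map<std::string, int>& input_scalars) {
  auto attr_it = attrs.find(name);
  if (attr_it != attrs.end()) return attr_it->second;
  auto in_it = input_scalars.find(name);
  if (in_it != input_scalars.end()) return in_it->second;  // from an input var
  throw std::runtime_error("can not find attribute `" + name +
                           "` both in attribute and input");
}

int main() {
  std::map<std::string, int> attrs = {{"axis", 1}};
  std::map<std::string, int> inputs = {{"num_rows", 42}};
  std::cout << ResolveIntAttr("axis", attrs, inputs) << "\n";      // 1
  std::cout << ResolveIntAttr("num_rows", attrs, inputs) << "\n";  // 42
}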
100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -124,7 +124,7 @@ AmpOperators::AmpOperators() OpSupportedInfos("GPU", paddle::framework::proto::VarType::BF16)); unsupported_bf16_ops_->insert(unsupported_ops_gpu_bf16.begin(), unsupported_ops_gpu_bf16.end()); -// NOTE: GPU/NPU/XPU is compiled seperatly. +// NOTE: GPU/NPU/XPU/MLU is compiled seperatly. #elif defined(PADDLE_WITH_ASCEND_CL) auto unsupported_ops_npu_fp16 = std::get<2>( OpSupportedInfos("NPU", paddle::framework::proto::VarType::FP16)); @@ -143,6 +143,15 @@ AmpOperators::AmpOperators() OpSupportedInfos("XPU", paddle::framework::proto::VarType::BF16)); unsupported_bf16_ops_->insert(unsupported_ops_xpu_bf16.begin(), unsupported_ops_xpu_bf16.end()); +#elif defined(PADDLE_WITH_MLU) + auto unsupported_ops_mlu_fp16 = std::get<2>( + OpSupportedInfos("MLU", paddle::framework::proto::VarType::FP16)); + unsupported_fp16_ops_->insert(unsupported_ops_mlu_fp16.begin(), + unsupported_ops_mlu_fp16.end()); + auto unsupported_ops_mlu_bf16 = std::get<2>( + OpSupportedInfos("MLU", paddle::framework::proto::VarType::BF16)); + unsupported_bf16_ops_->insert(unsupported_ops_mlu_bf16.begin(), + unsupported_ops_mlu_bf16.end()); #endif VLOG(4) << allow_ops_->size() << " " << block_ops_->size() << " " << unsupported_fp16_ops_->size() << " " @@ -209,7 +218,10 @@ inline bool NeedCast(const std::shared_ptr& var) { auto data_type = GetDataType(var); if (paddle::platform::is_gpu_place(place) || paddle::platform::is_cuda_pinned_place(place) || - paddle::platform::is_xpu_place(place)) { + paddle::platform::is_xpu_place(place) || + paddle::platform::is_mlu_place(place) || + paddle::platform::is_npu_place(place) || + paddle::platform::is_npu_pinned_place(place)) { // CudaPinndePlace is added for varbase created by dataloader if (data_type == paddle::framework::proto::VarType::FP32 || data_type == paddle::framework::proto::VarType::FP16 || diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index 7416d206fc43eaf5a56c3eb606bb0672d1172c0b..d7478b18dba0616fdc995866d8892c7c052a0e35 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -389,6 +389,9 @@ static void PerformBackwardInplace(const std::string& op_type, } void BasicEngine::Execute() { + platform::RecordEvent backward_record_event( + "backward", platform::TracerEventType::Operator, 1); + if (init_nodes_.empty()) { return; } @@ -412,7 +415,7 @@ void BasicEngine::Execute() { for (auto& cur_op : *shared_cur_node) { platform::RecordEvent op_type_record_event( - cur_op.Type(), platform::TracerEventType::Operator, 1); + cur_op.Type() + " grad_node", platform::TracerEventType::Operator, 1); ++op_num; diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 12aa13bbacc3bae5d690323f45817f95762c376c..499cf4d8ad6d82dd554fa4f5bbcf39833fed0eab 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -423,7 +423,7 @@ void TensorAdd(const VarType& src, VarType* dst) { } if (data_type == framework::proto::VarType::BF16) { if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) return TensorAddImpl( src_tensor, dst_tensor, place); #else diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 
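BasicEngine::Execute now opens a platform::RecordEvent scope labeled "backward" around the whole run, with the per-op events renamed to "<op type> grad_node", so profiles show one umbrella range containing the per-op ranges. A standalone sketch of that RAII scoped-event idea (ScopedEvent is a toy timer, not Paddle's profiler):

#include <chrono>
#include <iostream>
#include <string>

// Toy RAII event: records a named span for as long as the object is alive,
// which is how a scoped RecordEvent brackets the whole backward pass.
class ScopedEvent {
 public:
  explicit ScopedEvent(std::string name)
      : name_(std::move(name)), start_(std::chrono::steady_clock::now()) {}
  ~ScopedEvent() {
    auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                  std::chrono::steady_clock::now() - start_).count();
    std::cout << name_ << ": " << us << " us\n";
  }

 private:
  std::string name_;
  std::chrono::steady_clock::time_point start_;
};

int main() {
  ScopedEvent backward("backward");  // umbrella event for the whole pass
  for (const char* op : {"matmul", "relu"}) {
    ScopedEvent per_op(std::string(op) + " grad_node");  // nested per-op event
  }
}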
bae49fb381a475dd8227d1dc855a6db28c9cd273..a427b9b8199116098d149689961cedf14e86e5e1 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -234,7 +234,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, auto& kernels = kernels_iter->second; auto kernel_iter = kernels.find(expected_kernel_key); -#ifdef PADDLE_WITH_XPU +#if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) if (paddle::platform::is_xpu_place(expected_kernel_key.place_) && (kernel_iter == kernels.end() || is_xpu_unsupport)) { VLOG(3) << "missing XPU kernel: " << op.Type() @@ -243,29 +243,36 @@ PreparedOp PrepareImpl(const NameVarMap& ins, expected_kernel_key.place_ = platform::CPUPlace(); kernel_iter = kernels.find(expected_kernel_key); } - #endif #ifdef PADDLE_WITH_XPU_KP - expected_kernel_key.place_ = platform::XPUPlace(); - bool use_xpu_kp_kernel_rt = - FLAGS_run_kp_kernel && - paddle::platform::is_xpu_kp_support_op(op.Type(), expected_kernel_key); - bool use_xpu_kp_kernel_debug = - paddle::platform::is_in_xpu_kpwhite_list(op.Type()); - if (use_xpu_kp_kernel_rt) { - VLOG(3) << "xpu_kp using rt mode "; - } - if (use_xpu_kp_kernel_debug) { - VLOG(3) << "xpu_kp using debug mode "; - } - if (paddle::platform::is_xpu_place(expected_kernel_key.place_) && - (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug)) { - expected_kernel_key.place_ = platform::XPUPlace(); - expected_kernel_key.library_type_ = paddle::framework::LibraryType::kKP; - kernel_iter = kernels.find(expected_kernel_key); - VLOG(3) << "using XPU KP kernel: " << op.Type() - << ", using_kernel_key:" << expected_kernel_key; + if (paddle::platform::is_xpu_place(expected_kernel_key.place_)) { + bool use_xpu_kp_kernel_rt = + FLAGS_run_kp_kernel && + paddle::platform::is_xpu_kp_support_op(op.Type(), expected_kernel_key); + bool use_xpu_kp_kernel_debug = + paddle::platform::is_in_xpu_kpwhite_list(op.Type()); + if (use_xpu_kp_kernel_rt) { + VLOG(3) << "xpu_kp using rt mode "; + } + if (use_xpu_kp_kernel_debug) { + VLOG(3) << "xpu_kp using debug mode "; + } + bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug); + if (is_xpu_kp_support) { + expected_kernel_key.library_type_ = paddle::framework::LibraryType::kKP; + kernel_iter = kernels.find(expected_kernel_key); + VLOG(3) << "using XPU KP kernel: " << op.Type() + << ", using_kernel_key:" << expected_kernel_key; + } + if (!is_xpu_kp_support && + (kernel_iter == kernels.end() || is_xpu_unsupport)) { + VLOG(3) << "missing XPU kernel: " << op.Type() + << ", expected_kernel_key:" << expected_kernel_key + << ", fallbacking to CPU one!"; + expected_kernel_key.place_ = platform::CPUPlace(); + kernel_iter = kernels.find(expected_kernel_key); + } } #endif diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index d7c0c8cc547e6b04f67ddbb06121d139756d5142..9daac181d57de63a85116d176a286a9be9b3d4c7 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -264,14 +264,23 @@ void BuildDygraphPhiKernelContext( size_t start_idx = (i == 0 ? 
0 : kernel_ctx->InputRangeAt(i - 1).second); - if ((it == ins.end()) && - (input_defs[i].type_index == - std::type_index(typeid(paddle::optional)))) { - kernel_ctx->EmplaceBackInputWithoutSetRange(nullptr); - auto end_idx = start_idx + 1; - kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); - continue; + if (it == ins.end()) { + if (LIKELY(input_defs[i].type_index == + std::type_index( + typeid(paddle::optional)))) { + kernel_ctx->EmplaceBackInputWithoutSetRange(nullptr); + auto end_idx = start_idx + 1; + kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); + continue; + } else { + PADDLE_THROW(phi::errors::NotFound( + "Can not find input variable '%s' for %s OP, please check whether " + "the name setting in OpArgumentMapping is consistent with that in " + "OpMaker.", + input_names[i], pt_kernel_signature.name)); + } } + auto ins_vector = it->second; size_t end_idx = start_idx + ins_vector.size(); @@ -280,14 +289,23 @@ void BuildDygraphPhiKernelContext( auto& var = ins_vector[offset]->Var(); if (var.template IsType()) { tensor_in = &(var.template Get()); + kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in); } else if (var.template IsType()) { tensor_in = &(var.template Get()); + kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in); + } else if (var.template IsType()) { + paddle::SmallVector tensor_vector; + auto& tensor_array = var.template Get(); + for (auto& t : tensor_array) { + tensor_vector.emplace_back(&t); + } + kernel_ctx->EmplaceBackInputsWithoutSetRange(tensor_vector); + end_idx += tensor_array.size() - 1; } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported input `%s` type when call pt kernel.", framework::ToTypeName(var.Type()))); } - kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in); } kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); } @@ -317,22 +335,32 @@ void BuildDygraphPhiKernelContext( if (var) { if (var->template IsType()) { tensor_out = var->template GetMutable(); + kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); } else if (var->template IsType()) { tensor_out = var->template GetMutable(); + kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); + } else if (var->template IsType()) { + paddle::SmallVector tensor_vector; + auto* tensor_array = + var->template GetMutable(); + for (auto& t : *tensor_array) { + tensor_vector.emplace_back(&t); + } + kernel_ctx->EmplaceBackOutputsWithoutSetRange(tensor_vector); + end_idx += tensor_array->size() - 1; } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported output `%s` type when call pt kernel.", framework::ToTypeName(var->Type()))); } + } else { + kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); } - - kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); } kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i); } for (size_t i = 0; i < attr_names.size(); ++i) { - VLOG(1) << "############## attr_name: " << i << " : " << attr_names[i]; if (attr_defs[i].type_index == std::type_index(typeid(phi::ScalarArray))) { if (attrs.find(attr_names[i]) != attrs.end()) { // shape is in the attribute @@ -410,6 +438,17 @@ void BuildDygraphPhiKernelContext( experimental::MakePhiScalarFromVar(ins_vector[0]->Var()))); } + } else if (ins.find(attr_names[i]) != ins.end()) { + // deal tensor attr here + auto& ins_vector = ins.at(attr_names[i]); + auto tensor_attr = + experimental::MakePhiScalarFromVar(ins_vector[0]->Var()); + if (attr_defs[i].type_index == std::type_index(typeid(int))) { + int val = tensor_attr.template 
to(); + kernel_ctx->EmplaceBackAttr(val); + } else { + PADDLE_THROW(platform::errors::Unimplemented("only support int here")); + } } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { auto& attr = GetAttr(attrs, default_attrs, attr_names[i]); @@ -466,6 +505,7 @@ void BuildDygraphPhiKernelContext( } } else { // TODO(chenweihang): support other attrs later + auto& attr = GetAttr(attrs, default_attrs, attr_names[i]); if (attr_defs[i].type_index == std::type_index(typeid(int))) { kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(int, attr)); @@ -501,6 +541,10 @@ void BuildDygraphPhiKernelContext( } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(std::vector, attr)); + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector))) { + kernel_ctx->EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported cast op attribute `%s` when construct " diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc index 2e38bd77cf63cc85b75a50e62250a6e746f525bc..f754c6fdd0ee7742f0e544baad0225502c172848 100644 --- a/paddle/fluid/imperative/tests/test_tracer.cc +++ b/paddle/fluid/imperative/tests/test_tracer.cc @@ -34,6 +34,7 @@ PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT); #endif namespace imperative = paddle::imperative; diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 01c9d2847e0c850fd4159613a47d647bdbf46c31..d18c8e96c49b6a993fbd0a8d632212ae8d7f8c6d 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -177,7 +177,7 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, paddle::framework::AttributeMap* passed_default_attrs_, bool use_default_attr_map) { platform::RecordEvent op_type_record_event( - type, platform::TracerEventType::Operator, 1); + type + " trace_op", platform::TracerEventType::Operator, 1); platform::ScopedFlushDenormal flush; VLOG(1) << "Trace Op: " << type; if (FLAGS_use_mkldnn) { @@ -297,19 +297,24 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, program_desc_tracer_->InsertOp(type, new_ins, outs, attrs); } - if (ComputeRequiredGrad(new_ins, outs, trace_backward)) { - PADDLE_ENFORCE_EQ( - passed_default_attrs_, nullptr, - paddle::platform::errors::PermissionDenied( - "We expect passed_default_attrs_ is nullptr while " - "use_default_attr_map is true, however we got not null " - "passed_default_attrs_. Please check your usage of trace_op. ")); - CreateGradOpNode(*op, new_ins, outs, attrs, default_attrs, place, - inplace_map); - } else { - VLOG(3) << "No Grad to track for Op: " << type; + { + platform::RecordEvent node_creation_record_event( + type + " node_creation", platform::TracerEventType::Operator, 1); + + if (ComputeRequiredGrad(new_ins, outs, trace_backward)) { + PADDLE_ENFORCE_EQ( + passed_default_attrs_, nullptr, + paddle::platform::errors::PermissionDenied( + "We expect passed_default_attrs_ is nullptr while " + "use_default_attr_map is true, however we got not null " + "passed_default_attrs_. Please check your usage of trace_op. 
")); + CreateGradOpNode(*op, new_ins, outs, attrs, default_attrs, place, + inplace_map); + } else { + VLOG(3) << "No Grad to track for Op: " << type; + } + VLOG(6) << "Finish Trace Op: " << type; } - VLOG(6) << "Finish Trace Op: " << type; } template void Tracer::TraceOp( @@ -385,8 +390,8 @@ bool Tracer::ComputeRequiredGrad(const NameTensorMap& ins, } phi::KernelSignature Tracer::GetExpectedKernelSignature( - const std::string& type, const NameVarBaseMap& ins, - const NameVarBaseMap& outs, framework::AttributeMap attrs) const { + const std::string& type, const NameTensorMap& ins, + const NameTensorMap& outs, framework::AttributeMap attrs) const { auto op = framework::OpRegistry::CreateOp(type, {}, {}, {}, false); framework::RuntimeContext ctx({}, {}); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -401,7 +406,7 @@ phi::KernelSignature Tracer::GetExpectedKernelSignature( attr_checker == nullptr ? empty_attrs_map : attr_checker->GetDefaultAttrMap(); auto dygraph_exe_ctx = - imperative::DygraphExecutionContext( + imperative::DygraphExecutionContext( *op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs, default_attrs); auto* opbase_with_kernel = diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index fd13fce6a6e17a47a7a91dfa78598a99ec22f0b7..f24961885c9b85b03c561f60f375b1a21bf086dd 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -156,8 +156,8 @@ class Tracer { } phi::KernelSignature GetExpectedKernelSignature( - const std::string& type, const NameVarBaseMap& ins, - const NameVarBaseMap& outs, framework::AttributeMap attrs) const; + const std::string& type, const NameTensorMap& ins, + const NameTensorMap& outs, framework::AttributeMap attrs) const; paddle::framework::GarbageCollector* MutableGarbageCollectorIfNotExists( const platform::Place& place); diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index a5c32164bf1a28687ea6f8cc53427db67560c307..74e8ca3f229c6b7093e29cb53c0ce15e0b15d6a9 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -188,6 +188,9 @@ struct Argument { DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool); DECL_ARGUMENT_FIELD(use_fc_padding, UseFcPadding, bool); DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int); + DECL_ARGUMENT_FIELD(use_gpu_fp16, UseGPUFp16, bool); + DECL_ARGUMENT_FIELD(gpu_fp16_disabled_op_types, GpuFp16DisabledOpTypes, + std::unordered_set); // Usually use for trt dynamic shape. 
// TRT will select the best kernel according to opt shape diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 796c86a3ad1efe45dd8a00139b92c2642676a811..287c896e49bf254d70a5c79c818a39f913472f2f 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -189,6 +189,10 @@ void IRPassManager::CreatePasses(Argument *argument, new int(argument->dlnne_min_subgraph_size())); pass->Set("program", new framework::ProgramDesc *(&argument->main_program())); + } else if (pass_name == "mixed_precision_configure_pass") { + pass->Set("gpu_fp16_disabled_op_types", + new std::unordered_set( + argument->gpu_fp16_disabled_op_types())); } if (pass_name == "lite_subgraph_pass") { bool lite_enable_int8 = diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc index daa18d8c78bf875ebcc6571bf955a7f634948e4f..614eea24a0e2ee9d4fabd68a9374fa7c44b63ad7 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h" #include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/enforce.h" @@ -65,6 +66,26 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToNpu(Argument *argument) { #else +void IrParamsSyncAmongDevicesPass::GetVarNameToOpTypeMap( + const framework::ir::Graph &graph, + std::unordered_map *var_name_op_type_map) { + std::vector node_list = + framework::ir::TopologyVarientSort( + graph, static_cast(0)); + for (auto *op_node : node_list) { + if (!op_node->IsOp() || op_node->Op()->Type() == "feed" || + op_node->Op()->Type() == "fetch") + continue; + + for (auto *pre_node : op_node->inputs) { + if (pre_node->IsVar() && pre_node->Var()->Persistable()) { + var_name_op_type_map->insert(std::pair( + pre_node->Var()->Name(), op_node->Op()->Type())); + } + } + } +} + void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { // The parameters are on the cpu, therefore, synchronization is not necessary. if (!argument->use_gpu()) return; @@ -102,6 +123,16 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { if (with_dynamic_shape) { reserve_cpu_weights = true; } + + bool mixed_precision_mode = + argument->Has("use_gpu_fp16") && argument->use_gpu_fp16(); + std::unordered_map var_name_op_type_map{}; + std::unordered_set blacklist{}; + if (mixed_precision_mode) { + GetVarNameToOpTypeMap(graph, &var_name_op_type_map); + blacklist = argument->gpu_fp16_disabled_op_types(); + } + for (auto &var_name : all_vars) { if (std::count(repetitive_params.begin(), repetitive_params.end(), var_name)) { @@ -117,18 +148,29 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { var->IsType()) { auto *t = var->GetMutable(); - platform::CPUPlace cpu_place; - framework::LoDTensor temp_tensor; - temp_tensor.Resize(t->dims()); - temp_tensor.mutable_data(cpu_place); - - // Copy the parameter data to a tmp tensor. 
- paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor); - // Reallocation the space on GPU - t->clear(); - - // Copy parameter data to newly allocated GPU space. - paddle::framework::TensorCopySync(temp_tensor, place, t); + bool is_float = t->dtype() == paddle::experimental::DataType::FLOAT32 || + t->dtype() == paddle::experimental::DataType::FLOAT64; + if (mixed_precision_mode && + !blacklist.count(var_name_op_type_map[var_name]) && is_float) { + framework::Tensor half_tensor; + half_tensor.set_type(paddle::experimental::DataType::FLOAT16); + half_tensor.Resize(t->dims()); + auto *half_data = + half_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < t->numel(); i++) { + auto *data = t->mutable_data(platform::CPUPlace()); + half_data[i] = static_cast(data[i]); + } + t->clear(); + paddle::framework::TensorCopySync(half_tensor, place, t); + } else { + platform::CPUPlace cpu_place; + framework::LoDTensor temp_tensor; + temp_tensor.Resize(t->dims()); + paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor); + t->clear(); + paddle::framework::TensorCopySync(temp_tensor, place, t); + } } } } diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h index d5e98ec886e65f829a1496b1431f23aad6c4bc4c..f8209f051d53444435ed8c65b400f08bf8627553 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h @@ -38,7 +38,12 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass { #ifdef PADDLE_WITH_ASCEND_CL void CopyParamsToNpu(Argument *argument); #else - void CopyParamsToGpu(Argument *argument); + + void GetVarNameToOpTypeMap( + const framework::ir::Graph& graph, + std::unordered_map* var_name_op_type_map); + + void CopyParamsToGpu(Argument* argument); #endif }; diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 41c01d3b7e261314d8dc6b852f5b2a597421fe48..d08d28a3f623389790e63d45e13584a8d0db6adc 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -83,6 +83,7 @@ void AnalysisConfig::SetModel(const std::string &prog_file_path, Update(); } + void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -97,12 +98,26 @@ void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, Update(); } + void AnalysisConfig::DisableGpu() { use_gpu_ = false; Update(); } +void AnalysisConfig::Exp_EnableUseGpuFp16( + std::unordered_set op_list) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + use_gpu_fp16_ = true; + gpu_fp16_disabled_op_types_.insert(op_list.begin(), op_list.end()); +#else + LOG(ERROR) << "Please compile with gpu to Exp_EnableUseGpuFp16()"; + use_gpu_fp16_ = false; +#endif + + Update(); +} + void AnalysisConfig::DisableFCPadding() { use_fc_padding_ = false; @@ -213,6 +228,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(use_cudnn_); CP_MEMBER(gpu_device_id_); CP_MEMBER(memory_pool_init_size_mb_); + CP_MEMBER(use_gpu_fp16_); + CP_MEMBER(gpu_fp16_disabled_op_types_); CP_MEMBER(enable_memory_optim_); // TensorRT related. 
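For context on how the experimental GPU FP16 switch added above is meant to be driven from user code: Exp_EnableUseGpuFp16() only takes effect when both GPU and IR optimization are enabled (see the Update() hunk that follows), and any op types passed in are merged into the built-in disabled set (conv2d_fusion, conv2d, roll, strided_slice). The sketch below is illustrative only, not part of this patch; the umbrella header name, the model directory, and the std::string element type of the op set are assumptions.

#include <string>
#include <unordered_set>
#include "paddle_inference_api.h"  // assumed public header exposing paddle_infer::Config

int main() {
  paddle_infer::Config config;
  config.SetModel("./mobilenet_v2");              // placeholder model directory
  config.EnableUseGpu(100 /*MB pool*/, 0 /*device id*/);
  config.SwitchIrOptim();                         // required, otherwise Update() logs an error
  // Merge "layer_norm" into the disabled-op set; the remaining ops covered by
  // mixed_precision_configure_pass run in FP16.
  config.Exp_EnableUseGpuFp16({"layer_norm"});

  auto predictor = paddle_infer::CreatePredictor(config);
  // Feed inputs via predictor->GetInputHandle(...), then call predictor->Run().
  return 0;
}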
@@ -573,6 +590,20 @@ void AnalysisConfig::Update() { #endif } + if (use_gpu_fp16_) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (!enable_ir_optim_) { + LOG(ERROR) << "Exp_EnableUseGpuFp16() only works when IR optimization is " + "enabled."; + } else if (!use_gpu()) { + LOG(ERROR) + << "Exp_EnableUseGpuFp16() only works when use_gpu is enabled."; + } else { + pass_builder()->Exp_EnableUseGpuFp16(); + } +#endif + } + if (use_mkldnn_) { #ifdef PADDLE_WITH_MKLDNN if (!enable_ir_optim_) { @@ -669,6 +700,8 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << params_file_; ss << use_gpu_; + ss << use_gpu_fp16_; + for (auto &item : gpu_fp16_disabled_op_types_) ss << item; ss << use_fc_padding_; ss << gpu_device_id_; ss << xpu_device_id_; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 871ed596a3ee9d6362b03e99ca10313765826a51..a7caa3e369f80a954f36226c070ff1f7bd822a2b 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -50,8 +50,7 @@ #include "paddle/phi/api/ext/op_meta_info.h" #include "paddle/utils/string/split.h" -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) #include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" #include "paddle/fluid/distributed/fleet_executor/fleet_executor_desc.pb.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" @@ -374,8 +373,7 @@ static void DisablePrepareDataOpt( } bool AnalysisPredictor::PrepareExecutor() { -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) if (config_.dist_config().use_dist_model()) { VLOG(3) << "use_dist_model is enabled, will init FleetExecutor."; return PrepareFleetExecutor(); @@ -393,8 +391,7 @@ bool AnalysisPredictor::PrepareExecutor() { return true; } -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) bool AnalysisPredictor::PrepareFleetExecutor() { VLOG(3) << "AnalysisPredictor::PrepareFleetExecutor()"; if (config_.dist_config().nranks() > 1 && !CommInit()) { @@ -872,6 +869,11 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetDlnneMinSubgraphSize(config_.dlnne_min_subgraph_size_); } + if (config_.gpu_fp16_enabled()) { + argument_.SetUseGPUFp16(true); + argument_.SetGpuFp16DisabledOpTypes(config_.gpu_fp16_disabled_op_types_); + } + if (config_.lite_engine_enabled()) { argument_.SetCpuMathLibraryNumThreads( config_.cpu_math_library_num_threads()); @@ -1189,8 +1191,7 @@ std::vector AnalysisPredictor::GetOutputNames() { std::unique_ptr AnalysisPredictor::GetInputTensor( const std::string &name) { framework::Scope *scope; -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) if (config_.dist_config().use_dist_model()) { scope = scope_.get(); } else { @@ -1239,8 +1240,7 @@ std::unique_ptr AnalysisPredictor::GetInputTensor( std::unique_ptr AnalysisPredictor::GetOutputTensor( const std::string &name) { framework::Scope *scope; -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if 
defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) if (config_.dist_config().use_dist_model()) { scope = scope_.get(); } else { @@ -1287,8 +1287,7 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( } bool AnalysisPredictor::ZeroCopyRun() { -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) if (config_.dist_config().use_dist_model()) { VLOG(3) << "ZeroCopyRun will use the fleet executor."; inference::Timer timer; diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 21a7e9658bbeeb16d4cbff6364aaef68edcae16d..d9992f3fbef9d6ed626410ae5b9fc881b0772aa8 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -18,8 +18,7 @@ #include #include #include -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) #include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" #endif #include "paddle/fluid/framework/naive_executor.h" @@ -395,8 +394,7 @@ class AnalysisPredictor : public PaddlePredictor { void StatisticShapeRangeInfo(); void CollectShapeRangeInfo(); -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) // fleet exe related /// @@ -488,8 +486,7 @@ class AnalysisPredictor : public PaddlePredictor { std::map>> shape_info_; static int clone_num_; -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) // fleet executor related distributed::FleetExecutorDesc executor_desc_; std::shared_ptr fleet_exe_; diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 2c6e8f4f1a4d9ea0dfba8f400c7d3782a5e2c32d..ecb5eaf982548c44eb97fde7e2b7365c9b0e9fc2 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -375,6 +375,19 @@ TEST(AnalysisPredictor, enable_onnxruntime) { ASSERT_TRUE(!config.use_onnxruntime()); } +TEST(AnalysisPredictor, exp_enable_use_gpu_fp16) { + AnalysisConfig config; + config.SwitchIrOptim(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + config.EnableUseGpu(100, 0); + config.Exp_EnableUseGpuFp16(); + ASSERT_TRUE(config.gpu_fp16_enabled()); +#else + config.DisableGpu(); +#endif + LOG(INFO) << config.Summary(); +} + } // namespace paddle namespace paddle_infer { @@ -434,6 +447,19 @@ TEST(Predictor, EnableONNXRuntime) { auto predictor = CreatePredictor(config); } +TEST(Predictor, Exp_EnableUseGpuFp16) { + Config config; + config.SetModel(FLAGS_dirname); + config.SwitchIrOptim(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + config.EnableUseGpu(100, 0); + config.Exp_EnableUseGpuFp16(); +#else + config.DisableGpu(); +#endif + auto predictor = CreatePredictor(config); +} + TEST(Tensor, CpuShareExternalData) { Config config; config.SetModel(FLAGS_dirname); diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index 79a31555c7f0b1cb4a8d9c48bae16145d605935b..2c0945cd5b386a003ce63c86f3feb52213b378ba 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ 
b/paddle/fluid/inference/api/demo_ci/run.sh @@ -53,7 +53,11 @@ if [ $7 == ON ]; then if [[ -e "MobileNetV2.inference.model.tar.gz" ]]; then echo "MobileNetV2.inference.model.tar.gz has been downloaded." else - wget -q --no-proxy http://paddle-inference-dist.bj.bcebos.com/MobileNetV2.inference.model.tar.gz + if [ $WIN_DETECT != "" ]; then + wget -q -Y off http://paddle-inference-dist.bj.bcebos.com/MobileNetV2.inference.model.tar.gz + else + wget -q --no-proxy http://paddle-inference-dist.bj.bcebos.com/MobileNetV2.inference.model.tar.gz + fi tar xzf *.tar.gz fi cd .. diff --git a/paddle/fluid/inference/api/details/CMakeLists.txt b/paddle/fluid/inference/api/details/CMakeLists.txt index 4341fb0a9ccd8822151d4660f5a0c22901e47122..b2cfb060dd32559f6157fc456c7399736fc9fe51 100644 --- a/paddle/fluid/inference/api/details/CMakeLists.txt +++ b/paddle/fluid/inference/api/details/CMakeLists.txt @@ -14,7 +14,11 @@ # cc_library(reset_tensor_array SRCS reset_tensor_array.cc DEPS lod_tensor scope) -cc_library(zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce) +if (WITH_ONNXRUNTIME) + cc_library(zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce onnxruntime) +else (WITH_ONNXRUNTIME) + cc_library(zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce) +endif (WITH_ONNXRUNTIME) cc_library(zero_copy_tensor_dummy SRCS zero_copy_tensor_dummy.cc) cc_test(zero_copy_tensor_test SRCS zero_copy_tensor_test.cc DEPS paddle_inference_api) diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 18b1d09f0e8a7c4be9862991060a4706ee7cde7e..66dec0157d98e776b38ec8af81a0c006bc732bf4 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -22,12 +22,22 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" #include "paddle/phi/core/allocator.h" +#ifdef PADDLE_WITH_ONNXRUNTIME +#include "paddle/fluid/inference/api/onnxruntime_predictor.h" +#endif namespace paddle_infer { using float16 = paddle::platform::float16; void Tensor::Reshape(const std::vector &shape) { +#ifdef PADDLE_WITH_ONNXRUNTIME + if (is_ort_tensor_) { + shape_.assign(shape.begin(), shape.end()); + return; + } +#endif + PADDLE_ENFORCE_EQ( name_.empty(), false, paddle::platform::errors::PreconditionNotMet( @@ -123,6 +133,11 @@ T *Tensor::data(PlaceType *place, int *size) const { } DataType Tensor::type() const { +#ifdef PADDLE_WITH_ONNXRUNTIME + if (is_ort_tensor_) { + return dtype_; + } +#endif EAGER_GET_TENSOR(paddle::framework::LoDTensor); auto type = paddle::framework::TransToProtoVarType(tensor->dtype()); if (type == paddle::framework::proto::VarType::FP32) { @@ -145,6 +160,13 @@ PlaceType Tensor::place() const { return place_; } template void Tensor::CopyFromCpu(const T *data) { +#ifdef PADDLE_WITH_ONNXRUNTIME + if (is_ort_tensor_) { + ORTCopyFromCpu(data); + return; + } +#endif + EAGER_GET_TENSOR(paddle::framework::LoDTensor); PADDLE_ENFORCE_GE(tensor->numel(), 0, paddle::platform::errors::PreconditionNotMet( @@ -382,6 +404,13 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, template void Tensor::CopyToCpu(T *data) const { +#ifdef PADDLE_WITH_ONNXRUNTIME + if (is_ort_tensor_) { + ORTCopyToCpu(data); + return; + } +#endif + CopyToCpuImpl(data, nullptr, nullptr, nullptr); } @@ -489,12 +518,7 @@ template PD_INFER_DECL uint8_t *Tensor::mutable_data(PlaceType place); template PD_INFER_DECL int8_t 
*Tensor::mutable_data(PlaceType place); template PD_INFER_DECL float16 *Tensor::mutable_data(PlaceType place); -Tensor::Tensor(void *scope) : scope_{scope} { - PADDLE_ENFORCE_NOT_NULL(scope_, - paddle::platform::errors::PreconditionNotMet( - "The `scope` can not be nullptr. It should be " - "set to the pointer of scope.")); -} +Tensor::Tensor(void *scope) : scope_{scope} {} template void *Tensor::FindTensor() const { @@ -513,6 +537,26 @@ void *Tensor::FindTensor() const { } std::vector Tensor::shape() const { +#ifdef PADDLE_WITH_ONNXRUNTIME + if (is_ort_tensor_) { + std::vector shape; + // input handle + if (idx_ < 0) { + shape.assign(shape_.begin(), shape_.end()); + } else { // output handle + auto binding = binding_.lock(); + PADDLE_ENFORCE_NOT_NULL(binding, + paddle::platform::errors::PreconditionNotMet( + "output tensor [%s] no binding ptr", name_)); + std::vector outputs = binding->GetOutputValues(); + Ort::Value &value = outputs[idx_]; + auto info = value.GetTensorTypeAndShapeInfo(); + auto ort_shape = info.GetShape(); + shape.assign(ort_shape.begin(), ort_shape.end()); + } + return shape; + } +#endif EAGER_GET_TENSOR(paddle::framework::LoDTensor); PADDLE_ENFORCE_NOT_NULL( tensor_, paddle::platform::errors::PreconditionNotMet( @@ -573,4 +617,99 @@ void Tensor::SetPlace(PlaceType place, int device) { device_ = device; } +#ifdef PADDLE_WITH_ONNXRUNTIME +void Tensor::SetOrtMark(bool is_ort_tensor) { is_ort_tensor_ = is_ort_tensor; } + +void Tensor::SetOrtBinding(const std::shared_ptr binding) { + binding_ = binding; +} + +Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, float *data, + size_t size, const int64_t *shape, size_t shape_len) { + return Ort::Value::CreateTensor(memory_info, data, size, shape, + shape_len); +} + +Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, int64_t *data, + size_t size, const int64_t *shape, size_t shape_len) { + return Ort::Value::CreateTensor(memory_info, data, size, shape, + shape_len); +} + +Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, int32_t *data, + size_t size, const int64_t *shape, size_t shape_len) { + return Ort::Value::CreateTensor(memory_info, data, size, shape, + shape_len); +} + +Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, uint8_t *data, + size_t size, const int64_t *shape, size_t shape_len) { + return Ort::Value::CreateTensor(memory_info, data, size, shape, + shape_len); +} + +Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, int8_t *data, + size_t size, const int64_t *shape, size_t shape_len) { + return Ort::Value::CreateTensor(memory_info, data, size, shape, + shape_len); +} + +Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, float16 *data, + size_t size, const int64_t *shape, size_t shape_len) { + return Ort::Value::CreateTensor(memory_info, static_cast(data), + size * sizeof(float16), shape, shape_len, + ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16); +} + +template +void Tensor::ORTCopyFromCpu(const T *data) { + auto binding = binding_.lock(); + PADDLE_ENFORCE_NOT_NULL(binding, + paddle::platform::errors::PreconditionNotMet( + "input tensor [%s] no binding ptr", name_)); + const char *device_name = place_ == PlaceType::kCPU ? 
"Cpu" : "Cuda"; + Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, device_, + OrtMemTypeDefault); + size_t size = std::accumulate(begin(shape_), end(shape_), 1UL, + std::multiplies()); + auto ort_value = GetOrtVaule(memory_info, const_cast(data), size, + shape_.data(), shape_.size()); + binding->BindInput(name_.c_str(), ort_value); +} + +template +void Tensor::ORTCopyToCpu(T *data) const { + auto binding = binding_.lock(); + PADDLE_ENFORCE_NOT_NULL(binding, + paddle::platform::errors::PreconditionNotMet( + "output tensor [%s] no binding ptr", name_)); + std::vector outputs = binding->GetOutputValues(); + Ort::Value &value = outputs[idx_]; + auto info = value.GetTensorTypeAndShapeInfo(); + size_t size = info.GetElementCount() * sizeof(T); + + if (place_ == PlaceType::kCPU) { + std::memcpy(static_cast(data), value.GetTensorData(), size); + } else { + paddle::memory::Copy(paddle::platform::CPUPlace(), + static_cast(data), + paddle::platform::CUDAPlace(device_), + value.GetTensorData(), size, nullptr); + } +} + +template void Tensor::ORTCopyFromCpu(const float *data); +template void Tensor::ORTCopyFromCpu(const int64_t *data); +template void Tensor::ORTCopyFromCpu(const int32_t *data); +template void Tensor::ORTCopyFromCpu(const uint8_t *data); +template void Tensor::ORTCopyFromCpu(const int8_t *data); +template void Tensor::ORTCopyFromCpu(const float16 *data); + +template void Tensor::ORTCopyToCpu(float *data) const; +template void Tensor::ORTCopyToCpu(int32_t *data) const; +template void Tensor::ORTCopyToCpu(uint8_t *data) const; +template void Tensor::ORTCopyToCpu(int8_t *data) const; +template void Tensor::ORTCopyToCpu(float16 *data) const; +#endif + } // namespace paddle_infer diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.cc b/paddle/fluid/inference/api/onnxruntime_predictor.cc index ee82da139d8f39c26002763c4a4835050c48fc99..bd9de252a0962bc27a23b949b428d8f18f96190f 100644 --- a/paddle/fluid/inference/api/onnxruntime_predictor.cc +++ b/paddle/fluid/inference/api/onnxruntime_predictor.cc @@ -25,11 +25,7 @@ #include #include "paddle/fluid//platform/device/gpu/gpu_types.h" -#include "paddle/fluid/framework/feed_fetch_method.h" -#include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/var_type_traits.h" -#include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/framework/version.h" #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" @@ -45,24 +41,23 @@ namespace paddle { -framework::proto::VarType::Type ConvertONNXType( - ONNXTensorElementDataType type) { +paddle_infer::DataType ConvertONNXType(ONNXTensorElementDataType type) { switch (type) { case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: - return framework::proto::VarType::FP32; - // case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: - // return DataType::FP16; + return paddle_infer::DataType::FLOAT32; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: + return paddle_infer::DataType::FLOAT16; case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8: - return framework::proto::VarType::INT8; + return paddle_infer::DataType::INT8; case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: - return framework::proto::VarType::INT32; + return paddle_infer::DataType::INT32; case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: - return framework::proto::VarType::INT64; + return paddle_infer::DataType::INT64; case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8: - return framework::proto::VarType::UINT8; + return 
paddle_infer::DataType::UINT8; default: LOG(ERROR) << "unsupported ONNX Tensor Type: " << static_cast(type); - return framework::proto::VarType::FP32; + return paddle_infer::DataType::FLOAT32; } } @@ -87,13 +82,12 @@ bool ONNXRuntimePredictor::Init() { VLOG(3) << "ONNXRuntime Predictor::init()"; // Now ONNXRuntime only suuport CPU + const char *device_name = config_.use_gpu() ? "Cuda" : "Cpu"; if (config_.use_gpu()) { place_ = paddle::platform::CUDAPlace(config_.gpu_device_id()); } else { place_ = paddle::platform::CPUPlace(); } - scope_.reset(new paddle::framework::Scope()); - sub_scope_ = &scope_->NewScope(); std::string onnx_proto; paddle2onnx::Export(config_.prog_file(), config_.params_file(), &onnx_proto, @@ -125,13 +119,12 @@ bool ONNXRuntimePredictor::Init() { "generated."; } session_ = {env_, onnx_proto.data(), onnx_proto.size(), session_options}; + binding_ = std::make_shared(session_); - auto memory_info = - Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); + Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, + place_.GetDeviceId(), OrtMemTypeDefault); Ort::Allocator allocator(session_, memory_info); - framework::proto::VarType::Type proto_type = - framework::proto::VarType::LOD_TENSOR; size_t n_inputs = session_.GetInputCount(); for (size_t i = 0; i < n_inputs; ++i) { auto input_name = session_.GetInputName(i, allocator); @@ -141,8 +134,6 @@ bool ONNXRuntimePredictor::Init() { ONNXTensorElementDataType data_type = type_info.GetTensorTypeAndShapeInfo().GetElementType(); input_desc_.emplace_back(ONNXDesc{input_name, shape, data_type}); - auto *ptr = scope_->Var(input_name); - framework::InitializeVariable(ptr, proto_type); allocator.Free(input_name); } @@ -155,11 +146,13 @@ bool ONNXRuntimePredictor::Init() { ONNXTensorElementDataType data_type = type_info.GetTensorTypeAndShapeInfo().GetElementType(); output_desc_.emplace_back(ONNXDesc{output_name, shape, data_type}); - auto *ptr = scope_->Var(output_name); - framework::InitializeVariable(ptr, proto_type); + + Ort::MemoryInfo out_memory_info(device_name, OrtDeviceAllocator, + place_.GetDeviceId(), OrtMemTypeDefault); + binding_->BindOutput(output_name, out_memory_info); + allocator.Free(output_name); } - return true; } @@ -216,15 +209,26 @@ std::vector ONNXRuntimePredictor::GetOutputNames() { return output_names; } +bool ONNXRuntimePredictor::FindONNXDesc(const std::string &name, + bool is_input) { + if (is_input) { + for (auto i : input_desc_) + if (i.name == name) return true; + } else { + for (auto i : output_desc_) + if (i.name == name) return true; + } + return false; +} + std::unique_ptr ONNXRuntimePredictor::GetInputTensor( const std::string &name) { - PADDLE_ENFORCE_NOT_NULL(scope_->FindVar(name), - platform::errors::PreconditionNotMet( - "The in variable named %s is not found in the " - "scope of the ONNXPredictor.", - name)); - std::unique_ptr res( - new ZeroCopyTensor(static_cast(scope_.get()))); + PADDLE_ENFORCE_EQ(FindONNXDesc(name, true), true, + platform::errors::PreconditionNotMet( + "The in variable named %s is not found in the " + "ONNXPredictor.", + name)); + std::unique_ptr res(new ZeroCopyTensor(nullptr)); res->input_or_output_ = true; res->SetName(name); if (platform::is_cpu_place(place_)) { @@ -233,18 +237,19 @@ std::unique_ptr ONNXRuntimePredictor::GetInputTensor( auto gpu_place = place_; res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); } + res->SetOrtMark(true); + res->SetOrtBinding(binding_); return res; } std::unique_ptr ONNXRuntimePredictor::GetOutputTensor( const 
std::string &name) { - PADDLE_ENFORCE_NOT_NULL(scope_->FindVar(name), - platform::errors::PreconditionNotMet( - "The out variable named %s is not found in the " - "scope of the ONNXPredictor.", - name)); - std::unique_ptr res( - new ZeroCopyTensor(static_cast(scope_.get()))); + PADDLE_ENFORCE_EQ(FindONNXDesc(name, false), true, + platform::errors::PreconditionNotMet( + "The out variable named %s is not found in the " + "ONNXPredictor.", + name)); + std::unique_ptr res(new ZeroCopyTensor(nullptr)); res->input_or_output_ = false; res->SetName(name); if (platform::is_cpu_place(place_)) { @@ -253,46 +258,18 @@ std::unique_ptr ONNXRuntimePredictor::GetOutputTensor( auto gpu_place = place_; res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); } + res->SetOrtMark(true); + res->SetOrtBinding(binding_); + int size = output_desc_.size(); + for (int i = 0; i < size; ++i) + if (output_desc_[i].name == name) { + res->idx_ = i; + res->dtype_ = ConvertONNXType(output_desc_[i].dtype); + break; + } return res; } -Ort::Value ONNXRuntimePredictor::GetOrtValue(const ONNXDesc &desc, - const char *device_name) { - Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, - place_.GetDeviceId(), OrtMemTypeDefault); - auto *var = scope_->FindVar(desc.name); - auto *tensor = var->GetMutable(); - size_t size = - tensor->numel() * - framework::SizeOfType(framework::TransToProtoVarType(tensor->dtype())); - std::vector shape = phi::vectorize(tensor->dims()); - return Ort::Value::CreateTensor(memory_info, - static_cast(tensor->data()), size, - shape.data(), shape.size(), desc.dtype); -} - -void ONNXRuntimePredictor::AsTensor(const Ort::Value &value, - const ONNXDesc &desc) { - auto info = value.GetTensorTypeAndShapeInfo(); - - auto *var = scope_->FindVar(desc.name); - auto *tensor = var->GetMutable(); - tensor->Resize(phi::make_ddim(info.GetShape())); - auto dtype = ConvertONNXType(info.GetElementType()); - auto *ptr = tensor->mutable_data(place_, dtype); - - if (platform::is_cpu_place(place_)) { - std::memcpy(ptr, const_cast(value.GetTensorData()), - tensor->numel() * framework::SizeOfType(dtype)); - } else { - auto src_place = place_; - auto dst_place = place_; - memory::Copy(dst_place, ptr, src_place, - const_cast(value.GetTensorData()), - tensor->numel() * framework::SizeOfType(dtype)); - } -} - bool ONNXRuntimePredictor::Run(const std::vector &inputs, std::vector *output_data, int batch_size) { @@ -302,31 +279,7 @@ bool ONNXRuntimePredictor::Run(const std::vector &inputs, bool ONNXRuntimePredictor::ZeroCopyRun() { try { - Ort::IoBinding binding(session_); - std::vector inputs; - std::vector outputs; - Ort::RunOptions options; - - inputs.reserve(input_desc_.size()); - const char *device_name = config_.use_gpu() ? 
"Cuda" : "Cpu"; - for (auto desc : input_desc_) { - inputs.push_back(GetOrtValue(desc, device_name)); - binding.BindInput(desc.name.c_str(), inputs.back()); - } - - // TODO(heliqi): Optimization —— move to Init() - for (auto desc : output_desc_) { - Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, - place_.GetDeviceId(), OrtMemTypeDefault); - binding.BindOutput(desc.name.c_str(), memory_info); - } - - session_.Run({}, binding); - - outputs = binding.GetOutputValues(); - for (size_t i = 0; i < output_desc_.size(); ++i) { - AsTensor(outputs[i], output_desc_[i]); - } + session_.Run({}, *(binding_.get())); } catch (const std::exception &e) { LOG(ERROR) << e.what(); return false; @@ -345,9 +298,9 @@ uint64_t ONNXRuntimePredictor::TryShrinkMemory() { } ONNXRuntimePredictor::~ONNXRuntimePredictor() { - if (sub_scope_) { - scope_->DeleteScope(sub_scope_); - } + binding_->ClearBoundInputs(); + binding_->ClearBoundOutputs(); + memory::Release(place_); } diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.h b/paddle/fluid/inference/api/onnxruntime_predictor.h index 7fb07aa97bd2746773192456ddeba941a24e8906..d01756e4b96b132e3f9c3815e96f612433616ff2 100644 --- a/paddle/fluid/inference/api/onnxruntime_predictor.h +++ b/paddle/fluid/inference/api/onnxruntime_predictor.h @@ -94,9 +94,8 @@ class ONNXRuntimePredictor : public PaddlePredictor { /// \param[in] AnalysisConfig config /// explicit ONNXRuntimePredictor(const AnalysisConfig &config) - : config_(config) { + : config_(config), env_(ORT_LOGGING_LEVEL_WARNING, "onnx") { predictor_id_ = inference::GetUniqueId(); - env_ = Ort::Env(ORT_LOGGING_LEVEL_INFO, "onnx"); } /// /// \brief Destroy the ONNXRuntime Predictor object @@ -177,30 +176,17 @@ class ONNXRuntimePredictor : public PaddlePredictor { /// std::unique_ptr Clone() override; - std::shared_ptr scope_; - private: /// - /// \brief get the Ort Value(input Tensor). - /// - /// \param[in] desc ONNXDesce(name、shape、dtype) - /// - /// \param[in] device_name "cpu" or "gpu" of device - /// - /// \return get a Ort::Value - /// - Ort::Value GetOrtValue(const ONNXDesc &desc, const char *device_name); - - /// - /// \brief Ort::Value to Paddle::ZeroCopyTensor. + /// \brief Whether to find in/out by name. /// - /// \param[in] value Ort::Value(output Tensor) + /// \param[in] name input or output name /// - /// \param[in] desc a ONNXDesce(name、shape、dtype) + /// \param[in] is_input input(true) or output(false) /// - /// \return get a Ort::Value + /// \return Whether to find by name /// - void AsTensor(const Ort::Value &value, const ONNXDesc &desc); + bool FindONNXDesc(const std::string &name, bool is_input); private: AnalysisConfig config_; @@ -208,9 +194,9 @@ class ONNXRuntimePredictor : public PaddlePredictor { // ONNXRuntime Ort::Env env_; Ort::Session session_{nullptr}; + std::shared_ptr binding_; platform::Place place_; - framework::Scope *sub_scope_{nullptr}; std::vector input_desc_; std::vector output_desc_; int predictor_id_; diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 7b765e3fa8a24ef1b81b68da8ba12dd8e5577572..bdfe0e46e9ca4519c294a181cda6b8c4b87a6b9b 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -253,6 +253,19 @@ struct PD_INFER_DECL AnalysisConfig { /// /// void DisableGpu(); + /// + /// \brief Enable GPU fp16 precision computation, in experimental state. + /// + /// \param op_list The operator type list. 
+ /// + void Exp_EnableUseGpuFp16(std::unordered_set op_list = {}); + /// + /// \brief A boolean state telling whether the GPU fp16 precision is turned + /// on. + /// + /// \return bool Whether the GPU fp16 precision is turned on. + /// + bool gpu_fp16_enabled() const { return use_gpu_fp16_; } /// /// \brief Turn on XPU. @@ -859,6 +872,9 @@ struct PD_INFER_DECL AnalysisConfig { int gpu_device_id_{0}; uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB. bool thread_local_stream_{false}; + bool use_gpu_fp16_{false}; + std::unordered_set gpu_fp16_disabled_op_types_{ + "conv2d_fusion", "conv2d", "roll", "strided_slice"}; bool use_cudnn_{false}; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index f5f36d805b43ea0815683e3b65bf157fe5beb2de..95975d8f2a892e709e5591135f96fbff07eb62e3 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -172,6 +172,40 @@ void GpuPassStrategy::EnableCUDNN() { use_cudnn_ = true; } +void GpuPassStrategy::Exp_EnableUseGpuFp16() { + passes_.assign({ + "is_test_pass", // + "simplify_with_basic_ops_pass", // + "conv_bn_fuse_pass", // + "conv_eltwiseadd_bn_fuse_pass", // + "embedding_eltwise_layernorm_fuse_pass", // + "multihead_matmul_fuse_pass_v2", // + "gpu_cpu_squeeze2_matmul_fuse_pass", // + "gpu_cpu_reshape2_matmul_fuse_pass", // + "gpu_cpu_flatten2_matmul_fuse_pass", // + "gpu_cpu_map_matmul_v2_to_mul_pass", // + "gpu_cpu_map_matmul_v2_to_matmul_pass", // + "gpu_cpu_map_matmul_to_mul_pass", // + // "fc_fuse_pass", // + "fc_elementwise_layernorm_fuse_pass", // +#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be + // guaranteed at least v7 +// cudnn8.0 has memory leak problem in conv + eltwise + act, so we +// disable the pass. +#if !(CUDNN_VERSION >= 8000 && CUDNN_VERSION < 8100) + "conv_elementwise_add_act_fuse_pass", // + "conv_elementwise_add2_act_fuse_pass", // +#endif + "conv_elementwise_add_fuse_pass", // +#endif // + "transpose_flatten_concat_fuse_pass", // + "mixed_precision_configure_pass", // + "runtime_context_cache_pass" // + }); + + use_gpu_fp16_ = true; +} + void GpuPassStrategy::EnableMKLDNN() { LOG(ERROR) << "GPU not support MKLDNN yet"; } @@ -262,6 +296,7 @@ void CpuPassStrategy::EnableMKLDNN() { // "fc_act_mkldnn_fuse_pass", "batch_norm_act_fuse_pass", // "softplus_activation_mkldnn_fuse_pass", // + "elt_act_mkldnn_fuse_pass", // // TODO(intel): Please fix the bug on windows. // https://github.com/PaddlePaddle/Paddle/issues/29710 // "mkldnn_inplace_pass", // This pass should be activated after diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 351cf71e5ca7493928dfd81d776d847463f3b7bf..02290ed33ff1cd4f72d707d6f9d23f16e05c321b 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -125,6 +125,9 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { /// \brief Enable the use of cuDNN kernel. virtual void EnableCUDNN() {} + /// \brief Enable use gpu fp16 kernel. + virtual void Exp_EnableUseGpuFp16() {} + /// \brief Enable the use of MKLDNN. /// The MKLDNN control exists in both CPU and GPU mode, because there can /// still be some CPU kernels running in GPU mode. @@ -140,6 +143,10 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { /// \return A bool variable implying whether we are in gpu mode. 
bool use_gpu() const { return use_gpu_; } + /// \brief Check if we are using gpu fp16 kernel. + /// \return A bool variable implying whether we are in gpu fp16 mode. + bool use_gpu_fp16() const { return use_gpu_fp16_; } + /// \brief Check if we are using xpu. /// \return A bool variable implying whether we are in xpu mode. bool use_xpu() const { return use_xpu_; } @@ -162,6 +169,7 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { bool use_npu_{false}; bool use_ipu_{false}; bool use_mkldnn_{false}; + bool use_gpu_fp16_{false}; /// \endcond }; @@ -223,6 +231,9 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy { /// \brief Enable the use of cuDNN kernel. void EnableCUDNN() override; + /// \brief Enable the use of gpu fp16 kernel. + void Exp_EnableUseGpuFp16() override; + /// \brief Not supported in GPU mode yet. void EnableMKLDNN() override; @@ -238,6 +249,7 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy { protected: /// \cond Protected bool use_cudnn_{false}; + bool use_gpu_fp16_{false}; /// \endcond }; diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h index 5a98d109aed79cc5bcefdc01b47a166bdf9c01d9..2afe2d32e2f60e47136b1e2f002b0e98c9b17cd2 100644 --- a/paddle/fluid/inference/api/paddle_tensor.h +++ b/paddle/fluid/inference/api/paddle_tensor.h @@ -18,6 +18,11 @@ #include "paddle_infer_declare.h" // NOLINT +#ifdef PADDLE_WITH_ONNXRUNTIME +#include "onnxruntime_c_api.h" // NOLINT +#include "onnxruntime_cxx_api.h" // NOLINT +#endif + namespace paddle_infer { /// \brief Experimental. @@ -175,6 +180,23 @@ class PD_INFER_DECL Tensor { PlaceType place_; int device_; +#ifdef PADDLE_WITH_ONNXRUNTIME + bool is_ort_tensor_{false}; + std::vector shape_; + std::weak_ptr binding_; + int idx_{-1}; + + void SetOrtMark(bool is_ort_tensor); + + void SetOrtBinding(const std::shared_ptr binding); + + template + void ORTCopyFromCpu(const T* data); + + template + void ORTCopyToCpu(T* data) const; +#endif + friend class paddle_infer::contrib::TensorUtils; #if defined(PADDLE_WITH_TESTING) && defined(PADDLE_WITH_INFERENCE_API_TEST) friend class paddle_infer::InferApiTesterUtils; diff --git a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc index 67e7c78b62e9d212b5c1738403361d77d7a3925b..496e8932a690dbcd87001da4f7e017fc86d6bff5 100644 --- a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc @@ -11,7 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/layer_norm_op.h" + #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h" diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index fe04d552e40263a396059e3da59de4d51def67e0..7b65d2d7c97cca335f76f1d0399a25bcd8a00c92 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -328,5 +328,5 @@ class Pool2dOpConverter : public OpConverter { } // namespace inference } // namespace paddle -USE_OP(pool2d); +USE_OP_ITSELF(pool2d); REGISTER_TRT_OP_CONVERTER(pool2d, Pool2dOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc index b8e87a8d94d1f43d35da1a46c300a1b37c9382ec..5a306f622adbe7a298ab53daae1168ad50b402a9 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc @@ -224,5 +224,5 @@ class Pool3dOpConverter : public OpConverter { } // namespace inference } // namespace paddle -USE_OP(pool3d); +USE_OP_ITSELF(pool3d); REGISTER_TRT_OP_CONVERTER(pool3d, Pool3dOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc index 7f7313fbcb5969aafea47ad23248acd5a6ca3644..1ad82df41737c4093d0b5518c754ed85c505b8be 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc @@ -53,6 +53,6 @@ TEST(Relu6OpConverter, main) { test_activation("relu6"); } } // namespace paddle USE_OP_ITSELF(relu); -USE_OP(sigmoid); -USE_OP(tanh); +USE_OP_ITSELF(sigmoid); +USE_OP_ITSELF(tanh); USE_OP(relu6); diff --git a/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc index 1725888abc379bfa4ffbbc5cfc4cecd1872c7c18..f17e00de0eeb7c8f4d782f0a4eaecc2fc1df268b 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc @@ -45,4 +45,4 @@ TEST(leaky_relu_op, test_leaky_relu) { } // namespace paddle // USE_OP(leaky_relu); -USE_OP(leaky_relu); +USE_OP_ITSELF(leaky_relu); diff --git a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc index bded833505cd25352adc4123de415613d1fc926d..36f13262a73d703a6d9776855adbab3c44075aa7 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc @@ -71,4 +71,4 @@ TEST(Pool2dOpConverter, avg_ceil_test) { test_pool2d(false, true, "avg"); } } // namespace inference } // namespace paddle -USE_OP(pool2d); +USE_OP_ITSELF(pool2d); diff --git a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu index 861e98e4437564bfe5fae2a575741beb1d8823de..67d44184a76d0552b667c6d5a3d9466582e33558 100644 --- a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu @@ -17,7 +17,7 @@ #include #include "glog/logging.h" #include "paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h" -#include "paddle/fluid/operators/layer_norm_op.h" +#include "paddle/phi/kernels/layer_norm_kernel.h" namespace paddle { 
namespace inference { @@ -83,7 +83,7 @@ int LayerNormPlugin::enqueue(int batch_size, const void *const *inputs, cudaMemcpyAsync(bias_d, bias_.data(), sizeof(float) * feature_size, cudaMemcpyHostToDevice, stream); - paddle::operators::LayerNormDirectCUDAFunctor layer_norm; + phi::LayerNormDirectCUDAFunctor layer_norm; layer_norm(stream, input, input_shape, bias_d, scale_d, output, mean_d, variance_d, begin_norm_axis, eps); return cudaGetLastError() != cudaSuccess; @@ -177,7 +177,7 @@ int LayerNormPluginDynamic::enqueue( cudaMemcpyAsync(bias_d, bias_.data(), sizeof(float) * feature_size, cudaMemcpyHostToDevice, stream); - paddle::operators::LayerNormDirectCUDAFunctor layer_norm; + phi::LayerNormDirectCUDAFunctor layer_norm; layer_norm(stream, input, input_shape, bias_d, scale_d, output, mean_d, variance_d, begin_norm_axis, eps); } else { diff --git a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu index 861a9aa9d000bff9e6dcc673cc5c8d99c3a7a6ec..5596a89a083fe9ff177aa9abc769b8fa27105c1f 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h" -#include "paddle/fluid/operators/math/pooling.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace inference { @@ -108,16 +108,14 @@ int Pool3DPlugin::enqueue(int batchSize, const void *const *inputs, output_shape.insert(output_shape.begin(), batchSize); if (pool3d_type_ == Pool3DType::max) { - paddle::operators::math::MaxPool pool_process; - paddle::operators::math::Pool3dDirectCUDAFunctor< - paddle::operators::math::MaxPool, float> + phi::funcs::MaxPool pool_process; + phi::funcs::Pool3dDirectCUDAFunctor, float> pool3d_forward; pool3d_forward(idata, input_shape, output_shape, ksize_, strides_, paddings_, true, adaptive_, odatas[0], stream, pool_process); } else if (pool3d_type_ == Pool3DType::avg) { - paddle::operators::math::AvgPool pool_process; - paddle::operators::math::Pool3dDirectCUDAFunctor< - paddle::operators::math::AvgPool, float> + phi::funcs::AvgPool pool_process; + phi::funcs::Pool3dDirectCUDAFunctor, float> pool3d_forward; pool3d_forward(idata, input_shape, output_shape, ksize_, strides_, paddings_, true, adaptive_, odatas[0], stream, pool_process); @@ -351,16 +349,14 @@ int Pool3DPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, } if (pool3d_type_ == "max") { - paddle::operators::math::MaxPool pool_process; - paddle::operators::math::Pool3dDirectCUDAFunctor< - paddle::operators::math::MaxPool, float> + phi::funcs::MaxPool pool_process; + phi::funcs::Pool3dDirectCUDAFunctor, float> pool3d_forward; pool3d_forward(input, input_shape, output_shape, ksize, strides_, paddings, true, adaptive_, output, stream, pool_process); } else if (pool3d_type_ == "avg") { - paddle::operators::math::AvgPool pool_process; - paddle::operators::math::Pool3dDirectCUDAFunctor< - paddle::operators::math::AvgPool, float> + phi::funcs::AvgPool pool_process; + phi::funcs::Pool3dDirectCUDAFunctor, float> pool3d_forward; pool3d_forward(input, input_shape, output_shape, ksize, strides_, paddings, true, adaptive_, output, stream, pool_process); diff --git a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu index 6d711c26adc6ff8e49375d15f32322303f3ae6ef..9bfe98d759d8e29bc34b42fa667e5cda5f1493de 
100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h" -#include "paddle/fluid/operators/math/pooling.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace inference { @@ -84,16 +84,14 @@ int PoolPlugin::enqueue(int batchSize, const void *const *inputs, output_shape.insert(output_shape.begin(), batchSize); if (pool_type_ == PoolType::max) { - paddle::operators::math::MaxPool<float> pool_process; - paddle::operators::math::Pool2dDirectCUDAFunctor< - paddle::operators::math::MaxPool<float>, float> + phi::funcs::MaxPool<float> pool_process; + phi::funcs::Pool2dDirectCUDAFunctor<phi::funcs::MaxPool<float>, float> pool2d_forward; pool2d_forward(idata, input_shape, output_shape, ksize_, strides_, paddings_, true, false, odatas[0], stream, pool_process); } else if (pool_type_ == PoolType::avg) { - paddle::operators::math::AvgPool<float> pool_process; - paddle::operators::math::Pool2dDirectCUDAFunctor< - paddle::operators::math::AvgPool<float>, float> + phi::funcs::AvgPool<float> pool_process; + phi::funcs::Pool2dDirectCUDAFunctor<phi::funcs::AvgPool<float>, float> pool2d_forward; pool2d_forward(idata, input_shape, output_shape, ksize_, strides_, paddings_, exclusive_, adaptive_, odatas[0], stream, @@ -292,16 +290,14 @@ int PoolPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, } if (pool_type_ == "max") { - paddle::operators::math::MaxPool<float> pool_process; - paddle::operators::math::Pool2dDirectCUDAFunctor< - paddle::operators::math::MaxPool<float>, float> + phi::funcs::MaxPool<float> pool_process; + phi::funcs::Pool2dDirectCUDAFunctor<phi::funcs::MaxPool<float>, float> pool2d_forward; pool2d_forward(input, input_shape, output_shape, ksize, strides_, paddings, true, false, output, stream, pool_process); } else if (pool_type_ == "avg") { - paddle::operators::math::AvgPool<float> pool_process; - paddle::operators::math::Pool2dDirectCUDAFunctor< - paddle::operators::math::AvgPool<float>, float> + phi::funcs::AvgPool<float> pool_process; + phi::funcs::Pool2dDirectCUDAFunctor<phi::funcs::AvgPool<float>, float> pool2d_forward; pool2d_forward(input, input_shape, output_shape, ksize, strides_, paddings, exclusive_, adaptive_, output, stream, pool_process); diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index a7a417c29a7bdb7a47d4798246de55c0bd3536f9..f296ce96d4e5f6dca5c4ad2668eea8508b37068f 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -131,4 +131,7 @@ cc_library(virtual_memory_auto_growth_best_fit_allocator SRCS virtual_memory_aut if(NOT WIN32) cc_library(mmap_allocator SRCS mmap_allocator.cc DEPS allocator) cc_test(mmap_allocator_test SRCS mmap_allocator_test.cc DEPS mmap_allocator allocator) + if (WITH_GPU) + cc_library(cuda_ipc_allocator SRCS cuda_ipc_allocator.cc DEPS allocator) + endif() endif(NOT WIN32) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 61e292a922f0e98a958d4fe2f8fc7850bdf47e18..4a44448dc84cf744cdf061031bdf7fae8f658c4b 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -219,6 +219,12 @@ class AllocatorFacadePrivate { } InitNaiveBestFitCUDAPinnedAllocator(); #endif +#ifdef PADDLE_WITH_ASCEND_CL + for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) { + InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id)); + 
InitNaiveBestFitNPUPinnedAllocator(); +#endif #ifdef PADDLE_WITH_XPU for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); diff --git a/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc b/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc new file mode 100644 index 0000000000000000000000000000000000000000..b2f24d5aed1eb827b4857f5936a19b206a38c788 --- /dev/null +++ b/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef _WIN32 + +#include "paddle/fluid/memory/allocation/cuda_ipc_allocator.h" +#include "paddle/fluid/platform/cuda_device_guard.h" + +#include +#include +#include +#include +#include + +#include "glog/logging.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace memory { +namespace allocation { + +namespace { +std::mutex ipc_mutex_; +std::unordered_map<std::string, std::weak_ptr<void>> ipc_handle_to_baseptr_; +} // namespace + +std::shared_ptr<void> GetIpcBasePtr(std::string handle) { + std::lock_guard<std::mutex> lock(ipc_mutex_); + + auto iter = ipc_handle_to_baseptr_.find(handle); + if (iter != ipc_handle_to_baseptr_.end()) { + auto baseptr = iter->second.lock(); + if (baseptr) return baseptr; + } + // The IpcMemHandle can only be opened once for the same handle, + // so we cache the base pointer here. + void *baseptr = nullptr; + auto ipc_handle = + reinterpret_cast<const cudaIpcMemHandle_t *>(handle.c_str()); + PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcOpenMemHandle( + &baseptr, *ipc_handle, cudaIpcMemLazyEnablePeerAccess)); + // Close ipc handle on the same device. + int device_id = platform::GetCurrentDeviceId(); + // Add deleter to close ipc handle. + auto sp = std::shared_ptr<void>(baseptr, [handle, device_id](void *ptr) { + platform::CUDADeviceGuard guard(device_id); + std::lock_guard<std::mutex> lock(ipc_mutex_); + PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcCloseMemHandle(ptr)); + ipc_handle_to_baseptr_.erase(handle); + VLOG(6) << "cudaIpcCloseMemHandle for ptr:" + << "\t" << ptr; + }); + std::weak_ptr<void> wp = sp; + ipc_handle_to_baseptr_.insert(iter, {handle, wp}); + + return sp; +} + +CudaIpcAllocation::~CudaIpcAllocation() { + shared_ptr_.reset(); + VLOG(6) << "tensor deleted cudaIpcCloseMemHandle for ptr:" + << "\t" << this->ptr(); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle + +#endif diff --git a/paddle/fluid/memory/allocation/cuda_ipc_allocator.h b/paddle/fluid/memory/allocation/cuda_ipc_allocator.h new file mode 100644 index 0000000000000000000000000000000000000000..52e3cf10ea73a787d87d19beeedcdedca1e3dd3b --- /dev/null +++ b/paddle/fluid/memory/allocation/cuda_ipc_allocator.h @@ -0,0 +1,56 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef _WIN32 +#pragma once + +#include +#include // NOLINT +#include +#include +#include + +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace memory { +namespace allocation { + +std::shared_ptr<void> GetIpcBasePtr(std::string handle); + +class CudaIpcAllocation : public Allocation { + public: + explicit CudaIpcAllocation(void *ptr, size_t size, int device_id, + std::shared_ptr<void> shared_ptr) + : Allocation(ptr, size, platform::CUDAPlace(device_id)), + device_id_(std::move(device_id)), + shared_ptr_(std::move(shared_ptr)) {} + + inline const int &device_id() const { return device_id_; } + + ~CudaIpcAllocation() override; + + private: + int device_id_; + std::shared_ptr<void> shared_ptr_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle + +#endif diff --git a/paddle/fluid/memory/allocation/mmap_allocator.cc b/paddle/fluid/memory/allocation/mmap_allocator.cc index acaf5d548555cc3ee69bc5a03309645006256487..25c2235cce85369babc4d601de96c7475a0b1fbd 100644 --- a/paddle/fluid/memory/allocation/mmap_allocator.cc +++ b/paddle/fluid/memory/allocation/mmap_allocator.cc @@ -29,6 +29,155 @@ namespace paddle { namespace memory { namespace allocation { +std::string GetIPCName() { + static std::random_device rd; + std::string handle = "/paddle_"; +#ifdef _WIN32 + handle += std::to_string(GetCurrentProcessId()); +#else + handle += std::to_string(getpid()); +#endif + handle += "_"; + handle += std::to_string(rd()); + return handle; +} + +struct CountInfo { + std::atomic<int> refcount; +}; + +void AllocateMemoryMap(std::string filename, int flags, size_t size, + void **map_ptr_, int *fd_) { + // TODO(@ZHUI): support win32 + int file_flags = 0; + int fd = -1; + if (flags & MAPPED_SHAREDMEM) { + file_flags = O_RDWR | O_CREAT; + } else { + file_flags = O_RDONLY; + } + if (flags & MAPPED_EXCLUSIVE) { + file_flags |= O_EXCL; + } + if (flags & MAPPED_NOCREATE) { + file_flags &= ~O_CREAT; + } + + if (!(flags & MAPPED_FROMFD)) { + if (flags & MAPPED_SHAREDMEM) { + fd = shm_open(filename.c_str(), file_flags, (mode_t)0600); + PADDLE_ENFORCE_NE( + fd, -1, + platform::errors::Unavailable( + "Failed to open file descriptor %s in read-write mode", + filename.c_str())); + VLOG(6) << "shm_open: " << filename; + } + } else { + fd = -1; + } + + PADDLE_ENFORCE_EQ(ftruncate(fd, size), 0, + platform::errors::Unavailable( + "Failed to truncate the file to the specified length!")); + + if (flags & MAPPED_SHAREDMEM) { + *map_ptr_ = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + } else { + *map_ptr_ = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); + } + + PADDLE_ENFORCE_NE(*map_ptr_, MAP_FAILED, + platform::errors::Unavailable( + "Memory map failed when creating shared memory.")); + + if (flags & MAPPED_KEEPFD) { + *fd_ = fd; + } else { + PADDLE_ENFORCE_NE(::close(fd), -1, + platform::errors::Unavailable( + "Error closing memory mapped file <", filename, ">")); + + *fd_ = -1; + } +}
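The AllocateMemoryMap / RefcountedMemoryMapAllocation additions above follow a standard POSIX recipe: shm_open a named segment, ftruncate it to the payload size plus mmap_alignment, mmap it, and keep an atomic reference count in the leading alignment bytes so the last detaching process can shm_unlink the name. A minimal, self-contained sketch of that recipe (POSIX-only; the segment name, sizes, and helper names below are illustrative, not Paddle APIs):

```cpp
// Hedged sketch only: the shm_open + ftruncate + mmap + in-band refcount
// pattern used by AllocateMemoryMap/RefcountedMemoryMapAllocation.
// Segment name and sizes are made up. Link with -lrt on older glibc.
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

#include <atomic>
#include <cstdio>
#include <new>
#include <string>

struct CountInfo {
  std::atomic<int> refcount;
};

constexpr size_t kAlignment = 64;  // plays the role of mmap_alignment

int main() {
  const std::string name = "/paddle_sketch_demo";  // hypothetical name
  const size_t payload = 1024;
  const size_t total = payload + kAlignment;  // room for CountInfo up front

  int fd = shm_open(name.c_str(), O_RDWR | O_CREAT | O_EXCL, 0600);
  if (fd == -1) { std::perror("shm_open"); return 1; }
  if (ftruncate(fd, total) != 0) { std::perror("ftruncate"); return 1; }

  void *base = mmap(nullptr, total, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  if (base == MAP_FAILED) { std::perror("mmap"); return 1; }
  close(fd);  // the mapping keeps the shared object alive

  // The first kAlignment bytes hold the shared refcount; users only ever
  // see the aligned pointer just past them.
  auto *info = new (base) CountInfo();
  info->refcount.store(1);
  void *aligned_ptr = static_cast<char *>(base) + kAlignment;
  (void)aligned_ptr;  // hand this out as the allocation's ptr()

  // close(): the last owner unlinks the name, then everyone munmaps.
  if (--info->refcount == 0) {
    shm_unlink(name.c_str());
  }
  munmap(base, total);
  return 0;
}
```

Keeping the count inside the mapping itself means any process that can open the segment can participate in teardown without a separate coordination channel.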
+ +std::shared_ptr<RefcountedMemoryMapAllocation> +AllocateRefcountedMemoryMapAllocation(std::string filename, int flags, + size_t size) { + int fd = -1; + void *base_ptr = nullptr; + AllocateMemoryMap(filename, flags, size + mmap_alignment, &base_ptr, &fd); + void *aligned_base_ptr = + static_cast<void *>(static_cast<char *>(base_ptr) + mmap_alignment); + return std::make_shared<RefcountedMemoryMapAllocation>(aligned_base_ptr, size, + filename, flags, fd); +} + +RefcountedMemoryMapAllocation::RefcountedMemoryMapAllocation( + void *ptr, size_t size, std::string ipc_name, int fd, int flags) + : MemoryMapAllocation(ptr, size, ipc_name, fd, flags) { + // must reset base ptr first. + resetBaseptr(); + initializeRefercount(); +} + +void MemoryMapAllocation::close() { + if (closed_) { + return; + } + closed_ = true; +} + +MemoryMapAllocation::~MemoryMapAllocation() { close(); } + +void RefcountedMemoryMapAllocation::incref() { + CountInfo *info = static_cast<CountInfo *>(map_ptr_); + ++info->refcount; +} + +int RefcountedMemoryMapAllocation::decref() { + CountInfo *info = static_cast<CountInfo *>(map_ptr_); + return --info->refcount == 0; +} + +void RefcountedMemoryMapAllocation::resetBaseptr() { + map_ptr_ = + static_cast<void *>(static_cast<char *>(map_ptr_) - mmap_alignment); + map_size_ = map_size_ + mmap_alignment; +} + +void RefcountedMemoryMapAllocation::initializeRefercount() { + CountInfo *info = reinterpret_cast<CountInfo *>(map_ptr_); + + if (flags_ & MAPPED_EXCLUSIVE) { + new (&info->refcount) std::atomic<int>(1); + } else { + info->refcount++; + } +} + +void RefcountedMemoryMapAllocation::close() { + if (closed_) { + return; + } + closed_ = true; + void *data = map_ptr_; + CountInfo *info = reinterpret_cast<CountInfo *>(data); + if (--info->refcount == 0) { + PADDLE_ENFORCE_NE( + shm_unlink(ipc_name_.c_str()), -1, + platform::errors::Unavailable( + "could not unlink the shared memory file ", ipc_name_)); + VLOG(6) << "shm_unlink file: " << ipc_name_; + } + + PADDLE_ENFORCE_NE( + munmap(map_ptr_, map_size_), -1, + platform::errors::Unavailable("could not unmap the shared memory file: ", + strerror(errno), " (", errno, ")")); +} + MemoryMapWriterAllocation::~MemoryMapWriterAllocation() { PADDLE_ENFORCE_NE( munmap(this->ptr(), this->size()), -1, @@ -44,30 +193,30 @@ MemoryMapReaderAllocation::~MemoryMapReaderAllocation() { /* Here we do not pay attention to the result of shm_unlink, because the memory mapped file may have been cleared due to the MemoryMapFdSet::Clear() */ + + // Code of DataLoader subprocess: + // + // core._array_to_share_memory_tensor(b) + // out_queue.put((idx, tensor_list, structure)) + // core._remove_tensor_list_mmap_fds(tensor_list) + + /* If the tensor is already in the send queue, it will be destructed + * by that function. If the tensor has not been sent yet, it + * will be cleared by MemoryMapFdSet::Clear(). + * If `_remove_tensor_list_mmap_fds` has been interrupted, the + * tensor will be cleared by both methods.
+ * */ + shm_unlink(this->ipc_name().c_str()); MemoryMapFdSet::Instance().Remove(this->ipc_name()); VLOG(3) << "~MemoryMapReaderAllocation: " << this->ipc_name(); } -std::string GetIPCName() { - static std::random_device rd; - std::string handle = "/paddle_"; -#ifdef _WIN32 - handle += std::to_string(GetCurrentProcessId()); -#else - handle += std::to_string(getpid()); -#endif - handle += "_"; - handle += std::to_string(rd()); - return handle; -} - std::shared_ptr AllocateMemoryMapWriterAllocation( size_t size) { const std::string &ipc_name = GetIPCName(); int flags = O_RDWR | O_CREAT; - - int fd = shm_open(ipc_name.c_str(), flags, 0644); + int fd = shm_open(ipc_name.c_str(), flags, 0600); PADDLE_ENFORCE_NE( fd, -1, platform::errors::Unavailable("File descriptor %s open failed", ipc_name.c_str())); @@ -86,12 +235,14 @@ std::shared_ptr AllocateMemoryMapWriterAllocation( std::shared_ptr RebuildMemoryMapReaderAllocation( const std::string &ipc_name, size_t size) { - int fd = shm_open(ipc_name.c_str(), O_RDONLY, 0644); + int flags = O_RDWR | O_CREAT; + flags &= ~O_CREAT; + + int fd = shm_open(ipc_name.c_str(), flags, 0600); PADDLE_ENFORCE_NE( fd, -1, platform::errors::Unavailable("File descriptor %s open failed", ipc_name.c_str())); - - void *ptr = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0); + void *ptr = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); PADDLE_ENFORCE_NE(ptr, MAP_FAILED, platform::errors::Unavailable( "Memory map failed when rebuild shared memory.")); diff --git a/paddle/fluid/memory/allocation/mmap_allocator.h b/paddle/fluid/memory/allocation/mmap_allocator.h index 3f91e5c42780826ae0ef2e61e982da2336d10a3f..4f8dbfbb51e66db227dfcf46bc3ce313d8406dd1 100644 --- a/paddle/fluid/memory/allocation/mmap_allocator.h +++ b/paddle/fluid/memory/allocation/mmap_allocator.h @@ -16,8 +16,9 @@ #ifndef _WIN32 +#include #include -#include // NOLINT +#include #include #include #include @@ -28,6 +29,72 @@ namespace paddle { namespace memory { namespace allocation { +std::string GetIPCName(); + +static constexpr int64_t mmap_alignment = 64; + +enum MappedModes { + MAPPED_SHAREDMEM = 1, + MAPPED_EXCLUSIVE = 2, + MAPPED_NOCREATE = 4, + MAPPED_KEEPFD = 8, + MAPPED_FROMFD = 16, + MAPPED_UNLINK = 32 +}; + +class MemoryMapAllocation : public Allocation { + public: + explicit MemoryMapAllocation(void *ptr, size_t size, std::string ipc_name) + : Allocation(ptr, size, platform::CPUPlace()), + ipc_name_(std::move(ipc_name)), + map_ptr_(ptr), + map_size_(size) {} + explicit MemoryMapAllocation(void *ptr, size_t size, std::string ipc_name, + int flags, int fd) + : Allocation(ptr, size, platform::CPUPlace()), + ipc_name_(std::move(ipc_name)), + fd_(fd), + flags_(flags), + map_ptr_(ptr), + map_size_(size) {} + + inline const std::string &ipc_name() const { return ipc_name_; } + + virtual void close(); + + ~MemoryMapAllocation() override; + + protected: + std::string ipc_name_; + int fd_ = -1; + int flags_ = 0; + void *map_ptr_ = nullptr; + size_t map_size_ = 0; + bool closed_ = false; +}; + +class RefcountedMemoryMapAllocation : public MemoryMapAllocation { + public: + RefcountedMemoryMapAllocation(void *ptr, size_t size, std::string ipc_name, + int flags, int fd); + + void incref(); + int decref(); + void close() override; + virtual ~RefcountedMemoryMapAllocation() { close(); } + + protected: + void initializeRefercount(); + void resetBaseptr(); +}; + +void AllocateMemoryMap(std::string filename, int flags, size_t size, + void **base_ptr_, int *fd_); + +std::shared_ptr 
+AllocateRefcountedMemoryMapAllocation(std::string filename, int flags, + size_t size); + class MemoryMapWriterAllocation : public Allocation { public: explicit MemoryMapWriterAllocation(void *ptr, size_t size, diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 66f1bcc8b68692abe588b6429b027462eaebde24..845d0ed073b32cc136ec6b9d76c9e3073d7b051a 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -1482,6 +1482,20 @@ REGISTER_ACTIVATION_OP(cosh, Cosh, CoshFunctor, CoshGradFunctor); REGISTER_ACTIVATION_OP(asinh, Asinh, AsinhFunctor, AsinhGradFunctor); REGISTER_ACTIVATION_OP(acosh, Acosh, AcoshFunctor, AcoshGradFunctor); REGISTER_ACTIVATION_OP(atanh, Atanh, AtanhFunctor, AtanhGradFunctor); +REGISTER_ACTIVATION_OP(brelu, BRelu, BReluFunctor, BReluGradFunctor); +REGISTER_ACTIVATION_OP(thresholded_relu, ThresholdedRelu, + ThresholdedReluFunctor, ThresholdedReluGradFunctor); +REGISTER_ACTIVATION_OP(hard_shrink, HardShrink, HardShrinkFunctor, + HardShrinkGradFunctor); +REGISTER_ACTIVATION_OP(softshrink, SoftShrink, SoftShrinkFunctor, + SoftShrinkGradFunctor); +REGISTER_ACTIVATION_OP(tanh_shrink, TanhShrink, TanhShrinkFunctor, + TanhShrinkGradFunctor); +REGISTER_ACTIVATION_OP(silu, Silu, SiluFunctor, SiluGradFunctor); +REGISTER_ACTIVATION_OP(hard_sigmoid, HardSigmoid, HardSigmoidFunctor, + HardSigmoidGradFunctor); +REGISTER_ACTIVATION_OP(logsigmoid, LogSigmoid, LogSigmoidFunctor, + LogSigmoidGradFunctor); /* ========================== sigmoid register ============================= */ @@ -1516,30 +1530,6 @@ REGISTER_OPERATOR(sigmoid_triple_grad, ops::SigmoidTripleGradFunctor::FwdDeps()>, ops::ActivationTripleGradOpInplaceInferer); -// Register Sigmoid/GradSigmoid Kernels -REGISTER_ACTIVATION_CPU_KERNEL(sigmoid, Sigmoid, SigmoidFunctor, - SigmoidGradFunctor); - -// Register DoubleGrad Kernel -REGISTER_OP_CPU_KERNEL( - sigmoid_grad_grad, - ops::SigmoidDoubleGradKernel>, - ops::SigmoidDoubleGradKernel>, - ops::SigmoidDoubleGradKernel>); - -// Register TripleGrad Kernel -REGISTER_OP_CPU_KERNEL( - sigmoid_triple_grad, - ops::SigmoidTripleGradKernel>, - ops::SigmoidTripleGradKernel>, - ops::SigmoidTripleGradKernel>); - /* ========================================================================== */ /* ========================== tanh register ============================= */ @@ -1567,23 +1557,6 @@ REGISTER_OPERATOR( ops::ActivationOpTripleGrad::FwdDeps()>, ops::ActivationTripleGradOpInplaceInferer); -REGISTER_ACTIVATION_CPU_KERNEL(tanh, Tanh, TanhFunctor, TanhGradFunctor); -REGISTER_OP_CPU_KERNEL( - tanh_grad_grad, ops::TanhDoubleGradKernel>, - ops::TanhDoubleGradKernel>, - ops::TanhDoubleGradKernel>); -// Register TripleGrad Kernel -REGISTER_OP_CPU_KERNEL( - tanh_triple_grad, - ops::TanhTripeGradKernel>, - ops::TanhTripeGradKernel>, - ops::TanhTripeGradKernel>); /* ========================================================================== */ /* ========================== relu register ============================= */ @@ -1623,16 +1596,6 @@ REGISTER_OPERATOR( ops::ActivationOpDoubleGrad2::FwdDeps()>, ops::ActivationDoubleGradOpInplaceInferer); -REGISTER_ACTIVATION_CPU_KERNEL(leaky_relu, LeakyRelu, LeakyReluFunctor, - LeakyReluGradFunctor); -REGISTER_OP_CPU_KERNEL( - leaky_relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel< - plat::CPUDeviceContext, ops::LeakyReluGradGradFunctor>); /* 
========================================================================== */ /* ======================== elu register ============================ */ @@ -1650,22 +1613,6 @@ REGISTER_OPERATOR( ops::ActivationOpDoubleGrad::FwdDeps()>, ops::ActivationDoubleGradOpInplaceInferer); -REGISTER_OP_CPU_KERNEL(elu, - ops::ActivationKernel>, - ops::ActivationKernel>); -REGISTER_OP_CPU_KERNEL( - elu_grad, ops::ELUGradKernel, - ops::ELUGradKernel); -REGISTER_OP_CPU_KERNEL( - elu_grad_grad, ops::ELUDoubleGradKernel>, - ops::ELUDoubleGradKernel>, - ops::ELUDoubleGradKernel>); - /* ========================================================================== */ /* ======================== logit register ============================ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 4b79397b6cdf2e5c2993f7a72f512cc924c208e7..f1984af6e15eac6682bd341f470727b899e82f3a 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -238,21 +238,20 @@ struct BaseActivationFunctor { AttrPair GetAttrs() { return AttrPair(); } }; -// sigmoid(x) = 1 / (1 + exp(-x)) -template -struct SigmoidFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = static_cast(1) / (static_cast(1) + (-x).exp()); - } -}; - #define USE_PHI_FUNCTOR(name) \ template \ using name##Functor = phi::funcs::name##Functor; \ template \ using name##GradFunctor = phi::funcs::name##GradFunctor; +#define USE_PHI_DOUBLE_GRAD_FUNCTOR(name) \ + template \ + using name##GradGradFunctor = phi::funcs::name##GradGradFunctor; + +#define USE_PHI_TRIPLE_GRAD_FUNCTOR(name) \ + template \ + using name##TripleGradFunctor = phi::funcs::name##TripleGradFunctor; + USE_PHI_FUNCTOR(Cos) USE_PHI_FUNCTOR(Tan) USE_PHI_FUNCTOR(Acos) @@ -264,181 +263,27 @@ USE_PHI_FUNCTOR(Cosh) USE_PHI_FUNCTOR(Asinh) USE_PHI_FUNCTOR(Acosh) USE_PHI_FUNCTOR(Atanh) - -template -struct SigmoidGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * out * (static_cast(1) - out); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -/* - Out - DOut -> SigmoidGradGrad -> DOutNew - DDX DDOut - - DDOut = (1-Out)*Out*DDX - DOutNew = (1-2*Out)*DOut*DDX -*/ -template -struct SigmoidGradGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* Out, - const framework::Tensor* ddX, const framework::Tensor* dOut, - framework::Tensor* dOutNew, framework::Tensor* ddOut) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidGradGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidGradGrad")); - - if (dOutNew) { - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidGradGrad")); - auto dout_new = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SigmoidGradGrad")); - dout_new.device(*d) = - (static_cast(1) - static_cast(2) * out) * dout * ddx; - } - if (ddOut) { - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SigmoidGradGrad")); - ddout.device(*d) = (static_cast(1) - out) * out * ddx; - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -/* - Out - DOut D_Dout - DDx -> SigmoidTripleGrad 
-> D_DDx - D_DDout d_OutNew - D_Dout_new - - D_Dout = (1-2*Out)*DDx*D_Dout_new - D_DDx = (1-Out)*Out*D_DDout + (1-2*Out)*DOut*D_Dout_new - D_OutNew = (DDx-2*Out*DDx)*D_DDout - 2*DOut*DDx*D_Dout_new - - Out, DDX, DOut, D_DDOut, D_DOut_New // input - D_OutNew, D_DOut, D_DDx // output -*/ -template -struct SigmoidTripleGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* Out, - const framework::Tensor* ddX, const framework::Tensor* dOut, - const framework::Tensor* d_DDOut, - const framework::Tensor* d_dOut_New, - framework::Tensor* d_d_Out, framework::Tensor* d_Out_New, - framework::Tensor* d_DDx) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidTripleGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidTripleGrad")); - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidTripleGrad")); - auto d_ddOut = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "SigmoidTripleGrad")); - auto d_dOutNew = framework::EigenVector::Flatten(GET_DATA_SAFELY( - d_dOut_New, "Input", "D_DOut_New", "SigmoidTripleGrad")); - - if (d_Out_New) { - auto d_OutNew = framework::EigenVector::Flatten(GET_DATA_SAFELY( - d_Out_New, "Output", "D_OutNew", "SigmoidTripleGrad")); - d_OutNew.device(*d) = (ddx - static_cast(2) * out * ddx) * d_ddOut - - static_cast(2) * dout * ddx * d_dOutNew; - } - if (d_d_Out) { - auto d_dOut = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "SigmoidTripleGrad")); - d_dOut.device(*d) = - (static_cast(1) - static_cast(2) * out) * ddx * d_dOutNew; - } - if (d_DDx) { - auto d_ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "SigmoidTripleGrad")); - d_ddx.device(*d) = - (static_cast(1) - out) * out * d_ddOut + - (static_cast(1) - static_cast(2) * out) * dout * d_dOutNew; - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -// silu(x) = x / (1 + exp(-x)) -template -struct SiluFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - auto temp = static_cast(1) / (static_cast(1) + (-x).exp()); - out.device(d) = x * temp; - } -}; - -// silu'(x) = (1 / (1 + e^{-x})) * (1 + out * e^{-x})) -template -struct SiluGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto temp1 = static_cast(1) + (-x).exp(); // 1+e^(-x) - auto temp2 = x * (-x).exp(); // x*e^(-x) - dx.device(d) = dout * ((static_cast(1) / temp1) * - (static_cast(1) + (temp2 / temp1))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -// Originally: logsigmoid(x) = -log (1 + exp(-x)) -// For numerical stability, we can use the log-sum-exp trick: -// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/ -// We can rewrite the above equation as: -// out = -log( exp(0) + exp(-x)) [since exp(0) = 1] -// = -log( exp(max(-x, 0) - max(-x, 0)) + exp(-x + max(-x, 0) - max(-x, 0))) -// = -log( exp(max(-x, 0)) * exp(-max(-x, 0)) - exp(max(-x, 0)) * exp(-x - -// max(-x, 0))) -// = -log( exp(max(-x, 0)) * (exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) -// = -log( exp(max(-x, 0)) - log(exp(-max(-x, 0)) + exp(-x - max(-x, 0))) -// -// Hence, logsigmoid(x) = - (max(-x, 0) + log(exp(-max(-x, 0)) -// + exp(-x - 
max(-x, 0)))) -template -struct LogSigmoidFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - auto temp = (-x).cwiseMax(static_cast(0)); // temp = max(-x, 0) - out.device(d) = -temp - (((-temp).exp() + (-x - temp).exp()).log()); - } -}; - -// Originally: f' = exp(-x) / (1 + exp(-x)) -// For numerical stability: f' = exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + -// exp(-x - max(-x, 0))) -template -struct LogSigmoidGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto temp = (-x).cwiseMax(static_cast(0)); // temp = max(-x, 0) - dx.device(d) = - dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp())); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; +USE_PHI_FUNCTOR(Tanh) +USE_PHI_DOUBLE_GRAD_FUNCTOR(Tanh) +USE_PHI_TRIPLE_GRAD_FUNCTOR(Tanh) +USE_PHI_FUNCTOR(BRelu) +USE_PHI_FUNCTOR(ThresholdedRelu) +USE_PHI_FUNCTOR(LeakyRelu) +USE_PHI_DOUBLE_GRAD_FUNCTOR(LeakyRelu) +USE_PHI_FUNCTOR(HardShrink) +USE_PHI_FUNCTOR(SoftShrink) +USE_PHI_FUNCTOR(TanhShrink) +USE_PHI_FUNCTOR(Silu) +USE_PHI_FUNCTOR(ELU) +USE_PHI_DOUBLE_GRAD_FUNCTOR(ELU) +USE_PHI_FUNCTOR(Sigmoid) +USE_PHI_DOUBLE_GRAD_FUNCTOR(Sigmoid) +USE_PHI_TRIPLE_GRAD_FUNCTOR(Sigmoid) +USE_PHI_FUNCTOR(LogSigmoid) +USE_PHI_FUNCTOR(HardSigmoid) + +template +using ELUGradNegativeAlphaFunctor = phi::funcs::ELUGradNegativeAlphaFunctor; // exp(x) = e^x template @@ -497,210 +342,6 @@ using ReluGradGradFunctor = phi::funcs::ReluGradGradFunctor; template using ReluCUDAFunctor = phi::funcs::ReluCUDAFunctor; -// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) -template -struct TanhFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.tanh(); - } -}; - -template -struct TanhGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * (static_cast(1) - out * out); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -template -struct TanhGradGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* Out, - const framework::Tensor* ddX, const framework::Tensor* dOut, - framework::Tensor* dOutNew, framework::Tensor* ddOut) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhGradGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Input", "Out", "TanhGradGrad")); - // tanh grad grad : ddout = (1 - out^2) * ddx, dout = - (dout_old * 2 * out - // * ddx) - if (dOutNew) { - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhGradGrad")); - auto dout_new = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "TanhGradGrad")); - dout_new.device(*d) = - static_cast(-1) * dout * static_cast(2) * out * ddx; - } - if (ddOut) { - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "TanhGradGrad")); - ddout.device(*d) = (static_cast(1) - out * out) * ddx; - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; -/* - Out - DOut D_Dout - DDx -> TanhTripleGrad -> D_DDx - D_DDout d_OutNew - D_Dout_new - - D_Dout = (-2) * Out * DDx * D_Dout_new - D_DDx = (1-Out^2)*D_DDout + (-2) * Out * DOut * D_Dout_new - 
D_OutNew = (-2) * Out * DDx * D_DDout + (-2) * DOut * DDx * D_Dout_new - - Out, DDX, DOut, D_DDOut, D_DOut_New // input - D_OutNew, D_DOut, D_DDx // output -*/ -template -struct TanhTripleGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* Out, - const framework::Tensor* ddX, const framework::Tensor* dOut, - const framework::Tensor* d_DDOut, - const framework::Tensor* d_dOut_New, - framework::Tensor* d_d_Out, framework::Tensor* d_Out_New, - framework::Tensor* d_DDx) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhTripleGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Input", "Out", "TanhTripleGrad")); - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhTripleGrad")); - auto d_ddOut = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "TanhTripleGrad")); - auto d_dOutNew = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_dOut_New, "Input", "D_DOut_New", "TanhTripleGrad")); - - if (d_Out_New) { - auto d_OutNew = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_Out_New, "Output", "D_OutNew", "TanhTripleGrad")); - d_OutNew.device(*d) = (static_cast(-2) * out * ddx * d_ddOut) - - (static_cast(2) * dout * ddx * d_dOutNew); - } - if (d_d_Out) { - auto d_dOut = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "TanhTripleGrad")); - d_dOut.device(*d) = static_cast(-2) * out * ddx * d_dOutNew; - } - if (d_DDx) { - auto d_ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "TanhTripleGrad")); - d_ddx.device(*d) = (static_cast(1) - (out * out)) * d_ddOut - - static_cast(2) * out * dout * d_dOutNew; - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -// tanhshrink(x) = x - tanh(x) -// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) -template -struct TanhShrinkFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x - x.tanh(); - } -}; - -template -struct TanhShrinkGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * (x.tanh() * x.tanh()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -// tanhshrink(x) = x - tanh(x) -// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) -template -struct HardShrinkFunctor : public BaseActivationFunctor { - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - template - void operator()(Device d, X x, Out out) const { - auto temp1 = x < static_cast(threshold * -1.f); - auto temp2 = x > static_cast(threshold); - out.device(d) = x * (temp1 || temp2).template cast(); - } -}; - -template -struct HardShrinkGradFunctor : public BaseActivationFunctor { - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto temp1 = x < static_cast(threshold * -1.f); - auto temp2 = x > static_cast(threshold); - dx.device(d) = dout * (temp1 || temp2).template cast(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -// softshrink(x) = x - lambda, if x > lambda; x + 
lambda, if x < -lambda; 0 -// otherwise -template -struct SoftShrinkFunctor : public BaseActivationFunctor { - float lambda; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"lambda", &lambda}}; - } - - template - void operator()(Device d, X x, Out out) const { - auto lambdaT = static_cast(lambda); - auto temp1 = (x > lambdaT).template cast(); - auto temp2 = (x < -lambdaT).template cast(); - out.device(d) = temp1 * (x - lambdaT) + temp2 * (x + lambdaT); - } -}; - -template -struct SoftShrinkGradFunctor : public BaseActivationFunctor { - float lambda; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"lambda", &lambda}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto lambdaT = static_cast(lambda); - auto temp1 = (x > lambdaT).template cast(); - auto temp2 = (x < -lambdaT).template cast(); - dx.device(d) = dout * (temp1 + temp2).template cast(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - // sqrt(x) = x^(1/2) template struct SqrtFunctor : public BaseActivationFunctor { @@ -909,42 +550,6 @@ struct SquareGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct BReluFunctor : public BaseActivationFunctor { - float t_min; - float t_max; - - // NOTE: Explicit hides the `BaseActivationFunctor::GetAttrs` - // not polymorphism for speed. - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"t_min", &t_min}, {"t_max", &t_max}}; - } - - template - void operator()(Device d, X x, Out out) const { - out.device(d) = - x.cwiseMax(static_cast(t_min)).cwiseMin(static_cast(t_max)); - } -}; - -template -struct BReluGradFunctor : public BaseActivationFunctor { - float t_min; - float t_max; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"t_min", &t_min}, {"t_max", &t_max}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * - ((x > static_cast(t_min)) * (x < static_cast(t_max))) - .template cast(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - // relu6(x) = min(max(0, x), 6) template struct Relu6Functor : public BaseActivationFunctor { @@ -1168,116 +773,28 @@ struct SoftReluGradFunctor : public BaseActivationFunctor { } }; -template -struct LeakyReluFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } +template +class ELUGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* X = context.Input("X"); + auto* Out = context.Input("Out"); + auto* dOut = + context.Input(framework::GradVarName("Out")); + auto* dX = context.Output(framework::GradVarName("X")); + const float alpha = context.Attr("alpha"); + dX->mutable_data(context.GetPlace()); - template - void operator()(Device d, X x, Out out) const { - if (alpha < 1.f) { - out.device(d) = x.cwiseMax(static_cast(alpha) * x); - } else { - out.device(d) = x.cwiseMin(static_cast(alpha) * x); - } - } -}; - -template -struct LeakyReluGradFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto temp1 = - static_cast(alpha) * (x < static_cast(0)).template cast(); - auto temp2 = (x >= 
static_cast(0)).template cast(); - dx.device(d) = dout * (temp1 + temp2).template cast(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -struct ELUFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - template - void operator()(Device d, X x, Out out) const { - out.device(d) = - (x < static_cast(0)) - .select(static_cast(alpha) * (x.exp() - static_cast(1)), x); - } -}; - -template -struct ELUGradFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - // case 1: alpha >= 0 - // dx = dout, if out > 0 - // dx = dout * (out + alpha), if out <= 0 - dx.device(d) = (out > static_cast(0)) - .select(dout, dout * (out + static_cast(alpha))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -struct ELUGradNegativeAlphaFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - // case 2: alpha < 0 - // dx = dout, if x > 0 - // dx = dout * (out + alpha), if x <=0 - dx.device(d) = (x > static_cast(0)) - .select(dout, dout * static_cast(alpha) * x.exp()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -class ELUGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* X = context.Input("X"); - auto* Out = context.Input("Out"); - auto* dOut = - context.Input(framework::GradVarName("Out")); - auto* dX = context.Output(framework::GradVarName("X")); - const float alpha = context.Attr("alpha"); - dX->mutable_data(context.GetPlace()); - - auto x = framework::EigenVector::Flatten( - GET_DATA_SAFELY(X, "Input", "X", "elu_grad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Input", "Out", "elu_grad")); - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Input", "dOut", "elu_grad")); - auto dx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dX, "Output", "dX", "elu_grad")); - auto* place = - context.template device_context().eigen_device(); + auto x = framework::EigenVector::Flatten( + GET_DATA_SAFELY(X, "Input", "X", "elu_grad")); + auto out = framework::EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Input", "Out", "elu_grad")); + auto dout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Input", "dOut", "elu_grad")); + auto dx = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dX, "Output", "dX", "elu_grad")); + auto* place = + context.template device_context().eigen_device(); if (alpha > 0) { ELUGradFunctor functor; @@ -1430,74 +947,6 @@ struct STanhGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct ThresholdedReluFunctor : public BaseActivationFunctor { - float threshold; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - template - void operator()(Device d, X x, Out out) const { - auto th = static_cast(threshold); - out.device(d) = (x > th).template cast() * x; - } -}; - -template -struct ThresholdedReluGradFunctor : public 
BaseActivationFunctor { - float threshold; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto th = static_cast(threshold); - dx.device(d) = dout * (x > th).template cast(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -struct HardSigmoidFunctor : public BaseActivationFunctor { - float slope; - float offset; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"slope", &slope}, {"offset", &offset}}; - } - - template - void operator()(Device d, X x, Out out) const { - auto temp = x * static_cast(slope) + static_cast(offset); - out.device(d) = - temp.cwiseMax(static_cast(0)).cwiseMin(static_cast(1)); - } -}; - -template -struct HardSigmoidGradFunctor : public BaseActivationFunctor { - float slope; - float offset; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"slope", &slope}, {"offset", &offset}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * - ((out > static_cast(0)) * (out < static_cast(1))) - .template cast() * - static_cast(slope); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - template struct SwishFunctor : public BaseActivationFunctor { float beta; @@ -1531,121 +980,6 @@ struct SwishGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -/* - * in arguments: x, out, ddx - * out arguments: ddout, dout, dx - */ -template -inline void ExtractActivationDoubleGradTensor( - const framework::ExecutionContext& ctx, const framework::Tensor** X, - const framework::Tensor** Out, const framework::Tensor** ddX, - framework::Tensor** dX, framework::Tensor** dOut, - framework::Tensor** ddOut) { - auto ddx_var = ctx.InputVar("DDX"); - auto ddo_var = ctx.OutputVar("DDOut"); - PADDLE_ENFORCE_NOT_NULL( - ddx_var, platform::errors::NotFound( - "Cannot get input Variable Out, variable name = %s", - ctx.InputName("DDX"))); - if (CanBeUsedBySelectedRows.count(ctx.Type())) { - *ddX = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*ddx_var); - if (ddo_var) { - *ddOut = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( - ddo_var); - } - } else { - *ddX = ctx.Input("DDX"); - if (ddo_var) { - *ddOut = ctx.Output("DDOut"); - } - } - PADDLE_ENFORCE_NOT_NULL( - *ddX, - platform::errors::NotFound( - "Cannot get the tensor from the Variable Output, variable name = %s", - ctx.OutputName("DDX"))); - - if (static_cast(kDepValue) & static_cast(ActBwdOpFwdDeps::kDepX)) { - auto x_var = ctx.InputVar("X"); - PADDLE_ENFORCE_NOT_NULL( - x_var, platform::errors::NotFound( - "Cannot get input Variable Out, variable name = %s", - ctx.InputName("X"))); - auto dx_var = ctx.OutputVar("DX"); - if (CanBeUsedBySelectedRows.count(ctx.Type())) { - *X = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var); - if (dx_var) { - *dX = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( - dx_var); - } - } else { - *X = ctx.Input("X"); - if (dx_var) { - *dX = ctx.Output("DX"); - } - } - } else { - VLOG(10) << "Inplace activation of Op: " << ctx.Type(); - *X = *ddX; - } - if (static_cast(kDepValue) & - static_cast(ActBwdOpFwdDeps::kDepOut)) { - auto out_var = ctx.InputVar("Out"); - PADDLE_ENFORCE_NOT_NULL( - out_var, - platform::errors::NotFound( - "Cannot get the tensor from the Variable 
Out, variable name = %s", - ctx.InputName("Out"))); - auto dout_var = ctx.OutputVar("DOut"); - if (CanBeUsedBySelectedRows.count(ctx.Type())) { - *Out = - paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*out_var); - if (dout_var) { - *dOut = - paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( - dout_var); - } - } else { - *Out = ctx.Input("Out"); - if (dout_var) { - *dOut = ctx.Output("DOut"); - } - } - } else { - VLOG(10) << "Inplace activation of Op: " << ctx.Type(); - *Out = *ddX; - } -} - -template -class ActivationDoubleGradKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Tensor *X, *Out, *ddX; - X = Out = ddX = nullptr; - framework::Tensor *ddOut, *dOut, *dX; - ddOut = dOut = dX = nullptr; - - ExtractActivationDoubleGradTensor(ctx, &X, &Out, &ddX, - &dX, &dOut, &ddOut); - - if (ddOut) ddOut->mutable_data(ctx.GetPlace()); - if (dOut) dOut->mutable_data(ctx.GetPlace()); - if (dX) dX->mutable_data(Out->dims(), ctx.GetPlace()); - - auto& place = ctx.template device_context(); - - Functor functor; - auto attrs = functor.GetAttrs(); - for (auto& attr : attrs) { - *attr.second = ctx.Attr(attr.first); - } - functor(place, X, Out, ddX, ddOut, dOut, dX); - } -}; - template struct AbsGradGradFunctor : public BaseActivationFunctor { template @@ -1667,73 +1001,6 @@ struct AbsGradGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct LeakyReluGradGradFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - template - void operator()(const Device& dev, const framework::Tensor* X, - const framework::Tensor* Out, const framework::Tensor* ddX, - framework::Tensor* ddOut, framework::Tensor* dOut, - framework::Tensor* dX) const { - if (ddOut) { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "LeakyReluGradGrad")); - auto x = framework::EigenVector::Flatten( - GET_DATA_SAFELY(X, "Input", "X", "LeakyReluGradGrad")); - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DOut", "LeakyReluGradGrad")); - ddout.device(*d) = - ddx * - ((x > static_cast(0)).template cast() + - static_cast(alpha) * (x <= static_cast(0)).template cast()) - .template cast(); - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -struct ELUGradGradFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - template - void operator()(const Device& dev, const framework::Tensor* X, - const framework::Tensor* ddX, framework::Tensor* ddOut, - const framework::Tensor* dOut, framework::Tensor* dX) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "ELUGradGrad")); - auto x = framework::EigenVector::Flatten( - GET_DATA_SAFELY(X, "Input", "X", "ELUGradGrad")); - - if (dX) { - auto dx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dX, "Output", "DX", "ELUGradGrad")); - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Output", "DOut", "ELUGradGrad")); - dx.device(*d) = ddx * dout * static_cast(alpha) * x.exp() * - (x <= static_cast(0)).template cast(); - } - - if (ddOut) { - 
auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ELUGradGrad")); - ddout.device(*d) = ddx * - ((x > static_cast(0)).template cast() + - static_cast(alpha) * x.exp() * - (x <= static_cast(0)).template cast()) - .template cast(); - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct CELUGradGradFunctor : public BaseActivationFunctor { float alpha; @@ -1907,211 +1174,6 @@ inline void ExtractDoubleGradTensorWithInputDOut( } } -template -class SigmoidDoubleGradKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Tensor *Out, *ddX, *dOut; - framework::Tensor *dOutNew, *ddOut; - Out = ddX = dOut = nullptr; - dOutNew = ddOut = nullptr; - // extract ddx(input) and out(input) - ddX = ctx.Input("DDX"); - Out = ctx.Input("Out"); - PADDLE_ENFORCE_NOT_NULL( - ddX, platform::errors::NotFound( - "Cannot get input Variable ddX, variable name = %s", - ctx.InputName("DDX"))); - PADDLE_ENFORCE_NOT_NULL( - Out, platform::errors::NotFound( - "Cannot get input Variable Out, variable name = %s", - ctx.InputName("Out"))); - // set output ddout - ddOut = ctx.Output("DDOut"); - // extract dOut(intput) - dOut = ctx.Input("DOut"); - PADDLE_ENFORCE_NOT_NULL( - dOut, platform::errors::NotFound( - "Cannot get input Variable dOut, variable name = %s", - ctx.InputName("DOut"))); - dOutNew = ctx.Output("DOutNew"); - if (dOutNew) dOutNew->mutable_data(Out->dims(), ctx.GetPlace()); - if (ddOut) ddOut->mutable_data(Out->dims(), ctx.GetPlace()); - auto& place = ctx.template device_context(); - Functor functor; - functor(place, Out, ddX, dOut, dOutNew, ddOut); - } -}; - -// Out, DDX, DOut, D_DDOut, D_DOut_New // input -// D_OutNew, D_DOut, D_DDx // output -template -class SigmoidTripleGradKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Tensor *Out, *ddX, *dOut, *d_ddOut, *d_dOutNew; - framework::Tensor *d_OutNew, *d_dOut, *d_ddx; - Out = ddX = dOut = d_ddOut = d_dOutNew = nullptr; - d_OutNew = d_dOut = d_ddx = nullptr; - - // extract ddx(input), out(input), dOut(input), d_ddOut(input), - // d_dOutNew(input) - ddX = ctx.Input("DDX"); - Out = ctx.Input("Out"); - dOut = ctx.Input("DOut"); - d_ddOut = ctx.Input("D_DDOut"); - d_dOutNew = ctx.Input("D_DOut_New"); - - PADDLE_ENFORCE_NOT_NULL( - ddX, platform::errors::NotFound( - "Cannot get input Variable ddX, variable name = %s", - ctx.InputName("DDX"))); - PADDLE_ENFORCE_NOT_NULL( - Out, platform::errors::NotFound( - "Cannot get input Variable Out, variable name = %s", - ctx.InputName("Out"))); - PADDLE_ENFORCE_NOT_NULL( - dOut, platform::errors::NotFound( - "Cannot get input Variable dOut, variable name = %s", - ctx.InputName("DOut"))); - PADDLE_ENFORCE_NOT_NULL( - d_ddOut, platform::errors::NotFound( - "Cannot get input Variable d_ddOut, variable name = %s", - ctx.InputName("D_DDOut"))); - PADDLE_ENFORCE_NOT_NULL( - d_dOutNew, - platform::errors::NotFound( - "Cannot get input Variable d_dOutNew, variable name = %s", - ctx.InputName("D_DOutNew"))); - - // set output d_OutNew、d_dOut、d_ddx - d_dOut = ctx.Output("D_DOut"); - d_OutNew = ctx.Output("D_OutNew"); - d_ddx = ctx.Output("D_DDx"); - - if (d_dOut) d_dOut->mutable_data(Out->dims(), ctx.GetPlace()); - if (d_OutNew) d_OutNew->mutable_data(Out->dims(), ctx.GetPlace()); - 
if (d_ddx) d_ddx->mutable_data(ddX->dims(), ctx.GetPlace()); - auto& place = ctx.template device_context(); - Functor functor; - functor(place, Out, ddX, dOut, d_ddOut, d_dOutNew, // input - d_dOut, d_OutNew, d_ddx); // output - } -}; - -template -class TanhDoubleGradKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Tensor *Out, *ddX, *dOut; - framework::Tensor *dOutNew, *ddOut; - Out = ddX = dOut = nullptr; - dOutNew = ddOut = nullptr; - - // extract ddx(input) and out(input) - auto ddx_var = ctx.InputVar("DDX"); - auto out_var = ctx.InputVar("Out"); - PADDLE_ENFORCE_NOT_NULL( - ddx_var, platform::errors::NotFound( - "Cannot get input Variable ddx, variable name = %s", - ctx.InputName("DDX"))); - PADDLE_ENFORCE_NOT_NULL( - out_var, platform::errors::NotFound( - "Cannot get input Variable out, variable name = %s", - ctx.InputName("Out"))); - ddX = ctx.Input("DDX"); - Out = ctx.Input("Out"); - - // set output ddout - auto ddout_var = ctx.OutputVar("DDOut"); - if (ddout_var) { - ddOut = ctx.Output("DDOut"); - } - - // extract dOut(intput) - auto dout_var = ctx.InputVar("DOut"); - PADDLE_ENFORCE_NOT_NULL( - dout_var, platform::errors::NotFound( - "Cannot get input Variable dout_var, variable name = %s", - ctx.InputName("DOut"))); - dOut = ctx.Input("DOut"); - - // set output dout_new - auto dout_new_var = ctx.OutputVar("DOutNew"); - if (dout_new_var) { - dOutNew = ctx.Output("DOutNew"); - } - - if (dOutNew) dOutNew->mutable_data(Out->dims(), ctx.GetPlace()); - if (ddOut) ddOut->mutable_data(Out->dims(), ctx.GetPlace()); - auto& place = ctx.template device_context(); - Functor functor; - functor(place, Out, ddX, dOut, dOutNew, ddOut); - } -}; - -template -class TanhTripeGradKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Tensor *Out, *ddX, *dOut, *d_ddOut, *d_dOutNew; - framework::Tensor *d_OutNew, *d_dOut, *d_ddx; - Out = ddX = dOut = d_ddOut = d_dOutNew = nullptr; - d_OutNew = d_dOut = d_ddx = nullptr; - - // extract ddx(input), out(input), dOut(input), d_ddOut(input), - // d_dOutNew(input) - ddX = ctx.Input("DDX"); - Out = ctx.Input("Out"); - dOut = ctx.Input("DOut"); - d_ddOut = ctx.Input("D_DDOut"); - d_dOutNew = ctx.Input("D_DOut_New"); - - PADDLE_ENFORCE_NOT_NULL( - ddX, platform::errors::NotFound( - "Cannot get input Variable ddX, variable name = %s", - ctx.InputName("DDX"))); - PADDLE_ENFORCE_NOT_NULL( - Out, platform::errors::NotFound( - "Cannot get input Variable Out, variable name = %s", - ctx.InputName("Out"))); - PADDLE_ENFORCE_NOT_NULL( - dOut, platform::errors::NotFound( - "Cannot get input Variable dOut, variable name = %s", - ctx.InputName("DOut"))); - PADDLE_ENFORCE_NOT_NULL( - d_ddOut, platform::errors::NotFound( - "Cannot get input Variable d_ddOut, variable name = %s", - ctx.InputName("D_DDOut"))); - PADDLE_ENFORCE_NOT_NULL( - d_dOutNew, - platform::errors::NotFound( - "Cannot get input Variable d_dOutNew, variable name = %s", - ctx.InputName("D_DOutNew"))); - - // set output d_OutNew、d_dOut、d_ddx - d_dOut = ctx.Output("D_DOut"); - d_OutNew = ctx.Output("D_OutNew"); - d_ddx = ctx.Output("D_DDx"); - - if (d_dOut) d_dOut->mutable_data(Out->dims(), ctx.GetPlace()); - if (d_OutNew) d_OutNew->mutable_data(Out->dims(), ctx.GetPlace()); - if (d_ddx) d_ddx->mutable_data(ddX->dims(), ctx.GetPlace()); - auto& place 
= ctx.template device_context(); - Functor functor; - functor(place, Out, ddX, dOut, d_ddOut, d_dOutNew, // input - d_dOut, d_OutNew, d_ddx); // output - } -}; - template class SquareDoubleGradKernel : public framework::OpKernel { @@ -2493,29 +1555,19 @@ struct LogGradGradFunctor : public BaseActivationFunctor { } // namespace operators } // namespace paddle -#define FOR_EACH_ACTIVATION_OP(__macro) \ - __macro(silu, Silu, SiluFunctor, SiluGradFunctor); \ - __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ - __macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ - __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor); \ - __macro(floor, Floor, FloorFunctor, ZeroGradFunctor); \ - __macro(round, Round, RoundFunctor, ZeroGradFunctor); \ - __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ - __macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); \ - __macro(log2, Log2, Log2Functor, Log2GradFunctor); \ - __macro(log10, Log10, Log10Functor, Log10GradFunctor); \ - __macro(brelu, BRelu, BReluFunctor, BReluGradFunctor); \ - __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \ - __macro(stanh, STanh, STanhFunctor, STanhGradFunctor); \ - __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor); \ - __macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor); \ - __macro(relu6, Relu6, Relu6Functor, Relu6GradFunctor); \ - __macro(tanh_shrink, TanhShrink, TanhShrinkFunctor, TanhShrinkGradFunctor); \ - __macro(hard_shrink, HardShrink, HardShrinkFunctor, HardShrinkGradFunctor); \ - __macro(hard_sigmoid, HardSigmoid, HardSigmoidFunctor, \ - HardSigmoidGradFunctor); \ - __macro(swish, Swish, SwishFunctor, SwishGradFunctor); \ - __macro(thresholded_relu, ThresholdedRelu, ThresholdedReluFunctor, \ - ThresholdedReluGradFunctor); \ - __macro(mish, Mish, MishFunctor, MishGradFunctor); \ +#define FOR_EACH_ACTIVATION_OP(__macro) \ + __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor); \ + __macro(floor, Floor, FloorFunctor, ZeroGradFunctor); \ + __macro(round, Round, RoundFunctor, ZeroGradFunctor); \ + __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ + __macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); \ + __macro(log2, Log2, Log2Functor, Log2GradFunctor); \ + __macro(log10, Log10, Log10Functor, Log10GradFunctor); \ + __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \ + __macro(stanh, STanh, STanhFunctor, STanhGradFunctor); \ + __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor); \ + __macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor); \ + __macro(relu6, Relu6, Relu6Functor, Relu6GradFunctor); \ + __macro(swish, Swish, SwishFunctor, SwishGradFunctor); \ + __macro(mish, Mish, MishFunctor, MishGradFunctor); \ __macro(hard_swish, HardSwish, HardSwishFunctor, HardSwishGradFunctor); diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps index 92a101451e211f912e5390171654affa3be4e973..7c1b288080162e2a5bf847a795fc640ab5e5e4e1 100644 --- a/paddle/fluid/operators/activation_op.kps +++ b/paddle/fluid/operators/activation_op.kps @@ -15,170 +15,11 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/phi/kernels/funcs/activation_functor.h" + namespace paddle { namespace operators { -template -struct CudaLeakyReluFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float alpha; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - // leakyrelu(x) = x > 0 ? x : alpha * x - __device__ __forceinline__ T operator()(const T x) const { - return x > zero ? x : static_cast(alpha) * x; - } -}; - -template -struct CudaLeakyReluGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float alpha; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - // dx = dout * (x > 0 ? 1 : alpha) - __device__ __forceinline__ T operator()(const T dout, const T x) const { - return x > zero ? dout : static_cast(alpha) * dout; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -struct CudaSigmoidFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // sigmoid(x) = 1 / (1 + exp(-x)) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(one / (one + exp(-x))); - } -}; - -template -struct CudaSigmoidGradFunctor : public BaseActivationFunctor { - T one = static_cast(1.0f); - - // dx = dout * out * (1 - out) - __device__ __forceinline__ T operator()(const T dout, const T out) const { - return dout * out * (one - out); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -template -struct CudaSiluFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // silu(x) = x / (1 + exp(-x)) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(x / (one + exp(-x))); - } -}; - -template -struct CudaSiluGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // dx = dout * (1 + exp(-x) + x * exp(-x) / (1 + exp(-x))^2) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - MPType temp = one / (one + exp(-x)); - return static_cast(dout * (temp * (one + x * (one - temp)))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -struct CudaLogSigmoidFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType zero = static_cast(0.0f); - - // logsigmoid(x) = log(1 / (1 + exp(-x))) - // For numerical stability, - // logsigmoid(x) = - // - (max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - MPType temp = x > zero ? 
zero : -x; - return static_cast(-temp - log(exp(-temp) + exp(-x - temp))); - } -}; - -template -struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType zero = static_cast(0.0f); - - // dx = dout * exp(-x) / (1 + exp(-x)) - // For numerical stability: - // dx = dout * exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + exp(-x - max(-x, - // 0))) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - MPType temp1 = x > zero ? zero : -x; - MPType temp2 = exp(-x - temp1); - return static_cast(dout * (temp2 / (exp(-temp1) + temp2))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -struct CudaSoftShrinkFunctor : public BaseActivationFunctor { - float lambda; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"lambda", &lambda}}; - } - - // softshrink(x) = x - lambda, if x > lambda; - // x + lambda, if x < -lambda; - // 0, otherwise. - __device__ __forceinline__ T operator()(const T x) const { - T l = static_cast(lambda); - T temp1 = static_cast(x > l); - T temp2 = static_cast(x < -l); - return temp1 * (x - l) + temp2 * (x + l); - } -}; - -template -struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float lambda; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"lambda", &lambda}}; - } - - // dx = dout, if x > lambda or x < -lambda else 0 - __device__ __forceinline__ T operator()(const T dout, const T x) const { - T l = static_cast(lambda); - return (x >= -l && x <= l) ? zero : dout; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct CudaCeilFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; @@ -224,31 +65,6 @@ struct CudaZeroGradFunctor : public BaseActivationFunctor { } }; -template -struct CudaTanhFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // tanh(x) = tanh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(tanh(x)); - } -}; - -template -struct CudaTanhGradFunctor : public BaseActivationFunctor { - T one = static_cast(1.0f); - - // dx = dout * (1 - out^2) - __device__ __forceinline__ T operator()(const T dout, const T out) const { - return dout * (one - out * out); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - template struct CudaReciprocalFunctor : public BaseActivationFunctor { T one = static_cast(1.0f); @@ -476,45 +292,6 @@ struct CudaLog10GradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct CudaBReluFunctor : public BaseActivationFunctor { - float t_min; - float t_max; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"t_min", &t_min}, {"t_max", &t_max}}; - } - - // brelu(x) = min(max(x, t_min), t_max) - __device__ __forceinline__ T operator()(const T x) const { - T t_min_cast = static_cast(t_min); - T t_max_cast = static_cast(t_max); - T temp_max = x > t_min_cast ? x : t_min_cast; - T temp_min = temp_max < t_max_cast ? 
temp_max : t_max_cast; - return temp_min; - } -}; - -template -struct CudaBReluGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float t_min; - float t_max; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"t_min", &t_min}, {"t_max", &t_max}}; - } - - // dx = (x > t_min && x < t_max) ? dout : 0 - __device__ __forceinline__ T operator()(const T dout, const T x) const { - T t_min_cast = static_cast(t_min); - T t_max_cast = static_cast(t_max); - return (x > t_min_cast && x < t_max_cast) ? dout : zero; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct CudaSoftReluFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; @@ -711,109 +488,6 @@ struct CudaRelu6GradFunctor : public BaseActivationFunctor { } }; -template -struct CudaTanhShrinkFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // tanhshrink(x) = x - tanh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(x - tanh(x)); - } -}; - -template -struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * tanh(x)^2 - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * tanh(x) * tanh(x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -struct CudaHardShrinkFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - // hadrshrink(x) = (x > -threshold && x < threshold) ? 0 : x - __device__ __forceinline__ T operator()(const T x) const { - T t = static_cast(threshold); - return (x > -t && x < t) ? zero : x; - } -}; - -template -struct CudaHardShrinkGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - // dx = (x > -threshold && x < threshold) ? 0 : dout - __device__ __forceinline__ T operator()(const T dout, const T x) const { - T t = static_cast(threshold); - return (x > -t && x < t) ? zero : dout; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -struct CudaHardSigmoidFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - T one = static_cast(1.0f); - float slope; - float offset; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"slope", &slope}, {"offset", &offset}}; - } - - // hard_sigmoid(x) = 0, when x <= -3 - // 1, when x >= 3 - // x * slope + offset, otherwise - __device__ __forceinline__ T operator()(const T x) const { - T temp = x * static_cast(slope) + static_cast(offset); - T temp_max = temp > zero ? temp : zero; - T temp_min = temp_max < one ? temp_max : one; - return temp_min; - } -}; - -template -struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - T one = static_cast(1.0f); - float slope; - float offset; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"slope", &slope}, {"offset", &offset}}; - } - - // dx = (out > 0 && out < 1) ? 
dout * slope : 0 - __device__ __forceinline__ T operator()(const T dout, const T out) const { - return (out > zero && out < one) ? dout * static_cast(slope) : zero; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - template struct CudaSwishFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; @@ -907,38 +581,6 @@ struct CudaMishGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct CudaThresholdedReluFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - // thresholded_relu(x) = x > threshold ? x : 0 - __device__ __forceinline__ T operator()(const T x) const { - return x > static_cast(threshold) ? x : zero; - } -}; - -template -struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - // dx = x > threshold ? dout : 0 - __device__ __forceinline__ T operator()(const T dout, const T x) const { - return x > static_cast(threshold) ? dout : zero; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct CudaHardSwishFunctor : public BaseActivationFunctor { T zero = static_cast(0.0f); @@ -991,110 +633,6 @@ struct CudaHardSwishGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct CudaELUFunctor : public BaseActivationFunctor { - using CT = typename details::MPTypeTrait::Type; - CT zero = static_cast(0.0f); - CT one = static_cast(1.0f); - float alpha; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - // elu(x) = x, if x > 0 - // elu(x) = alpha * (e^x - 1), if x <= 0 - __device__ __forceinline__ T operator()(const T arg_x) const { - CT x = static_cast(arg_x); - CT temp = static_cast(alpha) * (exp(x) - one); - CT res = x > zero ? 
x : temp; - return static_cast(res); - } -}; - -template -struct CudaELUGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType zero = static_cast(0.0f); - float alpha; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - // case 1: alpha >= 0 - // dx = dout, if out > 0 - // dx = dout * (out + alpha), if out <= 0 - __device__ __forceinline__ T operator()(T arg_dout, T arg_out) const { - MPType dout = static_cast(arg_dout); - MPType out = static_cast(arg_out); - MPType a = static_cast(alpha); - MPType out_pos = static_cast(out > zero); - MPType out_neg = static_cast(out <= zero); - return static_cast(dout * (out_pos + out_neg * (out + a))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -template -struct CudaELUGradNegativeAlphaFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType zero = static_cast(0.0f); - float alpha; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - // case 2: alpha < 0 - // dx = dout, if x > 0 - // dx = dout * (out + alpha), if x <=0 - __device__ __forceinline__ T operator()(const T arg_dout, const T arg_out, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType out = static_cast(arg_out); - MPType x = static_cast(arg_x); - MPType a = static_cast(alpha); - MPType x_pos = static_cast(x > zero); - MPType x_neg = static_cast(x <= zero); - return static_cast(dout * (x_pos + x_neg * (out + a))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -class ELUGradCudaKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto* out = ctx.Input("Out"); - auto* x = ctx.Input("X"); - auto* d_x = ctx.Output(framework::GradVarName("X")); - d_x->mutable_data(ctx.GetPlace()); - const float alpha = ctx.Attr("alpha"); - - auto& dev_ctx = ctx.device_context(); - std::vector ins = {d_out, out}; - std::vector outs = {d_x}; - if (alpha > 0) { - CudaELUGradFunctor functor; - functor.alpha = alpha; - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); - } else { - CudaELUGradNegativeAlphaFunctor functor; - functor.alpha = alpha; - ins.push_back(x); - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); - } - } -}; - template struct CudaCELUFunctor : public BaseActivationFunctor { using CT = typename details::MPTypeTrait::Type; @@ -1212,6 +750,34 @@ class ActivationGradCudaKernel } }; +USE_PHI_FUNCTOR(CudaCos) +USE_PHI_FUNCTOR(CudaTan) +USE_PHI_FUNCTOR(CudaAcos) +USE_PHI_FUNCTOR(CudaSin) +USE_PHI_FUNCTOR(CudaAsin) +USE_PHI_FUNCTOR(CudaAtan) +USE_PHI_FUNCTOR(CudaSinh) +USE_PHI_FUNCTOR(CudaCosh) +USE_PHI_FUNCTOR(CudaAsinh) +USE_PHI_FUNCTOR(CudaAcosh) +USE_PHI_FUNCTOR(CudaAtanh) +USE_PHI_FUNCTOR(CudaTanh) +USE_PHI_FUNCTOR(CudaBRelu) +USE_PHI_FUNCTOR(CudaLeakyRelu) +USE_PHI_FUNCTOR(CudaThresholdedRelu) +USE_PHI_FUNCTOR(CudaHardShrink) +USE_PHI_FUNCTOR(CudaSoftShrink) +USE_PHI_FUNCTOR(CudaTanhShrink) +USE_PHI_FUNCTOR(CudaSilu) +USE_PHI_FUNCTOR(CudaELU) +USE_PHI_FUNCTOR(CudaSigmoid) +USE_PHI_FUNCTOR(CudaLogSigmoid) +USE_PHI_FUNCTOR(CudaHardSigmoid) + +template +using CudaELUGradNegativeAlphaFunctor = + phi::funcs::CudaELUGradNegativeAlphaFunctor; + } // namespace operators } // namespace paddle @@ -1270,40 
+836,6 @@ namespace plat = paddle::platform; ops::ActivationGradCudaKernel>); -/* ======================== leaky relu register ============================ */ -REGISTER_ACTIVATION_CUDA_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor, - CudaLeakyReluGradFunctor); - -REGISTER_OP_CUDA_KERNEL( - leaky_relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel< - plat::CUDADeviceContext, ops::LeakyReluGradGradFunctor>); -/* ========================================================================== */ - -/* ======================== elu register ============================ */ -REGISTER_OP_CUDA_KERNEL( - elu, ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>); -REGISTER_OP_CUDA_KERNEL( - elu_grad, ops::ELUGradCudaKernel, - ops::ELUGradCudaKernel, - ops::ELUGradCudaKernel); - -REGISTER_OP_CUDA_KERNEL( - elu_grad_grad, ops::ELUDoubleGradKernel>, - ops::ELUDoubleGradKernel>, - ops::ELUDoubleGradKernel>); /* ========================================================================== */ /* ======================== celu register ============================ */ @@ -1319,58 +851,6 @@ REGISTER_OP_CUDA_KERNEL( ops::CELUGradGradFunctor>); /* ========================================================================== */ -/* =========================== sigmoid register ============================ - */ -REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor, - CudaSigmoidGradFunctor); - -REGISTER_OP_CUDA_KERNEL( - sigmoid_grad_grad, - ops::SigmoidDoubleGradKernel>, - ops::SigmoidDoubleGradKernel>, - ops::SigmoidDoubleGradKernel>, - ops::SigmoidDoubleGradKernel>); - -REGISTER_OP_CUDA_KERNEL( - sigmoid_triple_grad, - ops::SigmoidTripleGradKernel>, - ops::SigmoidTripleGradKernel>, - ops::SigmoidTripleGradKernel>, - ops::SigmoidTripleGradKernel< - plat::CUDADeviceContext, - ops::SigmoidTripleGradFunctor>); -/* ========================================================================== */ - -/* =========================== tanh register ============================ */ -REGISTER_ACTIVATION_CUDA_KERNEL(tanh, Tanh, CudaTanhFunctor, - CudaTanhGradFunctor); - -REGISTER_OP_CUDA_KERNEL( - tanh_grad_grad, - ops::TanhDoubleGradKernel>, - ops::TanhDoubleGradKernel>, - ops::TanhDoubleGradKernel>); - -REGISTER_OP_CUDA_KERNEL( - tanh_triple_grad, - ops::TanhTripeGradKernel>, - ops::TanhTripeGradKernel>, - ops::TanhTripeGradKernel>); -/* ========================================================================== */ - /* =========================== sqrt register ============================= */ REGISTER_ACTIVATION_CUDA_KERNEL(sqrt, Sqrt, CudaSqrtFunctor, CudaSqrtGradFunctor); @@ -1508,9 +988,6 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ #define FOR_EACH_ACTIVATION_CUDA_OP(__macro) \ - __macro(silu, Silu, CudaSiluFunctor, CudaSiluGradFunctor); \ - __macro(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, \ - CudaLogSigmoidGradFunctor); \ __macro(softshrink, SoftShrink, CudaSoftShrinkFunctor, \ CudaSoftShrinkGradFunctor); \ __macro(ceil, Ceil, CudaCeilFunctor, CudaZeroGradFunctor); \ @@ -1521,7 +998,6 @@ REGISTER_OP_CUDA_KERNEL( __macro(log1p, Log1p, CudaLog1pFunctor, CudaLog1pGradFunctor); \ __macro(log2, Log2, CudaLog2Functor, CudaLog2GradFunctor); \ __macro(log10, Log10, CudaLog10Functor, CudaLog10GradFunctor); \ - __macro(brelu, BRelu, CudaBReluFunctor, CudaBReluGradFunctor); \ __macro(soft_relu, SoftRelu, CudaSoftReluFunctor, CudaSoftReluGradFunctor); \ 
__macro(stanh, STanh, CudaSTanhFunctor, CudaSTanhGradFunctor); \ __macro(softplus, Softplus, CudaSoftplusFunctor, CudaSoftplusGradFunctor); \ @@ -1531,74 +1007,228 @@ REGISTER_OP_CUDA_KERNEL( CudaTanhShrinkGradFunctor); \ __macro(hard_shrink, HardShrink, CudaHardShrinkFunctor, \ CudaHardShrinkGradFunctor); \ - __macro(hard_sigmoid, HardSigmoid, CudaHardSigmoidFunctor, \ - CudaHardSigmoidGradFunctor); \ __macro(swish, Swish, CudaSwishFunctor, CudaSwishGradFunctor); \ __macro(mish, Mish, CudaMishFunctor, CudaMishGradFunctor); \ - __macro(thresholded_relu, ThresholdedRelu, CudaThresholdedReluFunctor, \ - CudaThresholdedReluGradFunctor); \ __macro(hard_swish, HardSwish, CudaHardSwishFunctor, \ CudaHardSwishGradFunctor); FOR_EACH_ACTIVATION_CUDA_OP(REGISTER_ACTIVATION_CUDA_KERNEL) #ifdef PADDLE_WITH_XPU_KP -#define REGISTER_ACTIVATION_XPU_KERNEL(act_type, op_name, functor, \ - grad_functor) \ - REGISTER_OP_KERNEL( \ - act_type, KP, plat::XPUPlace, \ - ops::ActivationCudaKernel>); \ - REGISTER_OP_KERNEL(act_type##_grad, KP, plat::XPUPlace, \ - ops::ActivationGradCudaKernel>); - -REGISTER_ACTIVATION_XPU_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor, - CudaLeakyReluGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor, - CudaSigmoidGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(exp, Exp, CudaExpFunctor, CudaExpGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(log, Log, CudaLogFunctor, CudaLogGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(reciprocal, Reciprocal, CudaReciprocalFunctor, - CudaReciprocalGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(softplus, Softplus, CudaSoftplusFunctor, - CudaSoftplusGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(hard_swish, HardSwish, CudaHardSwishFunctor, - CudaHardSwishGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(elu, Elu, CudaELUFunctor, CudaELUGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(celu, Celu, CudaCELUFunctor, - CudaCELUGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(sqrt, Sqrt, CudaSqrtFunctor, - CudaSqrtGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(square, Square, CudaSquareFunctor, - CudaSquareGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(silu, Silu, CudaSiluFunctor, - CudaSiluGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, - CudaLogSigmoidGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(softshrink, SoftShrink, CudaSoftShrinkFunctor, - CudaSoftShrinkGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(ceil, Ceil, CudaCeilFunctor, - CudaZeroGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(floor, Floor, CudaFloorFunctor, - CudaZeroGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(log1p, Log1p, CudaLog1pFunctor, - CudaLog1pGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(brelu, BRelu, CudaBReluFunctor, - CudaBReluGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(soft_relu, SoftRelu, CudaSoftReluFunctor, - CudaSoftReluGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(softsign, Softsign, CudaSoftsignFunctor, - CudaSoftsignGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(relu6, Relu6, CudaRelu6Functor, - CudaRelu6GradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(hard_shrink, HardShrink, CudaHardShrinkFunctor, - CudaHardShrinkGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(hard_sigmoid, HardSigmoid, - CudaHardSigmoidFunctor, - CudaHardSigmoidGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(swish, Swish, CudaSwishFunctor, - CudaSwishGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(thresholded_relu, ThresholdedRelu, - CudaThresholdedReluFunctor, - CudaThresholdedReluGradFunctor); +REGISTER_OP_KERNEL( + brelu, KP, plat::XPUPlace, + 
ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + brelu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(ceil, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + ceil_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(celu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + celu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(elu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + elu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(exp, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + exp_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(floor, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + floor_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + hard_shrink, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + hard_shrink_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + hard_sigmoid, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + hard_sigmoid_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(hard_swish, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + hard_swish_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + leaky_relu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + leaky_relu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(log, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + log_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(log1p, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + log1p_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + logsigmoid, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + logsigmoid_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + reciprocal, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + reciprocal_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + relu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + relu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(relu6, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + relu6_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(sigmoid, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + sigmoid_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(silu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + silu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(soft_relu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + soft_relu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(softplus, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + softplus_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + softshrink, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + 
softshrink_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(softsign, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + softsign_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(sqrt, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + sqrt_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(square, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + square_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(swish, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + swish_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + thresholded_relu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + thresholded_relu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); #endif // PADDLE_WITH_XPU_KP diff --git a/paddle/fluid/operators/allclose_op.cc b/paddle/fluid/operators/allclose_op.cc index 8fb9929c39e9223303f4427f1a0d7e1ed66134d4..88d7cb7c1f5f4bf47dc82f8632116424253d6d19 100644 --- a/paddle/fluid/operators/allclose_op.cc +++ b/paddle/fluid/operators/allclose_op.cc @@ -12,52 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/allclose_op.h" #include #include + +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { -template -struct GetTensorValue { - T operator()(const platform::CPUDeviceContext& dev_ctx, - const framework::Tensor& tensor) const { - return *(tensor.data()); - } -}; - -template -struct AllcloseFunctor { - void operator()(const platform::CPUDeviceContext& ctx, - const framework::Tensor& in, const framework::Tensor& other, - const double rtol, const double atol, bool equal_nan, - framework::Tensor* output) { - auto* in_a = in.data(); - auto* in_b = other.data(); - auto* out_data = output->mutable_data(ctx.GetPlace()); - auto num = in.numel(); - *out_data = true; - for (int i = 0; i < num; i++) { - const T a = in_a[i], b = in_b[i]; - bool val; - if (std::isnan(a) || std::isnan(b)) { - val = equal_nan && std::isnan(a) == std::isnan(b); - } else { - T left = (a > b ? a - b : b - a); - T right = atol + (b > 0 ? rtol * b : (-rtol) * b); - T diff = (left > right ? 
left - right : right - left); - val = a == b || left <= right || diff <= 1e-15; - } - *out_data &= val; - } - } -}; - class AllcloseOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -96,40 +64,6 @@ class AllcloseOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Allclose"); - OP_INOUT_CHECK(ctx->HasInput("Other"), "Input", "Other", "Allclose"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Allclose"); - - auto input_dim = ctx->GetInputDim("Input"); - auto other_dim = ctx->GetInputDim("Other"); - PADDLE_ENFORCE_EQ(input_dim.size(), other_dim.size(), - platform::errors::PreconditionNotMet( - "Input(Input) and Input(Other) must have the same " - "dimension size.")); - int n = input_dim.size(); - bool is_runtime = ctx->IsRuntime(); - for (int i = 0; i < n; i++) { - if (is_runtime) { - PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i], - platform::errors::PreconditionNotMet( - "The value at dim %d of Input(Input) is not " - "equal to the Input(Other): %ld != %ld.", - i, input_dim[i], other_dim[i])); - } else { - if (!(input_dim[i] < 0 || other_dim[i] < 0)) { - PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i], - platform::errors::PreconditionNotMet( - "The value at dim %d of Input(Input) is not " - "equal to the Input(Other): %ld != %ld.", - i, input_dim[i], other_dim[i])); - } - } - } - - ctx->SetOutputDim("Out", phi::make_ddim({1})); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -152,13 +86,13 @@ class AllcloseOpVarTypeInference : public framework::VarTypeInference { namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; +DECLARE_INFER_SHAPE_FUNCTOR(allclose, AllcloseInferShapeFunctor, + PD_INFER_META(phi::AllValueCompareInferMeta)); REGISTER_OPERATOR( allclose, ops::AllcloseOp, ops::AllcloseOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker, - ops::AllcloseOpVarTypeInference); -REGISTER_OP_CPU_KERNEL(allclose, ops::AllcloseKernel, - ops::AllcloseKernel); + ops::AllcloseOpVarTypeInference, AllcloseInferShapeFunctor); /* ========================== register checkpoint ===========================*/ REGISTER_OP_VERSION(allclose) diff --git a/paddle/fluid/operators/allclose_op.cu b/paddle/fluid/operators/allclose_op.cu deleted file mode 100644 index 32c90ff8fdc109b30b140f0f70b336615ce93c17..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/allclose_op.cu +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
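// Illustrative sketch (hypothetical helper, not from the Paddle sources): the
// CPU functor removed above and the CUDA functor removed below perform the
// same elementwise tolerance test, presumably migrated to the phi kernel
// library by this patch. A minimal host-side reference version:
#include <cmath>
#include <cstddef>

bool AllcloseReference(const double* a, const double* b, std::size_t n,
                       double rtol, double atol, bool equal_nan) {
  bool result = true;
  for (std::size_t i = 0; i < n; ++i) {
    const double x = a[i], y = b[i];
    bool ok;
    if (std::isnan(x) || std::isnan(y)) {
      // NaNs compare equal only when equal_nan is set and both sides are NaN.
      ok = equal_nan && (std::isnan(x) == std::isnan(y));
    } else {
      // |x - y| <= atol + rtol * |y|, with a tiny slack against rounding,
      // mirroring the removed functor bodies.
      const double left = std::fabs(x - y);
      const double right = atol + rtol * std::fabs(y);
      ok = (x == y) || left <= right || std::fabs(left - right) <= 1e-15;
    }
    result = result && ok;
  }
  return result;
}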
- -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/allclose_op.h" - -namespace paddle { -namespace operators { - -template -struct GetTensorValue { - T operator()(const platform::CUDADeviceContext& dev_ctx, - const framework::Tensor& tensor) const { - const T* data = tensor.data(); - T value; - const auto gpu_place = dev_ctx.GetPlace(); - memory::Copy(platform::CPUPlace(), &value, gpu_place, data, sizeof(T), - dev_ctx.stream()); - return value; - } -}; - -template -__global__ void AllcloseCUDAKernel(const T* in_data, const T* other_data, - const double rtol, const double atol, - bool equal_nan, int num, bool* out_data) { - unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; - bool val; - for (int i = idx; i < num; i += blockDim.x * gridDim.x) { - const T a = in_data[i], b = other_data[i]; - if (isnan(a) || isnan(b)) { - val = equal_nan && isnan(a) == isnan(b); - } else { - T left = (a > b ? a - b : b - a); - T right = atol + (b > 0 ? rtol * b : (-rtol) * b); - T diff = (left > right ? left - right : right - left); - val = a == b || left <= right || diff <= 1e-15; - } - if (!val) *out_data = false; - } -} - -template -struct AllcloseFunctor { - void operator()(const platform::CUDADeviceContext& dev_ctx, - const framework::Tensor& in, const framework::Tensor& other, - const double rtol, const double atol, bool equal_nan, - framework::Tensor* output) { - int num = in.numel(); - const T* in_data = in.data(); - const T* other_data = other.data(); - bool* out_data = output->mutable_data(dev_ctx.GetPlace()); - int block = 1024; - int grid = (block - 1 + num) / block; - grid = (grid > block) ? block : grid; -#ifdef PADDLE_WITH_HIP - hipMemset(out_data, true, sizeof(bool)); -#else - cudaMemset(out_data, true, sizeof(bool)); -#endif - AllcloseCUDAKernel<<>>( - in_data, other_data, rtol, atol, equal_nan, num, out_data); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; -REGISTER_OP_CUDA_KERNEL(allclose, ops::AllcloseKernel, - ops::AllcloseKernel); diff --git a/paddle/fluid/operators/allclose_op.h b/paddle/fluid/operators/allclose_op.h deleted file mode 100644 index 7a36754194ace5fad14d5a77e9d0be7f1c182087..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/allclose_op.h +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { -using Tensor = framework::Tensor; - -template -struct GetTensorValue { - T operator()(const platform::DeviceContext& ctx, - const framework::Tensor& tensor) const; -}; - -template -struct AllcloseFunctor { - void operator()(const DeviceContext& ctx, const framework::Tensor& in, - const framework::Tensor& other, const float rtol, - const float atol, bool equal_nan, framework::Tensor* output); -}; - -template -class AllcloseKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - // get attrs - bool equal_nan = ctx.Attr("equal_nan"); - // get input/output - const auto* input = ctx.Input("Input"); - const auto* other = ctx.Input("Other"); - auto* out = ctx.Output("Out"); - - double rtol_v = std::stod(ctx.Attr("rtol")); - double atol_v = std::stod(ctx.Attr("atol")); - - auto& dev_ctx = ctx.template device_context(); - GetTensorValue get_tensor_value; - if (ctx.HasInput("Rtol")) { - const auto* rtol = ctx.Input("Rtol"); - PADDLE_ENFORCE_EQ( - rtol->numel(), 1, - platform::errors::InvalidArgument( - "Input(Rtol) size must be 1, but get %d.", rtol->numel())); - PADDLE_ENFORCE_EQ( - framework::TransToProtoVarType(rtol->dtype()), - framework::proto::VarType::FP64, - platform::errors::InvalidArgument( - "Input(Rtol) type must be double, but get %s.", - framework::DataTypeToString( - framework::TransToProtoVarType(rtol->dtype())))); - rtol_v = get_tensor_value(dev_ctx, *rtol); - } - if (ctx.HasInput("Atol")) { - const auto* atol = ctx.Input("Atol"); - PADDLE_ENFORCE_EQ( - atol->numel(), 1, - platform::errors::InvalidArgument( - "Input(Atol) size must be 1, but get %d", atol->numel())); - PADDLE_ENFORCE_EQ( - framework::TransToProtoVarType(atol->dtype()), - framework::proto::VarType::FP64, - platform::errors::InvalidArgument( - "Input(Atol) type must be double, but get %s", - framework::DataTypeToString( - framework::TransToProtoVarType(atol->dtype())))); - atol_v = get_tensor_value(dev_ctx, *atol); - } - - AllcloseFunctor()(dev_ctx, *input, *other, rtol_v, atol_v, - equal_nan, out); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc new file mode 100644 index 0000000000000000000000000000000000000000..237cfcc6f1172518097863158ca6dbd595af4186 --- /dev/null +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc @@ -0,0 +1,88 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class CheckFiniteAndUnscaleMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto& dev_ctx = ctx.template device_context(); + const auto xs = ctx.MultiInput("X"); + const auto* scale = ctx.Input("Scale"); + auto outs = ctx.MultiOutput("Out"); + auto* found_inf = ctx.Output("FoundInfinite"); + + found_inf->mutable_data(dev_ctx.GetPlace()); + + MLUCnnlTensorDesc scale_desc(*scale); + MLUCnnlTensorDesc found_inf_desc(*found_inf, CNNL_LAYOUT_ARRAY, + ToCnnlDataType()); + + for (size_t i = 0; i < xs.size(); ++i) { + const auto* x = xs[i]; + auto* out = outs[i]; + out->mutable_data(ctx.GetPlace()); + + // check is_finite or is_nan + Tensor is_finite(found_inf->type()); + if (i != 0) { + is_finite.Resize(phi::make_ddim({1})); + is_finite.mutable_data(ctx.GetPlace()); + } else { + is_finite.ShareDataWith(*found_inf); + } + + MLUCnnlTensorDesc x_desc(*x); + + MLUCnnl::IsNanInf(ctx, x_desc.get(), GetBasePtr(x), + GetBasePtr(&is_finite)); + + // save is_finite by logical_and op after checking every input + if (i != 0) { + MLUCnnlTensorDesc is_finite_desc(is_finite, CNNL_LAYOUT_ARRAY, + ToCnnlDataType()); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_OR, found_inf_desc.get(), + GetBasePtr(found_inf), is_finite_desc.get(), + GetBasePtr(&is_finite), found_inf_desc.get(), + GetBasePtr(found_inf)); + } + + // The normal logic is : + // out = in, if found_inf = true + // out = in/scale, if found_inf = false + // But when found_inf is true, the data of Out should not be used. + // So, on MLU, we always compute out with in/scale. + MLUCnnlTensorDesc out_desc(*out); + MLUCnnl::Div(ctx, CNNL_COMPUTATION_HIGH_PRECISION, x_desc.get(), + GetBasePtr(x), scale_desc.get(), GetBasePtr(scale), + out_desc.get(), GetBasePtr(out)); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_MLU_KERNEL(check_finite_and_unscale, + ops::CheckFiniteAndUnscaleMLUKernel, + ops::CheckFiniteAndUnscaleMLUKernel); diff --git a/paddle/fluid/operators/assign_op.cc b/paddle/fluid/operators/assign_op.cc index 684ac5bafd0ef430f8424614104a865b3cbe29c6..ea6614cbfbdf874df029ab349f4373f27e5c8e21 100644 --- a/paddle/fluid/operators/assign_op.cc +++ b/paddle/fluid/operators/assign_op.cc @@ -16,6 +16,9 @@ limitations under the License. */ #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace framework { class OpDesc; @@ -36,26 +39,6 @@ class AssignOp : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - if (ctx->HasInput("X")) { - auto type = ctx->GetInputsVarType("X")[0]; - if (type == framework::proto::VarType::SELECTED_ROWS || - type == framework::proto::VarType::LOD_TENSOR) { - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - if (type == framework::proto::VarType::LOD_TENSOR) { - ctx->ShareLoD("X", /*->*/ "Out"); - } - } else if (type == framework::proto::VarType::LOD_TENSOR_ARRAY) { - if (ctx->IsRuntime()) { - // The runtime output shape is determined in kernel. 
- return; - } else { - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - } - } - } - } - protected: framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, const framework::Tensor &tensor, @@ -91,24 +74,6 @@ class AssignInferVarType : public framework::VarTypeInference { } }; -class AssignKernel { - public: - void operator()(const framework::ExecutionContext &ctx) const { - auto *x = ctx.InputVar("X"); - if (x == nullptr) { - return; - } - PADDLE_ENFORCE_EQ( - ctx.HasOutput("Out"), true, - platform::errors::NotFound("Output(Out) of assign_op is not found.")); - auto *out = ctx.OutputVar("Out"); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(ctx.GetPlace()); - - framework::VisitVarType(*x, AssignFunctor(out, dev_ctx)); - } -}; - class AssignOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -147,23 +112,11 @@ DECLARE_INPLACE_OP_INFERER(AssignOpInplaceInferer, {"X", "Out"}); namespace ops = paddle::operators; namespace plat = paddle::platform; + +DECLARE_INFER_SHAPE_FUNCTOR(assign, AssignInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(assign, ops::AssignOp, ops::AssignGradMaker, ops::AssignGradMaker, ops::AssignOpProtoMaker, ops::AssignOpInplaceInferer, - ops::AssignInferVarType); - -REGISTER_OP_CPU_KERNEL_FUNCTOR(assign, float, ops::AssignKernel, double, - ops::AssignKernel, int, ops::AssignKernel, - int64_t, ops::AssignKernel, uint8_t, - ops::AssignKernel, bool, ops::AssignKernel, - plat::float16, ops::AssignKernel, plat::bfloat16, - ops::AssignKernel); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -REGISTER_OP_CUDA_KERNEL_FUNCTOR(assign, float, ops::AssignKernel, double, - ops::AssignKernel, int, ops::AssignKernel, - int64_t, ops::AssignKernel, uint8_t, - ops::AssignKernel, bool, ops::AssignKernel, - plat::float16, ops::AssignKernel); -#endif + ops::AssignInferVarType, AssignInferShapeFunctor); diff --git a/paddle/fluid/operators/assign_op_npu_test.cc b/paddle/fluid/operators/assign_op_npu_test.cc index b452dea8536dd98d6d4060d5224e39daf9137c50..b91eb50646feca30046915248d45ee2e91cabc39 100644 --- a/paddle/fluid/operators/assign_op_npu_test.cc +++ b/paddle/fluid/operators/assign_op_npu_test.cc @@ -29,7 +29,7 @@ limitations under the License. */ namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(assign); +USE_OP_ITSELF(assign); USE_OP_DEVICE_KERNEL(assign, NPU); template diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 174207deb08b84194d6f20fe04e4c27245295caf..5194c8772e47bca5ec728079b4b2dce883e39c22 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -21,6 +21,9 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/infermeta/multiary.h" + namespace paddle { namespace operators { @@ -297,184 +300,6 @@ The required data format for this layer is one of the following: )DOC"); } -template -class BatchNormKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const float epsilon = ctx.Attr("epsilon"); - float momentum = ctx.Attr("momentum"); - const bool is_test = ctx.Attr("is_test"); - const bool use_global_stats = ctx.Attr("use_global_stats"); - const bool trainable_stats = ctx.Attr("trainable_statistics"); - bool test_mode = is_test && (!trainable_stats); - - bool global_stats = test_mode || use_global_stats; - - const std::string data_layout_str = ctx.Attr("data_layout"); - DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - - const auto *x = ctx.Input("X"); - const auto &x_dims = x->dims(); - PADDLE_ENFORCE_GE( - x_dims.size(), 2, - platform::errors::InvalidArgument( - "The size of input X's dimensions should be larger than 1." - "But received: the size of input X's dimensions is [%d]", - x_dims.size())); - PADDLE_ENFORCE_LE( - x_dims.size(), 5, - platform::errors::InvalidArgument( - "The size of input X's dimensions should be less than 6." - "But received: the size of input X's dimensionss is [%d]", - x_dims.size())); - const int N = x_dims[0]; - const int C = - (data_layout == DataLayout::kNCHW ? x_dims[1] - : x_dims[x_dims.size() - 1]); - const int sample_size = x->numel() / N / C; - - auto *y = ctx.Output("Y"); - - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); - - // alloc memory - y->mutable_data(ctx.GetPlace()); - mean_out->mutable_data(ctx.GetPlace()); - variance_out->mutable_data(ctx.GetPlace()); - saved_mean->mutable_data(ctx.GetPlace()); - saved_variance->mutable_data(ctx.GetPlace()); - - // input dimension is 2 and the format is NCHW. The input can be regarded - // as NHWC format - if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) { - data_layout = DataLayout::kNHWC; - } - - if (!global_stats) { - // saved_xx is use just in this batch of data - EigenVectorArrayMap saved_mean_e( - saved_mean->mutable_data(ctx.GetPlace()), C); - EigenVectorArrayMap saved_variance_e( - saved_variance->mutable_data(ctx.GetPlace()), C); - saved_mean_e.setZero(); - saved_variance_e.setZero(); - - EigenVectorArrayMap running_mean_arr( - mean_out->mutable_data(ctx.GetPlace()), C); - EigenVectorArrayMap running_var_arr( - variance_out->mutable_data(ctx.GetPlace()), C); - - if ((N * sample_size) == 1) { - // Only 1 element in normalization dimension, - // we skip the batch norm calculation, let y = x. 
- framework::TensorCopy(*x, ctx.GetPlace(), y); - return; - } - - switch (data_layout) { - case DataLayout::kNCHW: { - ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); - for (int nc = 0; nc < N * C; ++nc) { - saved_mean_e(nc % C) += x_arr.col(nc).sum(); - } - saved_mean_e /= N * sample_size; - for (int nc = 0; nc < N * C; ++nc) { - saved_variance_e(nc % C) += - (x_arr.col(nc) - saved_mean_e(nc % C)).matrix().squaredNorm(); - } - saved_variance_e /= N * sample_size; - break; - } - case DataLayout::kNHWC: { - ConstEigenArrayMap x_arr(x->data(), C, N * sample_size); - for (int i = 0; i < N * sample_size; ++i) { - saved_mean_e += x_arr.col(i); - } - saved_mean_e /= N * sample_size; - for (int i = 0; i < N * sample_size; ++i) { - saved_variance_e += - (x_arr.col(i) - saved_mean_e) * (x_arr.col(i) - saved_mean_e); - } - saved_variance_e /= N * sample_size; - break; - } - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Unknown storage order: %s", data_layout_str)); - } - - // if MomentumTensor is set, use MomentumTensor value, momentum - // is only used in this training branch - if (ctx.HasInput("MomentumTensor")) { - const auto *mom_tensor = ctx.Input("MomentumTensor"); - momentum = mom_tensor->data()[0]; - } - - running_mean_arr = - running_mean_arr * momentum + saved_mean_e * (1. - momentum); - running_var_arr = - running_var_arr * momentum + saved_variance_e * (1. - momentum); - } - - // use SavedMean and SavedVariance to do normalize - Eigen::Array inv_std(C); - if (global_stats) { - ConstEigenVectorArrayMap var_arr( - ctx.Input("Variance")->data(), C); - inv_std = (var_arr + epsilon).sqrt().inverse(); - } else { - EigenVectorArrayMap saved_inv_std( - ctx.Output("SavedVariance")->data(), C); - // inverse SavedVariance first, gradient will use it too. - saved_inv_std = (saved_inv_std + epsilon).inverse().sqrt(); - inv_std = saved_inv_std; - } - ConstEigenVectorArrayMap mean_arr( - global_stats ? 
ctx.Input("Mean")->data() - : ctx.Output("SavedMean")->data(), - C); - - // ((x - est_mean) * (inv_var) * scale + bias - // formula transform ====> - // (x * inv_var * scale) + (bias - est_mean * inv_var * scale) - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - ConstEigenVectorArrayMap scale_arr(scale->data(), C); - ConstEigenVectorArrayMap bias_arr(bias->data(), C); - Eigen::Array new_scale = inv_std * scale_arr; - Eigen::Array new_bias = - bias_arr - mean_arr * inv_std * scale_arr; - - switch (data_layout) { - case DataLayout::kNCHW: { - EigenArrayMap y_arr(y->mutable_data(ctx.GetPlace()), sample_size, - N * C); - ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); - for (int nc = 0; nc < N * C; ++nc) { - y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C); - } - break; - } - case DataLayout::kNHWC: { - EigenArrayMap(y->mutable_data(ctx.GetPlace()), C, - N * sample_size) = - (ConstEigenArrayMap(x->data(), C, N * sample_size).colwise() * - new_scale) - .colwise() + - new_bias; - break; - } - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Unknown storage order: %d", data_layout)); - } - } -}; - void BatchNormGradOp::InferShape(framework::InferShapeContext *ctx) const { // check input OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale", "BatchNormGrad"); @@ -585,261 +410,6 @@ framework::OpKernelType BatchNormGradOp::GetKernelTypeForVar( tensor.place(), tensor.layout()); } -template -class BatchNormGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - const auto *saved_mean = ctx.Input("SavedMean"); - // SavedVariance have been reverted in forward operator - const auto *saved_inv_variance = ctx.Input("SavedVariance"); - const std::string data_layout_str = ctx.Attr("data_layout"); - bool use_global_stats = ctx.Attr("use_global_stats"); - const bool is_test = ctx.Attr("is_test"); - const float epsilon = ctx.Attr("epsilon"); - DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - use_global_stats = is_test || use_global_stats; - - // batch_norm with inplace as false will take X as grad input, which - // is same as cuDNN batch_norm backward calculation, batch_norm - // with inplace as true only take Y as input and X should be calculate - // by inverse operation of batch_norm on Y - const Tensor *x; - bool is_inplace; - if (ctx.HasInput("Y")) { - x = ctx.Input("Y"); - is_inplace = true; - // if the input of batch norm is stop_gradient, d_x is null. - if (d_x) { - PADDLE_ENFORCE_EQ(d_x, d_y, - platform::errors::InvalidArgument( - "X@GRAD and Y@GRAD not inplace in inplace mode")); - } - } else { - x = ctx.Input("X"); - is_inplace = false; - if (d_x) { - PADDLE_ENFORCE_NE( - d_x, d_y, platform::errors::InvalidArgument( - "X@GRAD and Y@GRAD inplaced in non-inplace mode")); - } - } - - // Get the size for each dimension. - // NCHW [batch_size, in_channels, in_height, in_width] - const auto &x_dims = x->dims(); - PADDLE_ENFORCE_GE( - x_dims.size(), 2, - platform::errors::InvalidArgument( - "The size of input X's dimensions should be larger than 1." 
- "But received: the size of input X's dimensions is [%d]", - x_dims.size())); - PADDLE_ENFORCE_LE( - x_dims.size(), 5, - platform::errors::InvalidArgument( - "The size of input X's dimensions should be less than 6." - "But received: the size of input X's dimensions is [%d]", - x_dims.size())); - const int N = x_dims[0]; - const int C = - (data_layout == DataLayout::kNCHW ? x_dims[1] - : x_dims[x_dims.size() - 1]); - const int sample_size = x->numel() / N / C; - - // input dimension is 2 and the format is NCHW. The input can be regarded as - // NHWC format - if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) { - data_layout = DataLayout::kNHWC; - } - - // init output - if (d_x) { - d_x->mutable_data(ctx.GetPlace()); - } - - const T *mean_data = saved_mean->data(); - const T *inv_var_data = saved_inv_variance->data(); - Tensor inv_var_tensor; - if (use_global_stats) { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_variance = ctx.Input("Variance"); - mean_data = running_mean->data(); - inv_var_tensor.Resize({C}); - T *running_inv_var_data = inv_var_tensor.mutable_data(ctx.GetPlace()); - EigenVectorArrayMap inv_var_tmp(running_inv_var_data, C); - ConstEigenVectorArrayMap var_arr(running_variance->data(), C); - - inv_var_tmp = (var_arr + epsilon).sqrt().inverse(); - inv_var_data = running_inv_var_data; - } - - ConstEigenVectorArrayMap scale_arr(scale->data(), C); - ConstEigenVectorArrayMap bias_arr(bias->data(), C); - ConstEigenVectorArrayMap mean_arr(mean_data, C); - ConstEigenVectorArrayMap inv_var_arr(inv_var_data, C); - - T *d_bias_data = nullptr; - T *d_scale_data = nullptr; - if (d_scale && d_bias) { - d_scale->mutable_data(ctx.GetPlace()); - d_bias->mutable_data(ctx.GetPlace()); - d_bias_data = d_bias->mutable_data(ctx.GetPlace()); - d_scale_data = d_scale->mutable_data(ctx.GetPlace()); - } - - // d_bias = np.sum(d_y, axis=0) - // d_scale = np.sum((X - mean) / inv_std * dy, axis=0) - // d_x = (1. / N) * scale * inv_var * (N * d_y - np.sum(d_y, axis=0) - // - (X - mean) * inv_var * inv_var * np.sum(d_y * (X - mean), axis=0)) - EigenVectorArrayMap d_bias_arr(d_bias_data, C); - EigenVectorArrayMap d_scale_arr(d_scale_data, C); - - if (d_scale && d_bias) { - d_bias_arr.setZero(); - d_scale_arr.setZero(); - } - - if (d_x && (N * sample_size) == 1 && !use_global_stats) { - framework::TensorCopy(*d_y, ctx.GetPlace(), d_x); - return; - } - - int scale_coefff = use_global_stats ? 
1 : N * sample_size; - const auto scale_inv_var_nhw = scale_arr * inv_var_arr / scale_coefff; - - Tensor dy_sum; - dy_sum.Resize({C}); - dy_sum.mutable_data(ctx.GetPlace()); - EigenVectorArrayMap dy_sum_arr(dy_sum.mutable_data(ctx.GetPlace()), - C); - - Tensor dy_mul_x_sub_mean_mul_invstd_sum; - dy_mul_x_sub_mean_mul_invstd_sum.Resize({C}); - dy_mul_x_sub_mean_mul_invstd_sum.mutable_data(ctx.GetPlace()); - EigenVectorArrayMap dy_mul_x_sub_mean_mul_invstd_sum_arr( - dy_mul_x_sub_mean_mul_invstd_sum.mutable_data(ctx.GetPlace()), C); - - dy_sum_arr.setZero(); - dy_mul_x_sub_mean_mul_invstd_sum_arr.setZero(); - - // inplace calculation - // Y: ((x - est_mean) * (inv_var) * scale + bias - // formula transform ====> - // (x * inv_var * scale) + (bias - est_mean * inv_var * scale) - // X: (y - bias) / scale / (inv_var) + est_mean - // formula transform ====> - // (y - bias) / (scale * inv_var) + est_mean - switch (data_layout) { - case DataLayout::kNCHW: { - if (is_inplace) { - auto px = *x; - EigenArrayMap x_data(px.mutable_data(ctx.GetPlace()), - sample_size, N * C); - ConstEigenArrayMap y_data(x->data(), sample_size, N * C); - for (int nc = 0; nc < N * C; ++nc) { - x_data.col(nc) = (y_data.col(nc) - bias_arr(nc % C)) / - scale_inv_var_nhw(nc % C) / scale_coefff + - mean_arr(nc % C); - } - } - ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); - ConstEigenArrayMap d_y_arr(d_y->data(), sample_size, N * C); - - for (int nc = 0; nc < N * C; ++nc) { - int c = nc % C; - dy_sum_arr(c) += d_y_arr.col(nc).sum(); - dy_mul_x_sub_mean_mul_invstd_sum_arr(c) += - ((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * d_y_arr.col(nc)) - .sum(); - } - - if (d_scale && d_bias) { - d_bias_arr = dy_sum_arr; - d_scale_arr = dy_mul_x_sub_mean_mul_invstd_sum_arr; - } - - if (d_x) { - EigenArrayMap d_x_arr(d_x->mutable_data(ctx.GetPlace()), - sample_size, N * C); - if (!use_global_stats) { - for (int nc = 0; nc < N * C; ++nc) { - int c = nc % C; - d_x_arr.col(nc) = - scale_inv_var_nhw(c) * - (d_y_arr.col(nc) * N * sample_size - dy_sum_arr(c) - - (x_arr.col(nc) - mean_arr[c]) * - dy_mul_x_sub_mean_mul_invstd_sum_arr(c) * - inv_var_arr(c)); - } - } else { - for (int nc = 0; nc < N * C; ++nc) { - int c = nc % C; - d_x_arr.col(nc) = scale_inv_var_nhw(c) * d_y_arr.col(nc); - } - } - } - break; - } - case DataLayout::kNHWC: { - if (is_inplace) { - auto px = *x; - EigenArrayMap x_data(px.mutable_data(ctx.GetPlace()), C, - N * sample_size); - ConstEigenArrayMap y_data(x->data(), C, N * sample_size); - for (int nhw = 0; nhw < N * sample_size; nhw++) { - x_data.col(nhw) = (y_data.col(nhw) - bias_arr) / scale_inv_var_nhw / - scale_coefff + - mean_arr; - } - } - ConstEigenArrayMap x_arr(x->data(), C, N * sample_size); - ConstEigenArrayMap d_y_arr(d_y->data(), C, N * sample_size); - - for (int nhw = 0; nhw < N * sample_size; ++nhw) { - dy_sum_arr += d_y_arr.col(nhw); - dy_mul_x_sub_mean_mul_invstd_sum_arr += - (x_arr.col(nhw) - mean_arr) * inv_var_arr * d_y_arr.col(nhw); - } - - if (d_scale && d_bias) { - d_bias_arr = dy_sum_arr; - d_scale_arr = dy_mul_x_sub_mean_mul_invstd_sum_arr; - } - - if (d_x) { - EigenArrayMap d_x_arr(d_x->mutable_data(ctx.GetPlace()), C, - N * sample_size); - if (!use_global_stats) { - for (int nhw = 0; nhw < N * sample_size; ++nhw) { - d_x_arr.col(nhw) = - scale_inv_var_nhw * - (d_y_arr.col(nhw) * N * sample_size - dy_sum_arr - - (x_arr.col(nhw) - mean_arr) * - dy_mul_x_sub_mean_mul_invstd_sum_arr * inv_var_arr); - } - } else { - for (int nhw = 0; nhw < N * sample_size; ++nhw) { - d_x_arr.col(nhw) = 
scale_inv_var_nhw * d_y_arr.col(nhw); - } - } - } - break; - } - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Unknown storage order: %s", data_layout_str)); - } - } -}; - template void BatchNormGradMaker::Apply(GradOpPtr op) const { op->SetType(this->ForwardOpType() + "_grad"); @@ -951,335 +521,16 @@ framework::OpKernelType BatchNormDoubleGradOp::GetExpectedKernelType( OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); } -template -class BatchNormDoubleGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *X = ctx.Input("X"); - const auto *Scale = ctx.Input("Scale"); - const auto *dY = ctx.Input("DY"); - const auto *Saved_mean = ctx.Input("SavedMean"); - const auto *Saved_variance = ctx.Input("SavedVariance"); - const float epsilon = ctx.Attr("epsilon"); - const bool use_global_stats = ctx.Attr("use_global_stats"); - const bool is_test = ctx.Attr("is_test"); - - PADDLE_ENFORCE_EQ( - is_test, false, - platform::errors::InvalidArgument( - "`is_test = True` CANNOT be used in train program. If " - "you want to use global status in pre_train model, " - "please set `use_global_stats = True`")); - - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - - const auto *ddX = ctx.Input("DDX"); - const auto *ddScale = ctx.Input("DDScale"); - const auto *ddBias = ctx.Input("DDBias"); - - auto *dX = ctx.Output("DX"); - auto *dScale = ctx.Output("DScale"); - auto *ddY = ctx.Output("DDY"); - dX->mutable_data(ctx.GetPlace()); - ddY->mutable_data(ctx.GetPlace()); - - auto &dev_ctx = ctx.template device_context(); - - const auto &x_dims = X->dims(); - const int C = - (data_layout == DataLayout::kNCHW ? 
x_dims[1] - : x_dims[x_dims.size() - 1]); - const int sample_size = X->numel() / C; - phi::funcs::SetConstant set_constant; - - const T *mean_data = Saved_mean->data(); - const T *inv_var_data = Saved_variance->data(); - - Tensor inv_var_tensor; - if (use_global_stats) { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_variance = ctx.Input("Variance"); - mean_data = running_mean->data(); - inv_var_tensor.Resize({C}); - - T *running_inv_var_data = inv_var_tensor.mutable_data(ctx.GetPlace()); - EigenVectorArrayMap inv_var_tmp(running_inv_var_data, C); - ConstEigenVectorArrayMap var_arr(running_variance->data(), C); - - inv_var_tmp = (var_arr + epsilon).sqrt().inverse(); - inv_var_data = running_inv_var_data; - } - - // transpose NCHW -> NHWC for easy calculate - Tensor transformed_x(X->type()); - Tensor transformed_dy(dY->type()); - Tensor transformed_ddx(ddX->type()); - - Tensor transformed_dx(dX->type()); - Tensor transformed_ddy(ddY->type()); - if (data_layout == DataLayout::kNCHW && x_dims.size() > 2) { - VLOG(3) << "Transform batchnorm output from NCHW to NHWC"; - // Input Tensor - ResizeToChannelLast(ctx, X, - &transformed_x); - TransToChannelLast(ctx, X, &transformed_x); - ResizeToChannelLast(ctx, dY, - &transformed_dy); - TransToChannelLast(ctx, dY, - &transformed_dy); - ResizeToChannelLast(ctx, ddX, - &transformed_ddx); - TransToChannelLast(ctx, ddX, - &transformed_ddx); - // Output Tensor - ResizeToChannelLast(ctx, dX, - &transformed_dx); - ResizeToChannelLast(ctx, ddY, - &transformed_ddy); - } else { - transformed_x.ShareDataWith(*X); - transformed_dy.ShareDataWith(*dY); - transformed_ddx.ShareDataWith(*ddX); - - transformed_dx.ShareDataWith(*dX); - transformed_ddy.ShareDataWith(*ddY); - } - - ConstEigenArrayMap x_arr(transformed_x.data(), C, sample_size); - ConstEigenVectorArrayMap mean_arr(mean_data, C); - ConstEigenVectorArrayMap inv_var_arr(inv_var_data, C); - - Tensor mean_tile; - mean_tile.Resize({C, sample_size}); - mean_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap mean_tile_data(mean_tile.mutable_data(ctx.GetPlace()), - C, sample_size); - - Tensor inv_var_tile; - inv_var_tile.Resize({C, sample_size}); - inv_var_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap inv_var_tile_data( - inv_var_tile.mutable_data(ctx.GetPlace()), C, sample_size); - - mean_tile_data = mean_arr.replicate(1, sample_size); - inv_var_tile_data = inv_var_arr.replicate(1, sample_size); - - Tensor Scale_data; - if (!Scale) { - Scale_data.mutable_data({C}, ctx.GetPlace()); - set_constant(dev_ctx, &Scale_data, static_cast(1)); - } - ConstEigenVectorArrayMap scale_arr( - Scale ? 
Scale->data() : Scale_data.data(), C); - - Tensor scale_tile; - scale_tile.Resize({C, sample_size}); - scale_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap scale_tile_data(scale_tile.mutable_data(ctx.GetPlace()), - C, sample_size); - scale_tile_data = scale_arr.replicate(1, sample_size); - - ConstEigenArrayMap dy_arr(transformed_dy.data(), C, sample_size); - ConstEigenArrayMap ddx_arr(transformed_ddx.data(), C, sample_size); - - Tensor x_sub_mean_mul_invstd; - x_sub_mean_mul_invstd.Resize({C, sample_size}); - x_sub_mean_mul_invstd.mutable_data(ctx.GetPlace()); - EigenArrayMap x_sub_mean_mul_invstd_arr( - x_sub_mean_mul_invstd.mutable_data(ctx.GetPlace()), C, sample_size); - x_sub_mean_mul_invstd_arr = (x_arr - mean_tile_data) * inv_var_tile_data; - - if (dX) { - dX->mutable_data(ctx.GetPlace()); - EigenArrayMap dx_arr(transformed_dx.mutable_data(ctx.GetPlace()), C, - sample_size); - dx_arr.setZero(); - if (use_global_stats) { - // math: dx = (ddscale * dy) * inv_var - if (ddScale) { - ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); - Tensor ddscale_tile; - ddscale_tile.Resize({C, sample_size}); - EigenArrayMap ddscale_tile_data( - ddscale_tile.mutable_data(ctx.GetPlace()), C, sample_size); - ddscale_tile_data = ddscale_arr.replicate(1, sample_size); - - dx_arr = dy_arr * ddscale_tile_data * inv_var_tile_data; - } - } else { - // math: dx = scale * ((x - mean) * inv_var / NxHxW * (np.mean(ddx, - // axis=(n,h,w)) * - // np.sum(dy, axis=(n,h,w)) - - // np.sum(dy * ddx, axis=(n,h,w)) + 3 * np.mean(dy * (x - - // mean), - // axis=(n,h,w)) * inv_var.pow(2) * - // np.sum(ddx * (x - mean), axis=(n,h,w))) + inv_var.pow(3) / - // NxHxW * - // np.sum(ddx * (x - mean)) * - // (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW * - // np.sum(dy, - // axis=(n,h,w)) * (x - mean) * - // (np.mean(ddx, axis=(n,h,w)) - ddx)) + ddr * (dy * inv_var - - // inv_var - // * - // np.mean(dy, axis=(n,h,w)) - - // inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean), - // axis=(n,h,w))) - - if (ddX) { - dx_arr += - (x_sub_mean_mul_invstd_arr * inv_var_tile_data * - inv_var_tile_data / sample_size) - .colwise() * - (ddx_arr.rowwise().sum() * dy_arr.rowwise().sum() / sample_size - - (dy_arr * ddx_arr).rowwise().sum() + - 3. 
* (dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() * - (ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() / - sample_size); - - dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() * - (ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() / - sample_size * - (dy_arr.rowwise().sum() / sample_size - dy_arr); - - dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() * - (dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() / - sample_size * - (ddx_arr.rowwise().sum() / sample_size - ddx_arr); - - dx_arr = scale_tile_data * dx_arr; - } - if (ddScale) { - ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); - Tensor ddscale_tile; - ddscale_tile.Resize({C, sample_size}); - EigenArrayMap ddscale_tile_data( - ddscale_tile.mutable_data(ctx.GetPlace()), C, sample_size); - ddscale_tile_data = ddscale_arr.replicate(1, sample_size); - - dx_arr += (dy_arr * inv_var_tile_data - - (dy_arr.rowwise().sum().replicate(1, sample_size) / - sample_size) * - inv_var_tile_data - - x_sub_mean_mul_invstd_arr * inv_var_tile_data * - (dy_arr * x_sub_mean_mul_invstd_arr) - .rowwise() - .sum() - .replicate(1, sample_size) / - sample_size) * - ddscale_tile_data; - } - } - if (data_layout == DataLayout::kNCHW) { - VLOG(3) << "Transform batchnorm output from NHWC to NCHW"; - TransToChannelFirst( - ctx, &transformed_dx, dX); - } - } - if (dScale) { - dScale->mutable_data(ctx.GetPlace()); - EigenVectorArrayMap dscale_arr(dScale->mutable_data(ctx.GetPlace()), - C); - dscale_arr.setZero(); - if (use_global_stats) { - // math: dscale = np.sum(ddx * dy, axis=(n,h,w)) * inv_var - if (ddX) { - dscale_arr = (ddx_arr * dy_arr * inv_var_tile_data).rowwise().sum(); - } - } else { - // math: dscale = inv_var * (dy - np.mean(dy, axis=(n,h,w) - (x-mean) * - // inv_var.pow(2) * np.mean(dy * (x-mean), axis=(n,h,w)))) * - // ddx - if (ddX) { - Tensor first_grad; - first_grad.Resize({C, sample_size}); - EigenArrayMap first_grad_arr( - first_grad.mutable_data(ctx.GetPlace()), C, sample_size); - first_grad_arr.setZero(); - - first_grad_arr += - inv_var_tile_data * - (dy_arr - - dy_arr.rowwise().sum().replicate(1, sample_size) / sample_size - - x_sub_mean_mul_invstd_arr * - (dy_arr * x_sub_mean_mul_invstd_arr) - .rowwise() - .sum() - .replicate(1, sample_size) / - sample_size); - dscale_arr = (first_grad_arr * ddx_arr).rowwise().sum(); - } - } - } - - if (ddY) { - ddY->mutable_data(ctx.GetPlace()); - EigenArrayMap ddy_arr(transformed_ddy.mutable_data(ctx.GetPlace()), - C, sample_size); - ddy_arr.setZero(); - if (use_global_stats) { - // math: ddy = r * ddx * inv_var + ddbias + - // ddscale * (x - mean) * inv_var - if (ddX) { - ddy_arr = scale_tile_data * ddx_arr * inv_var_tile_data; - } - } else { - // math: ddy = (x - mean) * inv_var * ddscale + ddbias + - // scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) * - // np.mean(ddx * (x - mean), axis=(n,h,w))) - if (ddX) { - ddy_arr += - scale_tile_data * inv_var_tile_data * - (ddx_arr - - ddx_arr.rowwise().sum().replicate(1, sample_size) / sample_size - - x_sub_mean_mul_invstd_arr * - (ddx_arr * x_sub_mean_mul_invstd_arr) - .rowwise() - .sum() - .replicate(1, sample_size) / - sample_size); - } - } - if (ddScale) { - ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); - Tensor ddscale_tile; - ddscale_tile.Resize({C, sample_size}); - EigenArrayMap ddscale_tile_data( - ddscale_tile.mutable_data(ctx.GetPlace()), C, sample_size); - ddscale_tile_data = ddscale_arr.replicate(1, sample_size); - - ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data; - } - - if (ddBias) 
{ - ConstEigenVectorArrayMap ddbias_arr(ddBias->data(), C); - Tensor ddbias_tile; - ddbias_tile.Resize({C, sample_size}); - EigenArrayMap ddbias_tile_data( - ddbias_tile.mutable_data(ctx.GetPlace()), C, sample_size); - ddbias_tile_data = ddbias_arr.replicate(1, sample_size); - - ddy_arr += ddbias_tile_data; - } - - if (data_layout == DataLayout::kNCHW) { - VLOG(3) << "Transform batchnorm output from NHWC to NCHW"; - TransToChannelFirst( - ctx, &transformed_ddy, ddY); - } - } - } -}; - DECLARE_INPLACE_OP_INFERER(BatchNormDoubleGradOpInplaceInferer, {"DY", "DDY"}); } // namespace operators } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(batch_norm, BatchNormInferShapeFunctor, + PD_INFER_META(phi::BatchNormInferMeta)); + REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, ops::BatchNormOpInferVarType, ops::BatchNormGradMaker, diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h index f8d37d685b929258118e5d4b9d02a6be9d71c078..d274e8d2c006d7cbfe8337eab5c6d9a57a62e5ca 100644 --- a/paddle/fluid/operators/batch_norm_op.h +++ b/paddle/fluid/operators/batch_norm_op.h @@ -113,23 +113,5 @@ class BatchNormOpInferVarType } }; -template -class BatchNormKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override; -}; - -template -class BatchNormGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override; -}; - -template -class BatchNormDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override; -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/batch_norm_op_mlu.cc b/paddle/fluid/operators/batch_norm_op_mlu.cc index 0e64b461786cce845f7388a520c09101dcba9c09..6507890a8b5dcd7a415215caf51bd05c2857db5e 100644 --- a/paddle/fluid/operators/batch_norm_op_mlu.cc +++ b/paddle/fluid/operators/batch_norm_op_mlu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/batch_norm_op.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" namespace paddle { @@ -20,6 +21,8 @@ namespace operators { template class MLUBatchNormOpKernel : public framework::OpKernel { + using MPDType = typename details::MPTypeTrait::Type; + public: void Compute(const framework::ExecutionContext &ctx) const override { const auto &place = ctx.GetPlace(); @@ -68,10 +71,10 @@ class MLUBatchNormOpKernel : public framework::OpKernel { // alloc memory y->mutable_data(place); - mean_out->mutable_data(place); - variance_out->mutable_data(place); - saved_mean->mutable_data(place); - saved_variance->mutable_data(place); + mean_out->mutable_data(place); + variance_out->mutable_data(place); + saved_mean->mutable_data(place); + saved_variance->mutable_data(place); Tensor transformed_x; Tensor transformed_y; @@ -132,6 +135,8 @@ class MLUBatchNormOpKernel : public framework::OpKernel { template class MLUBatchNormGradOpKernel : public framework::OpKernel { + using MPDType = typename details::MPTypeTrait::Type; + public: void Compute(const framework::ExecutionContext &ctx) const override { const auto *x = ctx.Input("X"); @@ -154,10 +159,10 @@ class MLUBatchNormGradOpKernel : public framework::OpKernel { auto &dev_ctx = ctx.template device_context(); auto d_x_tmp = ctx.AllocateTmpTensor(x->dims(), dev_ctx); - auto scale_grad_tmp = - ctx.AllocateTmpTensor(scale->dims(), dev_ctx); + auto scale_grad_tmp = ctx.AllocateTmpTensor( + scale->dims(), dev_ctx); auto bias_grad_tmp = - ctx.AllocateTmpTensor(bias->dims(), dev_ctx); + ctx.AllocateTmpTensor(bias->dims(), dev_ctx); if (d_x == nullptr) { d_x = &d_x_tmp; @@ -171,8 +176,8 @@ class MLUBatchNormGradOpKernel : public framework::OpKernel { const auto &place = ctx.GetPlace(); d_x->mutable_data(place); - d_scale->mutable_data(place); - d_bias->mutable_data(place); + d_scale->mutable_data(place); + d_bias->mutable_data(place); use_global_stats = is_test || use_global_stats; diff --git a/paddle/fluid/operators/batch_norm_op_npu.cc b/paddle/fluid/operators/batch_norm_op_npu.cc index a70b6e991161dfef99cc0b6da9fba9a2696cc08e..ae03ecbcb16a0441cdb87e0ec579c07d872bc9a2 100644 --- a/paddle/fluid/operators/batch_norm_op_npu.cc +++ b/paddle/fluid/operators/batch_norm_op_npu.cc @@ -76,10 +76,10 @@ class NPUBatchNormOpKernel : public framework::OpKernel { auto *variance_out = ctx.Output("VarianceOut"); auto *saved_mean = ctx.Output("SavedMean"); auto *saved_variance = ctx.Output("SavedVariance"); - mean_out->mutable_data(ctx.GetPlace()); - variance_out->mutable_data(ctx.GetPlace()); - saved_mean->mutable_data(ctx.GetPlace()); - saved_variance->mutable_data(ctx.GetPlace()); + mean_out->mutable_data(ctx.GetPlace()); + variance_out->mutable_data(ctx.GetPlace()); + saved_mean->mutable_data(ctx.GetPlace()); + saved_variance->mutable_data(ctx.GetPlace()); // if MomentumTensor is set, use MomentumTensor value, momentum // is only used in this training branch @@ -170,8 +170,8 @@ class NPUBatchNormGradOpKernel : public framework::OpKernel { auto stream = ctx.template device_context().stream(); if (d_scale && d_bias) { - d_scale->mutable_data(ctx.GetPlace()); - d_bias->mutable_data(ctx.GetPlace()); + d_scale->mutable_data(ctx.GetPlace()); + d_bias->mutable_data(ctx.GetPlace()); if (use_global_stats) { const auto *running_mean = ctx.Input("Mean"); const auto *running_variance = ctx.Input("Variance"); diff --git a/paddle/fluid/operators/controlflow/compare_op_mlu.cc 
b/paddle/fluid/operators/controlflow/compare_op_mlu.cc index 9dc287ab76a67c6026ec8794793e77179063af3d..c39743ef9914c039f13428d43a66b1aa66ada0ed 100644 --- a/paddle/fluid/operators/controlflow/compare_op_mlu.cc +++ b/paddle/fluid/operators/controlflow/compare_op_mlu.cc @@ -11,7 +11,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/controlflow/compare_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" namespace paddle { diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 8213e877f722433488cd826bb63cba376972c57a..9be63a85fc0de3ba75cb9741b25f7c312cd9f60b 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -27,6 +27,9 @@ limitations under the License. */ #endif #include "paddle/fluid/platform/cudnn_workspace_helper.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { @@ -841,6 +844,8 @@ framework::OpKernelType ConvOpDoubleGrad::GetExpectedKernelType( } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(conv2d, Conv2dInferShapeFunctor, + PD_INFER_META(phi::ConvInferMeta)); REGISTER_OPERATOR(conv2d, ops::ConvOp, ops::Conv2DOpMaker, ops::ConvOpInferVarType, ops::Conv2DGradMaker, @@ -851,6 +856,8 @@ REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad, REGISTER_OPERATOR(conv2d_grad_grad, ops::ConvOpDoubleGrad); // depthwise convolution op +DECLARE_INFER_SHAPE_FUNCTOR(depthwise_conv2d, DepthwiseConv2dInferShapeFunctor, + PD_INFER_META(phi::ConvInferMeta)); REGISTER_OPERATOR(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker, ops::ConvOpInferVarType, ops::Conv2DGradMaker, @@ -860,6 +867,8 @@ REGISTER_OPERATOR(depthwise_conv2d_grad, ops::ConvOpGrad, ops::Conv2DDoubleGradMaker); REGISTER_OPERATOR(depthwise_conv2d_grad_grad, ops::ConvOpDoubleGrad); +DECLARE_INFER_SHAPE_FUNCTOR(conv3d, Conv3dInferShapeFunctor, + PD_INFER_META(phi::ConvInferMeta)); REGISTER_OPERATOR(conv3d, ops::ConvOp, ops::Conv3DOpMaker, ops::ConvOpInferVarType, ops::Conv3DGradMaker, diff --git a/paddle/fluid/operators/conv_op_npu.cc b/paddle/fluid/operators/conv_op_npu.cc index 8897f7b229c321e28609d8ef739f4388f5cb586a..fcda16a3e72ac9250a0206e69f50c75d71cb0d64 100644 --- a/paddle/fluid/operators/conv_op_npu.cc +++ b/paddle/fluid/operators/conv_op_npu.cc @@ -356,7 +356,7 @@ class NPUConvGradOpKernel : public framework::OpKernel { auto stream = ctx.template device_context().stream(); if (filter_grad) { - filter_grad->mutable_data(ctx.GetPlace()); + filter_grad->mutable_data(ctx.GetPlace()); std::vector filter_shape_vec = phi::vectorize(filter->dims()); const auto& runner = NpuOpRunner( diff --git a/paddle/fluid/operators/cumprod_op.cc b/paddle/fluid/operators/cumprod_op.cc index bff6673429d9a4088c65f9dc02c1546f23d96878..889cdac8f6882744c7a7044861d237964e6f6ac0 100644 --- a/paddle/fluid/operators/cumprod_op.cc +++ b/paddle/fluid/operators/cumprod_op.cc @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
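The conv2d / depthwise_conv2d / conv3d registrations above now delegate shape inference to phi::ConvInferMeta. As a hedged, standalone sketch of the standard per-dimension output-size arithmetic that any such InferMeta has to evaluate (textbook formula, not Paddle's actual implementation):

```cpp
// Standard convolution output-size arithmetic for one spatial dimension.
// Generic sketch only, not phi::ConvInferMeta itself.
#include <iostream>

int ConvOutSize(int in, int kernel, int stride, int pad, int dilation) {
  const int dkernel = dilation * (kernel - 1) + 1;  // dilated kernel extent
  return (in + 2 * pad - dkernel) / stride + 1;
}

int main() {
  // 224x224 input, 3x3 kernel, stride 2, padding 1, dilation 1 -> 112
  std::cout << ConvOutSize(224, 3, 2, 1, 1) << std::endl;
  return 0;
}
```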
-#include "paddle/fluid/operators/cumprod_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -20,14 +23,6 @@ namespace operators { class CumprodOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Cumprod"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Cumprod"); - - ctx->ShareDim("X", "Out"); - ctx->ShareLoD("X", "Out"); - } }; class CumprodOpMaker : public framework::OpProtoAndCheckerMaker { @@ -81,22 +76,12 @@ class CumprodGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(cumprod, CumprodInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(cumprod, ops::CumprodOp, ops::CumprodOpMaker, ops::CumprodGradOpMaker, - ops::CumprodGradOpMaker); + ops::CumprodGradOpMaker, + CumprodInferShapeFunctor); REGISTER_OPERATOR(cumprod_grad, ops::CumprodGradOp); - -REGISTER_OP_CPU_KERNEL( - cumprod, ops::CumprodOpCPUKernel, ops::CumprodOpCPUKernel, - ops::CumprodOpCPUKernel, ops::CumprodOpCPUKernel, - ops::CumprodOpCPUKernel>, - ops::CumprodOpCPUKernel>); - -REGISTER_OP_CPU_KERNEL( - cumprod_grad, ops::CumprodGradOpCPUKernel, - ops::CumprodGradOpCPUKernel, ops::CumprodGradOpCPUKernel, - ops::CumprodGradOpCPUKernel, - ops::CumprodGradOpCPUKernel>, - ops::CumprodGradOpCPUKernel>); diff --git a/paddle/fluid/operators/cumprod_op.cu b/paddle/fluid/operators/cumprod_op.cu deleted file mode 100644 index f792d6832917f52573dce7ee3e449c2f4be63584..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/cumprod_op.cu +++ /dev/null @@ -1,369 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include "paddle/fluid/operators/cumprod_op.h" -#include "paddle/fluid/operators/math/inclusive_scan.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" - -namespace paddle { -namespace operators { - -template -struct MultiplyFunctor { - HOSTDEVICE T operator()(T a, T b) const { return a * b; } -}; - -template -class CumprodOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *x = ctx.Input("X"); - auto *y = ctx.Output("Out"); - auto dim = ctx.Attr("dim"); - size_t outer_dim, mid_dim, inner_dim; - GetCumprodDimInfo(x->dims(), dim, &outer_dim, &mid_dim, &inner_dim); - - const auto *x_data = x->data(); - auto *y_data = y->mutable_data(ctx.GetPlace()); - const auto &dev_ctx = - ctx.template device_context(); - math::InclusiveScan>( - x_data, y_data, outer_dim, mid_dim, inner_dim, static_cast(1), - MultiplyFunctor(), /*reverse=*/false, dev_ctx); - } -}; - -template -struct IsZeroFunctor { - HOSTDEVICE bool operator()(T x) const { return x == static_cast(0); } -}; - -template -struct CumprodGradFunctorExceptFirstZero { - HOSTDEVICE CumprodGradFunctorExceptFirstZero( - const T *x, const T *y, const T *dy_mul_y_reversed_cumsum, - const uint8_t *zero_mask, size_t mid_dim, size_t inner_dim, T *dx, - int64_t *first_zero_idx, T *x_filled_one) - : x_(x), - y_(y), - dy_mul_y_reversed_cumsum_(dy_mul_y_reversed_cumsum), - zero_mask_(zero_mask), - mid_dim_(mid_dim), - inner_dim_(inner_dim), - dx_(dx), - first_zero_idx_(first_zero_idx), - x_filled_one_(x_filled_one) {} - - HOSTDEVICE void operator()(size_t idx) const { - auto inner_idx = idx % inner_dim_; - auto outer_idx = idx / (mid_dim_ * inner_dim_); - auto mid_idx = (idx - inner_idx) / inner_dim_ % mid_dim_; - auto mask = zero_mask_[idx]; - bool should_fill_one = true; - - if (mask == 0) { - dx_[idx] = dy_mul_y_reversed_cumsum_[idx] / x_[idx]; - if (mid_idx == mid_dim_ - 1) { - // record first zero position as -1, i.e., no zero - first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = -1; - } - } else if (mid_idx > 0) { // mask > 0 - if (zero_mask_[idx - inner_dim_] > 0) { // not first zero - dx_[idx] = 0; - should_fill_one = false; - } else { - // idx is the first zero position, it should be recorded - dx_[idx] = y_[idx - inner_dim_]; - first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = mid_idx; - } - } else { // the first zero position is index 0 - dx_[idx] = 1; - first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = 0; - } - - x_filled_one_[idx] = should_fill_one ? 
1 : x_[idx]; - } - - private: - const T *x_; - const T *y_; - const T *dy_mul_y_reversed_cumsum_; - const uint8_t *zero_mask_; - size_t mid_dim_; - size_t inner_dim_; - T *dx_; - int64_t *first_zero_idx_; - T *x_filled_one_; -}; - -template -struct FillFirstZeroPositionGradFunctor { - HOSTDEVICE FillFirstZeroPositionGradFunctor(const int64_t *first_zero_idx, - const T *grad_value, - size_t mid_dim, size_t inner_dim, - T *dx) - : first_zero_idx_(first_zero_idx), - grad_value_(grad_value), - mid_dim_(mid_dim), - inner_dim_(inner_dim), - dx_(dx) {} - - HOSTDEVICE void operator()(size_t idx) const { - auto outer_idx = idx / inner_dim_; - auto inner_idx = idx % inner_dim_; - auto mid_idx = first_zero_idx_[idx]; - if (mid_idx >= 0) { - auto full_idx = - outer_idx * mid_dim_ * inner_dim_ + mid_idx * inner_dim_ + inner_idx; - dx_[full_idx] *= grad_value_[full_idx]; - } - } - - private: - const int64_t *first_zero_idx_; - const T *grad_value_; - size_t mid_dim_; - size_t inner_dim_; - T *dx_; -}; - -/* -Reference to -https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/ReduceOps.cpp -input: x, y, dL/dy -output: dL/dx -dL/dx[i] = sum{0<=j k, dL/dx[i] = 0; -i < k, dL/dx[i] = 1/x[i]*sum{i<=j k - dx[i] = 0; - x_filled_one[i] = x[i]; - } - } - } -} -T = reversed_cumsum(dy[j]*cumprod(x_filled_one[j])); -if (zero_index != -1) { - dx[zero_index] *= T[zero_index]; -} -*/ - -template -class CumprodGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *x = ctx.Input("X"); - const auto *y = ctx.Input("Out"); - const auto *dy = - ctx.Input(framework::GradVarName("Out")); - auto *dx = ctx.Output(framework::GradVarName("X")); - auto dim = ctx.Attr("dim"); - - size_t outer_dim, mid_dim, inner_dim; - GetCumprodDimInfo(x->dims(), dim, &outer_dim, &mid_dim, &inner_dim); - if (outer_dim == 0 || mid_dim == 0 || inner_dim == 0) return; - - size_t numel = outer_dim * mid_dim * inner_dim; - - const auto *x_data = x->data(); - const auto *y_data = y->data(); - const auto *dy_data = dy->data(); - - auto place = ctx.GetPlace(); - const auto &dev_ctx = - ctx.template device_context(); - auto *dx_data = dx->mutable_data(place); - - // deal with complex - const T *x_data_deal; - const T *y_data_deal; - memory::AllocationPtr x_conj; - memory::AllocationPtr y_conj; - if (framework::IsComplex::value) { - x_conj = memory::Alloc(place, numel * sizeof(T)); - auto *x_data_conj = reinterpret_cast(x_conj->ptr()); - y_conj = memory::Alloc(place, numel * sizeof(T)); - auto *y_data_conj = reinterpret_cast(y_conj->ptr()); - - platform::ForRange for_range_x(dev_ctx, - numel); - phi::funcs::ConjFunctor functor_x(x_data, numel, x_data_conj); - for_range_x(functor_x); - - platform::ForRange for_range_y(dev_ctx, - numel); - phi::funcs::ConjFunctor functor_y(y_data, numel, y_data_conj); - for_range_y(functor_y); - x_data_deal = x_data_conj; - y_data_deal = y_data_conj; - } else { - x_data_deal = x_data; - y_data_deal = y_data; - } - -// Step 1: find cummax-ed zero mask of x -#ifdef PADDLE_WITH_CUDA - const auto &exec_policy = thrust::cuda::par.on(dev_ctx.stream()); -#else - const auto &exec_policy = thrust::hip::par.on(dev_ctx.stream()); -#endif - auto zero_mask_without_cummax = - memory::Alloc(place, numel * sizeof(uint8_t)); - auto *zero_mask_without_cummax_data = - reinterpret_cast(zero_mask_without_cummax->ptr()); - thrust::transform( - exec_policy, thrust::device_pointer_cast(x_data_deal), - thrust::device_pointer_cast(x_data_deal) + 
numel, - thrust::device_pointer_cast(zero_mask_without_cummax_data), - IsZeroFunctor()); - - auto zero_mask = memory::Alloc(place, numel * sizeof(uint8_t)); - auto *zero_mask_data = reinterpret_cast(zero_mask->ptr()); - math::InclusiveScan( - zero_mask_without_cummax_data, zero_mask_data, outer_dim, mid_dim, - inner_dim, static_cast(0), cub::Max(), /*reverse=*/false, - dev_ctx); - zero_mask_without_cummax = nullptr; - - // Step 2: calculate reversed cumsum(dy * y) - auto dy_mul_y = memory::Alloc(place, numel * sizeof(T)); - auto *dy_mul_y_data = reinterpret_cast(dy_mul_y->ptr()); - thrust::transform(exec_policy, thrust::device_pointer_cast(dy_data), - thrust::device_pointer_cast(dy_data) + numel, - thrust::device_pointer_cast(y_data_deal), - thrust::device_pointer_cast(dy_mul_y_data), - MultiplyFunctor()); - - auto dy_mul_y_reversed_cumsum = memory::Alloc(place, numel * sizeof(T)); - auto *dy_mul_y_reversed_cumsum_data = - reinterpret_cast(dy_mul_y_reversed_cumsum->ptr()); - math::InclusiveScan( - dy_mul_y_data, dy_mul_y_reversed_cumsum_data, outer_dim, mid_dim, - inner_dim, static_cast(0), cub::Sum(), /*reverse=*/true, dev_ctx); - - // Step 3: calculate the gradient value except the first zero position. - // The gradient value of the first zero position is filled with out[idx-1], - // while the gradient value of the other positions are calculated out - // completely. This functor also: - // (1) find the first zero index, i.e., first_zero_idx_data. - // (2) fill x_filled_one, which satifies - // x_filled_one[i] = x[i], i > pos - // x_filled_one[i] = 1, i <= pos - auto first_zero_idx = - memory::Alloc(place, outer_dim * inner_dim * sizeof(int64_t)); - auto *first_zero_idx_data = - reinterpret_cast(first_zero_idx->ptr()); - auto *x_filled_one_data = dy_mul_y_data; // reuse former allocated memory - platform::ForRange for_range(dev_ctx, numel); - CumprodGradFunctorExceptFirstZero functor_except_first_zero( - x_data_deal, y_data_deal, dy_mul_y_reversed_cumsum_data, zero_mask_data, - mid_dim, inner_dim, dx_data, first_zero_idx_data, x_filled_one_data); - for_range(functor_except_first_zero); - - // Step 4: calculate cumprod of x_filled_one - auto *x_filled_one_cumprod_data = - dy_mul_y_reversed_cumsum_data; // reuse former allocated memory - math::InclusiveScan>( - x_filled_one_data, x_filled_one_cumprod_data, outer_dim, mid_dim, - inner_dim, static_cast(1), MultiplyFunctor(), /*reverse=*/false, - dev_ctx); - - // Step 5: calculate reversed cumsum(dy * x_filled_one_cumprod) - auto *dy_mul_x_filled_one_cumprod = - dy_mul_y_data; // reuse former allocated memory - thrust::transform(exec_policy, thrust::device_pointer_cast(dy_data), - thrust::device_pointer_cast(dy_data) + numel, - thrust::device_pointer_cast(x_filled_one_cumprod_data), - thrust::device_pointer_cast(dy_mul_x_filled_one_cumprod), - MultiplyFunctor()); - auto *dy_mul_x_filled_one_cumprod_reversed_cumsum = - dy_mul_y_reversed_cumsum_data; // reuse former allocated memory - math::InclusiveScan( - dy_mul_x_filled_one_cumprod, - dy_mul_x_filled_one_cumprod_reversed_cumsum, outer_dim, mid_dim, - inner_dim, static_cast(0), cub::Sum(), - /*reverse=*/true, dev_ctx); - - // Step 6: fill zero pos gradient value - platform::ForRange - for_range_fill_zero_pos_grad(dev_ctx, outer_dim * inner_dim); - FillFirstZeroPositionGradFunctor fill_first_zero_pos_grad_functor( - first_zero_idx_data, dy_mul_x_filled_one_cumprod_reversed_cumsum, - mid_dim, inner_dim, dx_data); - for_range_fill_zero_pos_grad(fill_first_zero_pos_grad_functor); - } -}; - -} // 
namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - cumprod, ops::CumprodOpCUDAKernel, ops::CumprodOpCUDAKernel, - ops::CumprodOpCUDAKernel, ops::CumprodOpCUDAKernel, - ops::CumprodOpCUDAKernel>, - ops::CumprodOpCUDAKernel>); - -REGISTER_OP_CUDA_KERNEL( - cumprod_grad, ops::CumprodGradOpCUDAKernel, - ops::CumprodGradOpCUDAKernel, ops::CumprodGradOpCUDAKernel, - ops::CumprodGradOpCUDAKernel, - ops::CumprodGradOpCUDAKernel>, - ops::CumprodGradOpCUDAKernel>); diff --git a/paddle/fluid/operators/cumprod_op.h b/paddle/fluid/operators/cumprod_op.h deleted file mode 100644 index 74ed2008ae98380388d874529264c6b6c0b5a49a..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/cumprod_op.h +++ /dev/null @@ -1,170 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" - -namespace paddle { -namespace operators { -using Tensor = framework::Tensor; - -static void GetCumprodDimInfo(const framework::DDim& dim, int cumprod_dim, - size_t* outer_dim, size_t* mid_dim, - size_t* inner_dim) { - PADDLE_ENFORCE_GE( - cumprod_dim, -dim.size(), - platform::errors::InvalidArgument( - "The input dim of CumprodOp should be larger than the opposite " - "rank of input x which is %d.But received dim=%d", - -dim.size(), cumprod_dim)); - PADDLE_ENFORCE_LT(cumprod_dim, dim.size(), - platform::errors::InvalidArgument( - "The input dim of CumprodOp should be smaller than the " - "rank of input x which is %d.But received dim=%d", - dim.size(), cumprod_dim)); - if (cumprod_dim < 0) cumprod_dim += dim.size(); - - *outer_dim = 1; - for (int i = 0; i < cumprod_dim; ++i) { - *outer_dim *= dim[i]; - } - *mid_dim = dim[cumprod_dim]; - *inner_dim = 1; - for (int i = cumprod_dim + 1; i < dim.size(); ++i) { - *inner_dim *= dim[i]; - } -} - -template -class CumprodOpCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); - Tensor* out = context.Output("Out"); - int dim = context.Attr("dim"); - - auto* x_data = x->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - framework::DDim shape = x->dims(); - - size_t outer_dim = 1; - size_t mid_dim = 1; - size_t inner_dim = 1; - GetCumprodDimInfo(shape, dim, &outer_dim, &mid_dim, &inner_dim); - - for (size_t i = 0; i < outer_dim; i++) { - for (size_t j = 0; j < mid_dim; j++) { - for (size_t k = 0; k < inner_dim; k++) { - size_t pos = i * mid_dim * inner_dim + j * inner_dim + k; - if (j == 0) { - out_data[pos] = x_data[pos]; - } else { - out_data[pos] = out_data[pos - inner_dim] * x_data[pos]; - } - } - } - } - } -}; - -template -class CumprodGradOpCPUKernel : public framework::OpKernel { - 
public: - void Compute(const framework::ExecutionContext& context) const { - const Tensor* d_out = context.Input(framework::GradVarName("Out")); - const Tensor* x = context.Input("X"); - const Tensor* out = context.Input("Out"); - - int dim = context.Attr("dim"); - framework::DDim shape = x->dims(); - Tensor* d_x = context.Output(framework::GradVarName("X")); - - auto* d_out_data = d_out->data(); - auto* x_data = x->data(); - auto* out_data = out->data(); - auto* d_x_data = d_x->mutable_data(context.GetPlace()); - - auto place = context.GetPlace(); - const auto& dev_ctx = - context.template device_context(); - - size_t outer_dim = 1; - size_t mid_dim = 1; - size_t inner_dim = 1; - GetCumprodDimInfo(shape, dim, &outer_dim, &mid_dim, &inner_dim); - size_t numel = outer_dim * mid_dim * inner_dim; - - // deal with complex - const T* x_data_deal; - const T* out_data_deal; - memory::AllocationPtr x_conj; - memory::AllocationPtr out_conj; - if (framework::IsComplex::value) { - x_conj = memory::Alloc(place, numel * sizeof(T)); - auto* x_data_conj = reinterpret_cast(x_conj->ptr()); - out_conj = memory::Alloc(place, numel * sizeof(T)); - auto* out_data_conj = reinterpret_cast(out_conj->ptr()); - - platform::ForRange for_range_x(dev_ctx, - numel); - phi::funcs::ConjFunctor functor_x(x_data, numel, x_data_conj); - for_range_x(functor_x); - - platform::ForRange for_range_out(dev_ctx, - numel); - phi::funcs::ConjFunctor functor_out(out_data, numel, out_data_conj); - for_range_out(functor_out); - - x_data_deal = x_data_conj; - out_data_deal = out_data_conj; - } else { - x_data_deal = x_data; - out_data_deal = out_data; - } - - for (size_t i = 0; i < outer_dim; i++) { - for (size_t k = 0; k < inner_dim; k++) { - for (size_t j = 0; j < mid_dim; j++) { - size_t index = i * mid_dim * inner_dim + j * inner_dim + k; - d_x_data[index] = 0; - for (size_t n = 0; n < mid_dim; n++) { - size_t pos = i * mid_dim * inner_dim + n * inner_dim + k; - T elem; - if (j == 0) { - elem = d_out_data[pos]; - } else { - elem = d_out_data[pos] * out_data_deal[index - inner_dim]; - } - if (pos > index) { - for (size_t m = index + inner_dim; m <= pos; m += inner_dim) { - elem *= x_data_deal[m]; - } - } else if (pos < index) { - elem = static_cast(0); - } - d_x_data[index] += elem; - } - } - } - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/deformable_conv_op.cc b/paddle/fluid/operators/deformable_conv_op.cc index b15efc5f84bdd0a62f3ee5deca01b1e601f19aed..6e15fd090b8c4feeb8837efb392a2d3a6a6b80c7 100644 --- a/paddle/fluid/operators/deformable_conv_op.cc +++ b/paddle/fluid/operators/deformable_conv_op.cc @@ -338,8 +338,6 @@ REGISTER_OPERATOR(deformable_conv, ops::DeformableConvOp, REGISTER_OPERATOR(deformable_conv_grad, ops::DeformableConvGradOp); -REGISTER_OP_CPU_KERNEL(deformable_conv, ops::DeformableConvCPUKernel, - ops::DeformableConvCPUKernel); REGISTER_OP_CPU_KERNEL(deformable_conv_grad, ops::DeformableConvGradCPUKernel, ops::DeformableConvGradCPUKernel); diff --git a/paddle/fluid/operators/deformable_conv_op.cu b/paddle/fluid/operators/deformable_conv_op.cu index 2c7d905c79b37e9b1c8777d62f1b593c8a8866a5..ad10abf9c647b588e8c66dea89588e344c46ae69 100644 --- a/paddle/fluid/operators/deformable_conv_op.cu +++ b/paddle/fluid/operators/deformable_conv_op.cu @@ -446,108 +446,6 @@ __global__ void FilterGradAddupGpuKernel(const int nthreads, const int n, } } -template -class DeformableConvCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& 
ctx) const override { - const Tensor* input = ctx.Input("Input"); - const Tensor offset = *ctx.Input("Offset"); - const Tensor mask = *ctx.Input("Mask"); - Tensor filter = *ctx.Input("Filter"); - Tensor* output = ctx.Output("Output"); - output->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.cuda_device_context(); - - const int groups = ctx.Attr("groups"); - const int deformable_groups = ctx.Attr("deformable_groups"); - const int im2col_step = ctx.Attr("im2col_step"); - const std::vector strides = ctx.Attr>("strides"); - const std::vector paddings = ctx.Attr>("paddings"); - const std::vector dilations = ctx.Attr>("dilations"); - - const int batch_size = static_cast(input->dims()[0]); - - std::vector filter_shape_vec(phi::vectorize(filter.dims())); - std::vector output_shape_vec(phi::vectorize(output->dims())); - - // col_shape_vec: {c_i * k_h * k_w, im2col_step, o_h, o_w} - std::vector col_buffer_shape_vec(filter_shape_vec.size()); - col_buffer_shape_vec[0] = - input->dims()[1] * filter.dims()[2] * filter.dims()[3]; - col_buffer_shape_vec[1] = im2col_step; - for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) { - col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(phi::make_ddim(col_buffer_shape_vec)); - std::vector output_buffer_shape_vec(1); - output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] * - output_shape_vec[2] * output_shape_vec[3]; - framework::DDim output_shape(phi::make_ddim(output_buffer_shape_vec)); - Tensor col_buffer; - Tensor output_buffer; - col_buffer = ctx.AllocateTmpTensor(col_shape, dev_ctx); - output_buffer = - ctx.AllocateTmpTensor(output_shape, dev_ctx); - - int64_t M = output_shape_vec[1] / groups; - int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3]; - int64_t K = - input->dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups; - - Tensor weight_3d; - weight_3d.ShareDataWith(filter).Resize(phi::make_ddim({groups, M, K})); - Tensor col_buffer_3d; - col_buffer_3d.ShareDataWith(col_buffer) - .Resize(phi::make_ddim({groups, K, N})); - Tensor output_4d; - output_4d.ShareDataWith(output_buffer) - .Resize(phi::make_ddim({batch_size / im2col_step, groups, M, N})); - output_4d.mutable_data(ctx.GetPlace()); - framework::DDim input_shape = - phi::slice_ddim(input->dims(), 1, input->dims().size()); - std::vector input_shape_vec = phi::vectorize(input_shape); - - int input_dim = input->numel() / input->dims()[0]; - int input_offset_dim = offset.numel() / offset.dims()[0]; - int input_mask_dim = mask.numel() / mask.dims()[0]; - - auto blas = phi::funcs::GetBlas(dev_ctx); - - const T* input_ptr = input->data(); - const T* offset_ptr = offset.data(); - const T* mask_ptr = mask.data(); - col_buffer.mutable_data(ctx.GetPlace()); - T* col_buffer_ptr = col_buffer.data(); - - for (int i = 0; i < batch_size / im2col_step; ++i) { - ModulatedDeformableIm2col( - ctx.device_context(), input_ptr + i * im2col_step * input_dim, - offset_ptr + i * im2col_step * input_offset_dim, - mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec, - col_buffer_shape_vec, filter_shape_vec, paddings, strides, dilations, - deformable_groups, col_buffer_ptr); - - Tensor output_3d = output_4d.Slice(i, i + 1).Resize( - phi::slice_ddim(output_4d.dims(), 1, output_4d.dims().size())); - for (int g = 0; g < groups; ++g) { - Tensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize( - phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size())); - Tensor col_buffer_3d_slice = - col_buffer_3d.Slice(g, g + 
1).Resize(phi::slice_ddim( - col_buffer_3d.dims(), 1, col_buffer_3d.dims().size())); - Tensor output_3d_slice = output_3d.Slice(g, g + 1).Resize( - phi::slice_ddim(output_3d.dims(), 1, output_3d.dims().size())); - - blas.MatMul(weight_3d_slice, false, col_buffer_3d_slice, false, T(1.0), - &output_3d_slice, T(0.0)); - } - } - output->ShareDataWith(output_buffer) - .Resize(phi::make_ddim(output_shape_vec)); - } -}; - template class DeformableConvGradCUDAKernel : public framework::OpKernel { public: @@ -740,9 +638,6 @@ class DeformableConvGradCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; using CUDA = paddle::platform::CUDADeviceContext; -REGISTER_OP_CUDA_KERNEL(deformable_conv, - ops::DeformableConvCUDAKernel, - ops::DeformableConvCUDAKernel); REGISTER_OP_CUDA_KERNEL(deformable_conv_grad, ops::DeformableConvGradCUDAKernel, ops::DeformableConvGradCUDAKernel); diff --git a/paddle/fluid/operators/deformable_conv_op.h b/paddle/fluid/operators/deformable_conv_op.h index 66961655ee6ffa88e162477ad424eb10a0702b27..1176b96987ed6fbd0077e68d5bb0d4ece5c4b4f0 100644 --- a/paddle/fluid/operators/deformable_conv_op.h +++ b/paddle/fluid/operators/deformable_conv_op.h @@ -318,102 +318,6 @@ void FilterGradAddupCPUKernel(const int nthreads, const int n, const int height, } } -template -class DeformableConvCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* offset = ctx.Input("Offset"); - auto* mask = ctx.Input("Mask"); - Tensor filter = *ctx.Input("Filter"); - Tensor* output = ctx.Output("Output"); - output->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.template device_context(); - - const int groups = ctx.Attr("groups"); - const int deformable_groups = ctx.Attr("deformable_groups"); - const int im2col_step = ctx.Attr("im2col_step"); - const std::vector strides = ctx.Attr>("strides"); - const std::vector paddings = ctx.Attr>("paddings"); - const std::vector dilations = ctx.Attr>("dilations"); - - const int batch_size = static_cast(input->dims()[0]); - - std::vector filter_shape_vec(phi::vectorize(filter.dims())); - std::vector output_shape_vec(phi::vectorize(output->dims())); - - // col_shape_vec: {c_i * k_h * k_w, im2col_step, o_h, o_w} - std::vector col_buffer_shape_vec(filter_shape_vec.size()); - col_buffer_shape_vec[0] = - input->dims()[1] * filter.dims()[2] * filter.dims()[3]; - col_buffer_shape_vec[1] = im2col_step; - for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) { - col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(phi::make_ddim(col_buffer_shape_vec)); - std::vector output_buffer_shape_vec(1); - output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] * - output_shape_vec[2] * output_shape_vec[3]; - framework::DDim output_shape(phi::make_ddim(output_buffer_shape_vec)); - Tensor col_buffer; - Tensor output_buffer; - col_buffer = ctx.AllocateTmpTensor(col_shape, dev_ctx); - output_buffer = - ctx.AllocateTmpTensor(output_shape, dev_ctx); - int64_t M = output_shape_vec[1] / groups; - int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3]; - int64_t K = - input->dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups; - - Tensor weight_3d; - weight_3d.ShareDataWith(filter).Resize(phi::make_ddim({groups, M, K})); - Tensor col_buffer_3d; - col_buffer_3d.ShareDataWith(col_buffer) - .Resize(phi::make_ddim({groups, K, N})); - Tensor output_4d; - output_4d.ShareDataWith(output_buffer) - 
.Resize(phi::make_ddim({batch_size / im2col_step, groups, M, N})); - output_4d.mutable_data(ctx.GetPlace()); - framework::DDim input_shape = - phi::slice_ddim(input->dims(), 1, input->dims().size()); - std::vector input_shape_vec = phi::vectorize(input_shape); - int input_dim = input->numel() / input->dims()[0]; - int input_offset_dim = offset->numel() / offset->dims()[0]; - int input_mask_dim = mask->numel() / mask->dims()[0]; - auto blas = phi::funcs::GetBlas(dev_ctx); - const T* input_ptr = input->data(); - const T* offset_ptr = offset->data(); - const T* mask_ptr = mask->data(); - col_buffer.mutable_data(ctx.GetPlace()); - T* col_buffer_ptr = col_buffer.data(); - for (int i = 0; i < batch_size / im2col_step; ++i) { - ModulatedDeformableIm2colCPU( - dev_ctx, input_ptr + i * im2col_step * input_dim, - offset_ptr + i * im2col_step * input_offset_dim, - mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec, - col_buffer_shape_vec, filter_shape_vec, paddings, strides, dilations, - deformable_groups, col_buffer_ptr); - Tensor output_3d = output_4d.Slice(i, i + 1).Resize( - phi::slice_ddim(output_4d.dims(), 1, output_4d.dims().size())); - // get the product of pixel and weight - for (int g = 0; g < groups; ++g) { - Tensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize( - phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size())); - Tensor col_buffer_3d_slice = - col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim( - col_buffer_3d.dims(), 1, col_buffer_3d.dims().size())); - Tensor output_3d_slice = output_3d.Slice(g, g + 1).Resize( - phi::slice_ddim(output_3d.dims(), 1, output_3d.dims().size())); - blas.MatMul(weight_3d_slice, false, col_buffer_3d_slice, false, T(1.0), - &output_3d_slice, T(0.0)); - } - } - output->ShareDataWith(output_buffer) - .Resize(phi::make_ddim(output_shape_vec)); - } -}; - template class DeformableConvGradCPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc index 0d9fbf612f73c428fb8050fcfcc319ddafabe482..35e389090175f7768244b95b1d388ea0d735c2d5 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cc +++ b/paddle/fluid/operators/detection/yolo_box_op.cc @@ -9,8 +9,10 @@ See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -235,10 +237,13 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(yolo_box, YoloBoxInferShapeFunctor, + PD_INFER_META(phi::YoloBoxInferMeta)); REGISTER_OPERATOR( yolo_box, ops::YoloBoxOp, ops::YoloBoxOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + YoloBoxInferShapeFunctor); REGISTER_OP_VERSION(yolo_box) .AddCheckpoint( diff --git a/paddle/fluid/operators/determinant_op.cc b/paddle/fluid/operators/determinant_op.cc index 98247fbc862bbc199316a4d4c8971d0f4a159544..6959b5cf811069cc66321d2129a2b69d4e922f09 100644 --- a/paddle/fluid/operators/determinant_op.cc +++ b/paddle/fluid/operators/determinant_op.cc @@ -13,6 +13,10 @@ // limitations under the License. 
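Both removed DeformableConv kernels, the CUDA one above and the CPU one in deformable_conv_op.h, share the same im2col-plus-grouped-GEMM structure. Restated in the notation of their M/N/K variables (my notation, not quoted from the source), each blas.MatMul computes, per group g and per im2col step,

```latex
\underbrace{O_g}_{M\times N} \;=\; \underbrace{W_g}_{M\times K}\,\underbrace{\mathrm{col}_g}_{K\times N},
\qquad
M=\frac{C_{\mathrm{out}}}{\mathrm{groups}},\quad
K=\frac{C_{\mathrm{in}}\,k_h\,k_w}{\mathrm{groups}},\quad
N=\mathrm{im2col\_step}\cdot o_h\cdot o_w .
```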
#include "paddle/fluid/operators/determinant_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -20,11 +24,6 @@ namespace operators { class DeterminantOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "determinant"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "determinant"); - } }; class DeterminantOpMaker : public framework::OpProtoAndCheckerMaker { @@ -44,19 +43,6 @@ class DeterminantGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", - "DeterminantGradOp"); - OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "DeterminantGradOp"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", - framework::GradVarName("Out"), "DeterminantGradOp"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Input")), "Output", - framework::GradVarName("Input"), "DeterminantGradOp"); - - ctx->SetOutputDim(framework::GradVarName("Input"), - ctx->GetInputDim("Input")); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -162,19 +148,17 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(SlogDeterminantGradNoNeedBufferVarsInferer, namespace ops = paddle::operators; namespace plat = paddle::platform; +DECLARE_INFER_SHAPE_FUNCTOR(determinant, DeterminantInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(determinant, ops::DeterminantOp, ops::DeterminantOpMaker, ops::DeterminantGradOpMaker, - ops::DeterminantGradOpMaker); - -REGISTER_OPERATOR(determinant_grad, ops::DeterminantGradOp) + ops::DeterminantGradOpMaker, + DeterminantInferShapeFunctor); -REGISTER_OP_CPU_KERNEL(determinant, - ops::DeterminantKernel, - ops::DeterminantKernel); - -REGISTER_OP_CPU_KERNEL( - determinant_grad, ops::DeterminantGradKernel, - ops::DeterminantGradKernel); +DECLARE_INFER_SHAPE_FUNCTOR(determinant_grad, DeterminantGradInferShapeFunctor, + PD_INFER_META(phi::GeneralUnaryGradInferMeta)); +REGISTER_OPERATOR(determinant_grad, ops::DeterminantGradOp, + DeterminantGradInferShapeFunctor); REGISTER_OPERATOR(slogdeterminant, ops::SlogDeterminantOp, ops::SlogDeterminantOpMaker, diff --git a/paddle/fluid/operators/determinant_op.cu b/paddle/fluid/operators/determinant_op.cu index d19d4c3d093860c1f603e4d752063b7a858c0460..d8237fa3004e65ce74ece55e4d0e62b5564e5c5f 100644 --- a/paddle/fluid/operators/determinant_op.cu +++ b/paddle/fluid/operators/determinant_op.cu @@ -17,14 +17,6 @@ limitations under the License. 
*/ namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - determinant, ops::DeterminantKernel, - ops::DeterminantKernel); - -REGISTER_OP_CUDA_KERNEL( - determinant_grad, - ops::DeterminantGradKernel, - ops::DeterminantGradKernel); REGISTER_OP_CUDA_KERNEL( slogdeterminant, ops::SlogDeterminantKernel, diff --git a/paddle/fluid/operators/determinant_op.h b/paddle/fluid/operators/determinant_op.h index f89ecd37222870f73d00870c9454bf5590d504e3..a1fe8a25665ec84b38a535f541a2cbe33d0a7fcf 100644 --- a/paddle/fluid/operators/determinant_op.h +++ b/paddle/fluid/operators/determinant_op.h @@ -22,12 +22,15 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/funcs/diag_functor.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" #include "paddle/phi/kernels/funcs/unsqueeze.h" -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/impl/determinant_grad_kernel_impl.h" +#include "paddle/phi/kernels/impl/determinant_kernel_impl.h" #include "paddle/phi/kernels/matmul_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" @@ -40,232 +43,6 @@ T sign(T val) { return static_cast(T(0) < val) - (val < T(0)); } -template -class EigenMatrix {}; - -template <> -class EigenMatrix { - public: - using MatrixType = Eigen::MatrixXf; -}; - -template <> -class EigenMatrix { - public: - using MatrixType = Eigen::MatrixXd; -}; - -inline int64_t GetBatchCount(const framework::DDim dims) { - int64_t batch_count = 1; - auto dim_size = dims.size(); - PADDLE_ENFORCE_GE( - dim_size, 2, - platform::errors::InvalidArgument( - "the input matrix dimension size should greater than 2.")); - - // Cumulative multiplying each dimension until the last 2 to get the batch - // count, - // for example a tensor with shape [3,3,3,3], the batch count of matrices is - // 9. 
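The removed GetBatchCount helper treats every dimension except the trailing two as a batch dimension. A trivial standalone restatement (assuming plain int64_t dims rather than framework::DDim):

```cpp
// Batch count of a batched-matrix tensor: product of all but the last two
// dims. A [3, 3, 3, 3] input therefore holds 9 square 3x3 matrices.
#include <cstdint>
#include <iostream>
#include <vector>

int64_t BatchCount(const std::vector<int64_t>& dims) {
  int64_t count = 1;
  for (size_t i = 0; i + 2 < dims.size(); ++i) count *= dims[i];
  return count;
}

int main() {
  std::cout << BatchCount({3, 3, 3, 3}) << std::endl;  // prints 9
  return 0;
}
```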
- for (int64_t i = 0; i < dims.size() - 2; i++) { - batch_count *= dims[i]; - } - - return batch_count; -} - -template -struct DeterminantFunctor { - void operator()(const Tensor& input, const framework::ExecutionContext ctx, - int64_t rank, int64_t batch_count, Tensor* output) { - std::vector input_vec; - std::vector output_vec; - framework::TensorToVector(input, ctx.device_context(), &input_vec); - for (int64_t i = 0; i < batch_count; ++i) { // maybe can be parallel - auto begin_iter = input_vec.begin() + i * rank * rank; - auto end_iter = input_vec.begin() + (i + 1) * rank * rank; - std::vector sub_vec(begin_iter, - end_iter); // get every square matrix data - typename EigenMatrix::MatrixType matrix(rank, rank); - for (int64_t i = 0; i < rank; ++i) { - for (int64_t j = 0; j < rank; ++j) { - matrix(i, j) = sub_vec[rank * i + j]; - } - } - output_vec.push_back(matrix.determinant()); - } - framework::TensorFromVector(output_vec, output); - } -}; -template -class DeterminantKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("Input"); - auto input_dim = vectorize(input->dims()); - auto input_dim_size = input_dim.size(); - auto* output = context.Output("Out"); - - auto batch_count = GetBatchCount(input->dims()); - VLOG(2) << "input dim:" << input->dims(); - PADDLE_ENFORCE_GE( - input_dim_size, 2, - platform::errors::InvalidArgument( - "the input matrix dimension size should greater than 2.")); - PADDLE_ENFORCE_EQ(input_dim[input_dim_size - 1], - input_dim[input_dim_size - 2], - platform::errors::InvalidArgument( - "the input matrix should be square matrix.")); - auto rank = input_dim[input_dim_size - 1]; // square matrix length - DeterminantFunctor()(*input, context, rank, batch_count, output); - auto output_dims = phi::slice_ddim(input->dims(), 0, input_dim_size - 2); - if (input_dim_size > 2) { - output->Resize(output_dims); - } else { - // when input is a two-dimension matrix, The det value is a number. 
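The removed DeterminantKernel copies each rank x rank slice into an Eigen matrix and calls determinant(). A self-contained Eigen sketch of that per-batch loop (row-major flat storage assumed; an illustration, not the deleted kernel):

```cpp
// Per-batch determinant of contiguous rank x rank blocks using Eigen.
#include <Eigen/Dense>
#include <iostream>
#include <vector>

std::vector<double> BatchedDet(const std::vector<double>& data,
                               int64_t batch_count, int64_t rank) {
  std::vector<double> out;
  out.reserve(batch_count);
  for (int64_t b = 0; b < batch_count; ++b) {
    Eigen::MatrixXd m(rank, rank);
    for (int64_t i = 0; i < rank; ++i)
      for (int64_t j = 0; j < rank; ++j)
        m(i, j) = data[b * rank * rank + i * rank + j];  // row-major slice
    out.push_back(m.determinant());
  }
  return out;
}

int main() {
  // two 2x2 matrices: determinants are -2 and 1
  std::vector<double> data = {1, 2, 3, 4, 1, 0, 0, 1};
  for (double d : BatchedDet(data, 2, 2)) std::cout << d << " ";
  std::cout << "\n";
  return 0;
}
```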
- output->Resize({1}); - } - VLOG(2) << "output dim:" << output->dims(); - } -}; - -template -struct FoundZeroFunctor { - FoundZeroFunctor(const T* x, int64_t numel, bool* res) - : x_(x), numel_(numel), res_(res) {} - HOSTDEVICE void operator()(size_t idx) const { - if (*res_ || idx >= static_cast(numel_)) { - // founded zero number - return; - } - *res_ = (x_[idx] == static_cast(0)); - } - const T* x_; - int64_t numel_; - bool* res_; -}; - -template -inline bool CheckMatrixInvertible(const framework::ExecutionContext& ctx, - const framework::Tensor* det) { - auto& dev_ctx = ctx.template device_context(); - auto numel = det->numel(); - - framework::Tensor dev_tensor; - auto* data = dev_tensor.mutable_data({1}, ctx.GetPlace()); - - // set false - phi::funcs::SetConstant zero; - zero(dev_ctx, &dev_tensor, false); - - // find whether zero - platform::ForRange for_range(dev_ctx, numel); - FoundZeroFunctor functor(det->data(), numel, data); - for_range(functor); - - // copy to host - dev_ctx.Wait(); - framework::Tensor cpu_tensor; - framework::TensorCopy(dev_tensor, platform::CPUPlace(), &cpu_tensor); - - // if founded zero, the matrix is not invertible - // else the matrix is invertible - auto* res = cpu_tensor.data(); - return !(*res); -} - -template -class DeterminantGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto& orig_dev_ctx = context.template device_context(); - const auto* input = context.Input("Input"); - const auto* det = context.Input("Out"); - const auto* grad = - context.Input(framework::GradVarName("Out")); - auto* ddet = - context.Output(framework::GradVarName("Input")); - - auto input_dims_size = input->dims().size(); - if (input_dims_size > 2) { - PADDLE_ENFORCE_EQ( - grad->dims().size() + 2, input_dims_size, - platform::errors::InvalidArgument( - "The grad tensor of det dims size should 2 less than" - " input tensor's, but here differ %d", - input_dims_size - grad->dims().size())); - } else if (input_dims_size == 2) { - // input dims size 2 and grad dims size 1 is possible - PADDLE_ENFORCE_EQ( - grad->dims().size(), 1, - platform::errors::InvalidArgument( - "The grad tensor of det dims size should 2 less than" - " input tensor's, but here differ %d", - input_dims_size - grad->dims().size())); - } else { - // checked in forward, pass - } - - auto& dev_ctx = static_cast< - const typename framework::ConvertToPhiContext::TYPE&>( - orig_dev_ctx); - - // Check Whether the matrix is invertible - // (matrix A not invertible) == (det(A)=0) - if (!CheckMatrixInvertible(context, det)) { - // The matrix is not invertible - VLOG(3) << "The input matrix not invertible!"; - ddet->Resize(input->dims()); - phi::Full(dev_ctx, phi::vectorize(input->dims()), static_cast(0.0f), - ddet); - return; - } - - // The matrix is invertible - // let |A| = Determinant(A) - // Ref to https://people.maths.ox.ac.uk/gilesm/files/NA-08-01.pdf - // we set d|A| = unsqueeze(dA * |A|, [-1, -2]) * inverse(A).transpose(-2, - // -1) - - // First: inverse(A) - framework::Tensor inverse_A; - // A must be square matrices! 
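The gradient that the removed DeterminantGradKernel computes next follows the standard matrix-calculus identity pointed to by the NA-08-01 reference in its comment: for invertible A,

```latex
\frac{\partial \lvert A \rvert}{\partial A} = \lvert A \rvert\, A^{-\top}
\quad\Longrightarrow\quad
\frac{\partial L}{\partial A}
  = \frac{\partial L}{\partial \lvert A \rvert}\,\lvert A \rvert\, A^{-\top},
```

which matches the code's unsqueeze(dL/d|A| * |A|, [-1, -2]) * inverse(A).transpose(-2, -1) product; when the matrix is singular, the kernel instead fills the gradient with zeros via phi::Full.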
- inverse_A.Resize(input->dims()); - inverse_A.mutable_data(context.GetPlace()); - - phi::funcs::MatrixInverseFunctor mat_inv; - mat_inv(orig_dev_ctx, *input, &inverse_A); - - VLOG(3) << "inverse(A) dims: " << inverse_A.dims(); - - // Second: inverse(A).transpose(-2, -1) - framework::Tensor transpose_inverse_A = - phi::TransposeLast2Dim(dev_ctx, inverse_A); - - VLOG(3) << "(dA * |A|).transpose(-2, -1) dims: " - << transpose_inverse_A.dims(); - - // Third: dA * |A| - auto mul_dA_detA = phi::Multiply(dev_ctx, *grad, *det); - VLOG(3) << "dA * |A| dims: " << mul_dA_detA.dims(); - - // Fourth: unsqueeze(dA * |A|, [-1, -2]) - auto unsqueeze1 = phi::funcs::Unsqueeze(mul_dA_detA, -1); - auto unsqueeze2 = phi::funcs::Unsqueeze(unsqueeze1, -2); - VLOG(3) << "unsqueezed(dA * |A|) dims: " << unsqueeze2.dims(); - - // Finally: unsqueeze(dA * |A|) * inverse(A) - auto res = phi::Multiply(dev_ctx, unsqueeze2, transpose_inverse_A); - - VLOG(3) << "unsqueeze(dA * |A|) * inverse(A) dims: " << res.dims(); - - framework::TensorCopy(res, context.GetPlace(), ddet); - - ddet->Resize(input->dims()); - VLOG(3) << "d|A| dims: " << ddet->dims(); - } -}; - template struct SlogDeterminantFunctor { void operator()(const Tensor& input, const framework::ExecutionContext ctx, @@ -280,7 +57,7 @@ struct SlogDeterminantFunctor { auto end_iter = input_vec.begin() + (i + 1) * rank * rank; std::vector sub_vec(begin_iter, end_iter); // get every square matrix data - typename EigenMatrix::MatrixType matrix(rank, rank); + typename phi::detail::EigenMatrix::MatrixType matrix(rank, rank); for (int64_t i = 0; i < rank; ++i) { for (int64_t j = 0; j < rank; ++j) { matrix(i, j) = sub_vec[rank * i + j]; @@ -311,7 +88,7 @@ class SlogDeterminantKernel : public framework::OpKernel { auto input_dim_size = input_dim.size(); auto* output = context.Output("Out"); - auto batch_count = GetBatchCount(input->dims()); + auto batch_count = phi::detail::GetBatchCount(input->dims()); VLOG(2) << "input dim:" << input->dims(); PADDLE_ENFORCE_GE( input_dim_size, 2, @@ -370,7 +147,9 @@ class SlogDeterminantGradKernel : public framework::OpKernel { // (matrix A not invertible) == (absslogdet(A)=0) auto slogdet_vec = slogdet->Split(1, 0); auto absslogdet_val = slogdet_vec[0]; - if (!CheckMatrixInvertible(context, &absslogdet_val)) { + if (!phi::detail::CheckMatrixInvertible< + T, typename framework::ConvertToPhiContext::TYPE>( + dev_ctx, &absslogdet_val)) { // The matrix is not invertible VLOG(3) << "The input matrix not invertible!"; dslogdet->Resize(input->dims()); diff --git a/paddle/fluid/operators/diag_v2_op.cc b/paddle/fluid/operators/diag_v2_op.cc index 93fbff67e220bcf7d1f8dab112a07cc42649595f..ac8c12bcd7ebaa6f47e8d3582887ac327a9f8957 100644 --- a/paddle/fluid/operators/diag_v2_op.cc +++ b/paddle/fluid/operators/diag_v2_op.cc @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
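For reference, the inverse → transpose → multiply → unsqueeze sequence in the DeterminantGradKernel removed above (the implementation appears to migrate into phi) follows the identity from the note it cites; restated in plain notation:

\frac{\partial \det(A)}{\partial A} = \det(A)\,\bigl(A^{-1}\bigr)^{\top}
\qquad\Longrightarrow\qquad
\nabla_A L = \operatorname{unsqueeze}\!\bigl(\nabla_{\det(A)} L \cdot \det(A),\,[-1,-2]\bigr)\odot\bigl(A^{-1}\bigr)^{\top}

applied per matrix of the batch, which is exactly what the removed code assembles from MatrixInverseFunctor, TransposeLast2Dim, Multiply and Unsqueeze.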
*/ -#include - #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/infermeta/unary.h" @@ -58,15 +56,56 @@ class DiagV2OpMaker : public framework::OpProtoAndCheckerMaker { } }; +class DiagV2GradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "X", "X", "DiagV2Grad"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", + framework::GradVarName("X"), "DiagV2Grad"); + + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")), + ctx.GetPlace()); + } +}; + +template +class DiagV2GradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("diag_v2_grad"); + grad_op->SetInput("X", this->Input("X")); + grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + grad_op->SetAttrMap(this->Attrs()); + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERER(DiagGradV2NoNeedBufferVarsInferer, "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; + DECLARE_INFER_SHAPE_FUNCTOR(diag_v2, DiagInferShapeFunctor, PD_INFER_META(phi::DiagInferMeta)); -REGISTER_OPERATOR( - diag_v2, ops::DiagV2Op, ops::DiagV2OpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker, - DiagInferShapeFunctor); +REGISTER_OPERATOR(diag_v2, ops::DiagV2Op, ops::DiagV2OpMaker, + ops::DiagV2GradOpMaker, + ops::DiagV2GradOpMaker, + DiagInferShapeFunctor); + +REGISTER_OPERATOR(diag_v2_grad, ops::DiagV2GradOp, + ops::DiagGradV2NoNeedBufferVarsInferer); diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc index 6d52ce45c4c10099dbeb4d4fadbf91f8c390ef46..3d9950902acfe80a3cfef6c9efa2c6370e685c32 100644 --- a/paddle/fluid/operators/dropout_op.cc +++ b/paddle/fluid/operators/dropout_op.cc @@ -14,7 +14,9 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -25,17 +27,6 @@ class DropoutOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Dropout"); - - auto x_dims = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", x_dims); - if (ctx->Attrs().Get("is_test") == false) { - ctx->SetOutputDim("Mask", x_dims); - } - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -173,7 +164,11 @@ class DropoutGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(dropout, DropoutInferShapeFunctor, + PD_INFER_META(phi::DropoutInferMeta)); + REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker, ops::DropoutGradOpMaker, - ops::DropoutGradOpMaker); + ops::DropoutGradOpMaker, + DropoutInferShapeFunctor); REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad); diff --git a/paddle/fluid/operators/eig_op.h b/paddle/fluid/operators/eig_op.h index 5e4c83e1a45ebdb96a0e764cfa2d3997442ae1ea..6daf05a9d778dfb194225f59321ffc3eb40235db 100644 --- a/paddle/fluid/operators/eig_op.h +++ b/paddle/fluid/operators/eig_op.h @@ -21,13 +21,13 @@ #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/diag_functor.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/slice.h" #include "paddle/phi/kernels/funcs/unsqueeze.h" -#include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/matmul_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index a995877778e4770ea8ae64c051a71b31c1fb1e29..c28abb916b7a7d59d5a1974bed63e43b2f32ef2c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -27,7 +27,7 @@ limitations under the License. */ // only can include the headers in paddle/phi/include dirs #include "paddle/phi/kernels/elementwise_grad_kernel.h" -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #endif namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_functor.h b/paddle/fluid/operators/elementwise/elementwise_functor.h index 14baeaa74d2421135401e94fbc10367d50b876fe..54931d99292f9d1453e2a3deb72e75ed63c9f46f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_functor.h +++ b/paddle/fluid/operators/elementwise/elementwise_functor.h @@ -1,11 +1,8 @@ /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -90,86 +87,6 @@ struct MinFunctor { template using Complex = paddle::platform::complex; -// Fmax -template -struct FMaxFunctor { - inline HOSTDEVICE T operator()(const T a, const T b) const { - return std::fmax(a, b); - } -}; - -template <> -struct FMaxFunctor { - inline HOSTDEVICE paddle::platform::float16 operator()( - const paddle::platform::float16 a, - const paddle::platform::float16 b) const { - float float_a = static_cast(a); - float float_b = static_cast(b); - auto result = std::fmax(float_a, float_b); - return static_cast(result); - } -}; - -template <> -struct FMaxFunctor { - inline HOSTDEVICE int operator()(const int a, const int b) const { - float float_a = static_cast(a); - float float_b = static_cast(b); - auto result = std::fmax(float_a, float_b); - return std::lrint(result); - } -}; - -template <> -struct FMaxFunctor { - inline HOSTDEVICE int64_t operator()(const int64_t a, const int64_t b) const { - double double_a = static_cast(a); - double double_b = static_cast(b); - auto result = std::fmax(double_a, double_b); - return std::llrint(result); - } -}; - -// Fmin -template -struct FMinFunctor { - inline HOSTDEVICE T operator()(const T a, const T b) const { - return std::fmin(a, b); - } -}; - -template <> -struct FMinFunctor { - inline HOSTDEVICE paddle::platform::float16 operator()( - const paddle::platform::float16 a, - const paddle::platform::float16 b) const { - float float_a = static_cast(a); - float float_b = static_cast(b); - auto result = std::fmin(float_a, float_b); - return static_cast(result); - } -}; - -template <> -struct FMinFunctor { - inline HOSTDEVICE int operator()(const int a, const int b) const { - float float_a = static_cast(a); - float float_b = static_cast(b); - auto result = std::fmin(float_a, float_b); - return std::lrint(result); - } -}; - -template <> -struct FMinFunctor { - inline HOSTDEVICE int64_t operator()(const int64_t a, const int64_t b) const { - double double_a = static_cast(a); - double double_b = static_cast(b); - auto result = std::fmin(double_a, double_b); - return std::llrint(result); - } -}; - template struct MinGradXFunctor { inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const { diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cc b/paddle/fluid/operators/elementwise/elementwise_max_op.cc index 91da732ef0d3dfbda5d9b7734071ec5831bcfa3f..d91315cc511aa80c0e9c44ccc688b2746eac764e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cc @@ -151,21 +151,3 @@ REGISTER_OPERATOR(elementwise_fmax, ops::ElementwiseOp, ops::ElementwiseFMaxGradOpMaker); REGISTER_OPERATOR(elementwise_fmax_grad, ops::ElementwiseOpGrad); - -REGISTER_OP_CPU_KERNEL( - elementwise_fmax, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel); -REGISTER_OP_CPU_KERNEL( - elementwise_fmax_grad, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cu 
b/paddle/fluid/operators/elementwise/elementwise_max_op.cu index 123332a4a23de5c9534c8523993b87d8738f9869..0d5f56fda17322d86ef13990e9fc2432816dc9cb 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cu @@ -86,21 +86,3 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMaxGradKernel, ops::ElementwiseMaxGradKernel); - -REGISTER_OP_CUDA_KERNEL( - elementwise_fmax, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel); -REGISTER_OP_CUDA_KERNEL( - elementwise_fmax_grad, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.h b/paddle/fluid/operators/elementwise/elementwise_max_op.h index cff30be50a3d14c646cb7d13d6d8aeeb3de250f4..afe1073d89a06618af95490ac6d264073bd930d4 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.h @@ -35,21 +35,6 @@ class ElementwiseMaxKernel : public framework::OpKernel { } }; -template -class ElementwiseFMaxKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - - z->mutable_data(ctx.GetPlace()); - int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - FMaxFunctor(), z); - } -}; - template struct MaxGradDx { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { @@ -104,88 +89,5 @@ class ElementwiseMaxGradKernel : public ElemwiseGradKernel { } }; -template -struct FMaxGradDx { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return dout * static_cast((x >= y) || isnan(y)); - } -}; - -template <> -struct FMaxGradDx { - HOSTDEVICE paddle::platform::float16 operator()( - paddle::platform::float16 x, paddle::platform::float16 y, - paddle::platform::float16 out, paddle::platform::float16 dout) const { - return dout * static_cast( - (x >= y) || paddle::platform::isnan(y)); - } -}; - -template <> -struct FMaxGradDx { - HOSTDEVICE int operator()(int x, int y, int out, int dout) const { - return dout * static_cast((x >= y)); - } -}; - -template <> -struct FMaxGradDx { - HOSTDEVICE int64_t operator()(int64_t x, int64_t y, int64_t out, - int64_t dout) const { - return dout * static_cast((x >= y)); - } -}; - -template -struct FMaxGradDy { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return dout * static_cast(!((x >= y) || isnan(y))); - } -}; - -template <> -struct FMaxGradDy { - HOSTDEVICE paddle::platform::float16 operator()( - paddle::platform::float16 x, paddle::platform::float16 y, - paddle::platform::float16 out, paddle::platform::float16 dout) const { - return dout * static_cast( - !((x >= y) || paddle::platform::isnan(y))); - } -}; - -template <> -struct FMaxGradDy { - HOSTDEVICE int64_t operator()(int64_t x, int64_t y, int64_t out, - int64_t dout) const { - return dout * static_cast(!((x >= y))); - } -}; - -template <> -struct FMaxGradDy { - HOSTDEVICE int operator()(int x, int y, int out, int dout) const { - return dout * static_cast(!((x >= y))); - } -}; - -template -class ElementwiseFMaxGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - 
ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - auto* out = dout; // Fake out, not used - int axis = ctx.Attr("axis"); - ElemwiseGradCompute, FMaxGradDy>( - ctx, *x, *y, *out, *dout, axis, dx, dy, FMaxGradDx(), - FMaxGradDy()); - } -}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cc b/paddle/fluid/operators/elementwise/elementwise_min_op.cc index 3a1951999546eb859f6299b0bf7b064ff1b90a1a..dad80a2c33f3abfde457a6d750f89e47374fae13 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cc @@ -147,21 +147,3 @@ REGISTER_OPERATOR(elementwise_fmin, ops::ElementwiseOp, ops::ElementwiseFMinGradOpMaker); REGISTER_OPERATOR(elementwise_fmin_grad, ops::ElementwiseOpGrad); - -REGISTER_OP_CPU_KERNEL( - elementwise_fmin, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel); -REGISTER_OP_CPU_KERNEL( - elementwise_fmin_grad, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cu b/paddle/fluid/operators/elementwise/elementwise_min_op.cu index 5af985567d898d500b59e10d6032be57871c7e98..fb8bc9ac7f83c8dd99e40685acc68eec4c77b3ce 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cu @@ -82,21 +82,3 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMinGradKernel, ops::ElementwiseMinGradKernel); - -REGISTER_OP_CUDA_KERNEL( - elementwise_fmin, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel); -REGISTER_OP_CUDA_KERNEL( - elementwise_fmin_grad, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.h b/paddle/fluid/operators/elementwise/elementwise_min_op.h index 88fb044d42206eb0f89ac84df166e2e7ff33c5b3..283ad2adde978680d4d0c3a579d55e588368a28e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.h @@ -35,21 +35,6 @@ class ElementwiseMinKernel : public framework::OpKernel { } }; -template -class ElementwiseFMinKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - - z->mutable_data(ctx.GetPlace()); - int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - FMinFunctor(), z); - } -}; - template struct MinGradDx { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { @@ -124,89 +109,5 @@ class ElementwiseMinGradKernel : public ElemwiseGradKernel { ElementwiseMinGrad(ctx, x, y, out, dout, dx, dy); } }; - -template -struct FMinGradDx { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return dout * static_cast((x <= y) || isnan(y)); - } -}; 
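The FMax/FMin gradient functors deleted in these hunks (together with their fluid kernels) all encode one selection rule: the upstream gradient is routed to the operand that fmax/fmin actually returned, and a NaN in y counts as "x was selected". A minimal stand-alone restatement of the fmax case (plain C++, function name mine; fmin mirrors it with <=):

#include <cmath>

// dout flows to x when x was the operand fmax(x, y) returned
// (x >= y, or y is NaN); dy receives the complement.
template <typename T>
void FMaxGradRef(T x, T y, T dout, T* dx, T* dy) {
  const bool x_selected = (x >= y) || std::isnan(static_cast<double>(y));
  *dx = dout * static_cast<T>(x_selected);
  *dy = dout * static_cast<T>(!x_selected);
}

The int/int64_t specializations in the removed code simply drop the isnan term, which is what the cast above degenerates to for integral inputs.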
- -template <> -struct FMinGradDx { - HOSTDEVICE paddle::platform::float16 operator()( - paddle::platform::float16 x, paddle::platform::float16 y, - paddle::platform::float16 out, paddle::platform::float16 dout) const { - return dout * static_cast( - (x <= y) || paddle::platform::isnan(y)); - } -}; - -template <> -struct FMinGradDx { - HOSTDEVICE int operator()(int x, int y, int out, int dout) const { - return dout * static_cast((x <= y)); - } -}; - -template <> -struct FMinGradDx { - HOSTDEVICE int64_t operator()(int64_t x, int64_t y, int64_t out, - int64_t dout) const { - return dout * static_cast((x <= y)); - } -}; - -template -struct FMinGradDy { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return dout * static_cast(!((x <= y) || isnan(y))); - } -}; - -template <> -struct FMinGradDy { - HOSTDEVICE paddle::platform::float16 operator()( - paddle::platform::float16 x, paddle::platform::float16 y, - paddle::platform::float16 out, paddle::platform::float16 dout) const { - return dout * static_cast( - !((x <= y) || paddle::platform::isnan(y))); - } -}; - -template <> -struct FMinGradDy { - HOSTDEVICE int operator()(int x, int y, int out, int dout) const { - return dout * static_cast(!((x <= y))); - } -}; - -template <> -struct FMinGradDy { - HOSTDEVICE int64_t operator()(int64_t x, int64_t y, int64_t out, - int64_t dout) const { - return dout * static_cast(!((x <= y))); - } -}; - -template -class ElementwiseFMinGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - auto* out = dout; // Fake out, not used - int axis = ctx.Attr("axis"); - ElemwiseGradCompute, FMinGradDy>( - ctx, *x, *y, *out, *dout, axis, dx, dy, FMinGradDx(), - FMinGradDy()); - } -}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index 58a3123c7e332f50b0830577436528f1e8df1cdf..6f4aba93d56e2a8227a8578067ac934d41243fb6 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/platform/cpu_info.h" -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc index 838df2e1625912dad127b672228f9cc64eb7cec3..f9347d281043ecc63acdb8ca2fb0a18dae4adc47 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -12,100 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" -namespace paddle { -namespace framework { -class ExecutionContext; -} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -template -class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto& dev_ctx = - ctx.template device_context(); - const auto& onednn_engine = dev_ctx.GetEngine(); - - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - auto tz = phi::vectorize(dout->dims()); - memory::data_type dout_type = framework::ToMKLDNNDataType( - framework::TransToProtoVarType(dout->dtype())); - platform::ReorderMKLDNNHandler handler( - tz, framework::TransToProtoVarType(dout->dtype()), dout_type, - onednn_engine); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - auto reorder_src_memory_p = handler.AcquireSrcMemory( - dout->format(), platform::to_void_cast(dout->data())); - - if (dx) { - auto reorder_dst_memory_p = - handler.AcquireDstMemory(dx, dout->format(), ctx.GetPlace()); - auto reorder_p = - handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); - platform::RecordEvent record_reorder( - "int_reorder", platform::TracerEventType::UserDefined, 2, - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); - astream.wait(); - - dx->set_layout(DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); - } - - if (dy) { - // Direct copy - if (dout->dims() == dy->dims()) { - auto reorder_dst_memory_p = - handler.AcquireDstMemory(dy, dout->format(), ctx.GetPlace()); - auto reorder_p = - handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); - platform::RecordEvent record_reorder( - "int_reorder", platform::TracerEventType::UserDefined, 2, - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, *reorder_src_memory_p, - *reorder_dst_memory_p); - astream.wait(); - - dy->set_layout(DataLayout::kMKLDNN); - dy->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); - } else { - // Broadcasting - platform::ReductionMKLDNNHandler handler_sum( - dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine, - ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy)); - auto dy_memory_p = handler_sum.AcquireDstMemory(dy); - auto reduction_p = handler_sum.AcquireForwardPrimitive(); - reduction_p->execute(astream, {{DNNL_ARG_SRC, *reorder_src_memory_p}, - {DNNL_ARG_DST, *dy_memory_p}}); - astream.wait(); - - dy->set_layout(DataLayout::kMKLDNN); - dy->set_format( - platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape( - phi::vectorize(dy->dims())))); - } - } - } -}; - -} // namespace operators -} // namespace paddle - namespace ops = paddle::operators; REGISTER_OP_KERNEL( @@ -116,6 +24,8 @@ REGISTER_OP_KERNEL( ops::EltwiseMKLDNNKernel, ops::EltwiseMKLDNNKernel) -REGISTER_OP_KERNEL(elementwise_add_grad, MKLDNN, ::paddle::platform::CPUPlace, - ops::EltwiseAddMKLDNNGradKernel, - ops::EltwiseAddMKLDNNGradKernel) 
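The hand-written oneDNN grad kernels removed here (elementwise_add above, and sub/mul/div in the hunks that follow) are replaced by one EltwiseMKLDNNGradKernel templated on the dnnl binary algorithm. The scalar gradients that kernel has to reproduce with reorder/binary/reduction primitives are the usual ones; a plain reference sketch of the assumed semantics (not Paddle API):

// Scalar reference for dL/dx and dL/dy of z = x (op) y; the oneDNN kernel
// computes the same values with primitives and then reduces dy with a
// reduction_sum over any axes that were broadcast in the forward pass.
template <typename T>
void ElementwiseGradRef(char op, T x, T y, T out, T dout, T* dx, T* dy) {
  switch (op) {
    case '+': *dx = dout;      *dy = dout;             break;
    case '-': *dx = dout;      *dy = -dout;            break;
    case '*': *dx = dout * y;  *dy = dout * x;         break;
    case '/': *dx = dout / y;  *dy = -dout * out / y;  break;  // out = x / y
  }
}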
+REGISTER_OP_KERNEL( + elementwise_add_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::EltwiseMKLDNNGradKernel, + ops::EltwiseMKLDNNGradKernel) diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc index 367d602f5902e816a468d43ccfa009fe35a045fc..c68aa8d3d1b46c9013c6fe6a12510f0cdb744682 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc @@ -1,146 +1,28 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" - -namespace paddle { -namespace framework { -class ExecutionContext; -} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -template -class EltwiseDivMKLDNNGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - - auto& dev_ctx = - ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); - - auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - - if (dx) { - // dx = dout / y - - platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_div, axis, mkldnn_engine, ctx.GetPlace(), - dout, y, dx, 1.0f, 1.0f, 1.0f); - - const auto src_dout_memory = handler.AcquireSrcMemory(dout); - const auto src_y_memory = handler.AcquireSecondSrcMemory(y); - const auto dst_dx_memory = handler.AcquireDstMemory(dx); - - const auto binary_prim = handler.AcquireForwardPrimitive(); - - const std::unordered_map args = { - {DNNL_ARG_SRC_0, *src_dout_memory}, - {DNNL_ARG_SRC_1, *src_y_memory}, - {DNNL_ARG_DST, *dst_dx_memory}}; - - binary_prim->execute(astream, args); - astream.wait(); - - dx->set_layout(framework::DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*dst_dx_memory)); - } - - if (dy) { - // dy = -dout * out / y - - platform::BinaryMKLDNNHandler y_handler( - dnnl::algorithm::binary_div, axis, mkldnn_engine, ctx.GetPlace(), y, - y, nullptr, 1.0f, 1.0f, 1.0f); - - const auto y_memory = y_handler.AcquireSrcMemory(y); - - dnnl::post_ops po; - po.append_binary(dnnl::algorithm::binary_div, y_memory->get_desc()); - - platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(), - dout, out, nullptr, -1.0f, 1.0f, 1.0f, po); - - const auto src_dout_memory = handler.AcquireSrcMemory(dout); - const auto src_out_memory = handler.AcquireSecondSrcMemory(out); - - // If broadcasting is in use then let's write to 
temporary - // buffer allocated by oneDNN - const auto dst_dy_memory = (dout->dims() == dy->dims()) - ? handler.AcquireDstMemory(dy) - : handler.AcquireDstMemory(); - - const auto binary_prim = handler.AcquireForwardPrimitive(); - - const std::unordered_map args = { - {DNNL_ARG_SRC_0, *src_dout_memory}, - {DNNL_ARG_SRC_1, *src_out_memory}, - {DNNL_ARG_DST, *dst_dy_memory}, - {DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1, *y_memory}}; - - binary_prim->execute(astream, args); - astream.wait(); - - dy->set_layout(framework::DataLayout::kMKLDNN); - - // Reduction is needed for broadcasting scenario - if (dout->dims() != dy->dims()) { - platform::ReductionMKLDNNHandler handler_sum( - dnnl::algorithm::reduction_sum, 0.0f, 0.0f, mkldnn_engine, - ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy)); - auto dy_memory_p = handler_sum.AcquireDstMemory(dy); - auto reduction_p = handler_sum.AcquireForwardPrimitive(); - - // As source we use mem object with results from binary operation - reduction_p->execute(astream, {{DNNL_ARG_SRC, *dst_dy_memory}, - {DNNL_ARG_DST, *dy_memory_p}}); - astream.wait(); - dy->set_format( - platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape( - phi::vectorize(dy->dims())))); - - } else { - dy->set_format(platform::GetMKLDNNFormat(*dst_dy_memory)); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -// TODO(piotrekobi) add int8, uint8 support -REGISTER_OP_KERNEL(elementwise_div, MKLDNN, paddle::platform::CPUPlace, - ops::EltwiseMKLDNNKernel, - ops::EltwiseMKLDNNKernel) - -REGISTER_OP_KERNEL(elementwise_div_grad, MKLDNN, paddle::platform::CPUPlace, - ops::EltwiseDivMKLDNNGradKernel, - ops::EltwiseDivMKLDNNGradKernel) +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(elementwise_div, MKLDNN, paddle::platform::CPUPlace, + ops::EltwiseMKLDNNKernel, + ops::EltwiseMKLDNNKernel) + +REGISTER_OP_KERNEL( + elementwise_div_grad, MKLDNN, paddle::platform::CPUPlace, + ops::EltwiseMKLDNNGradKernel, + ops::EltwiseMKLDNNGradKernel) diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h index 763fc5f2674104a718e33f5ef5ac7b2a1a7b23f5..d1a1aa3008c8b33690ecd9ea85501ad0178f592a 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h @@ -15,23 +15,77 @@ #pragma once #include #include -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/framework/data_layout_transform.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/platform/mkldnn_reuse.h" namespace paddle { namespace operators { -using framework::DataLayout; -using framework::Tensor; using dnnl::memory; using dnnl::primitive; using dnnl::stream; +using framework::DataLayout; +using framework::Tensor; + +inline std::vector CalculateBroadcastedDims(const Tensor* x, + const Tensor* y) { + const auto src_tz = phi::vectorize(x->dims()); + const auto dst_tz = phi::vectorize(y->dims()); + + size_t j = 0; + std::vector dst_tz_ex(src_tz.size(), 1); + for (size_t i = 0; i < src_tz.size(); ++i) { + dst_tz_ex[i] = (src_tz[i] != dst_tz[j]) ? 1 : dst_tz[j++]; + if (j == dst_tz.size()) break; + } + + return dst_tz_ex; +} template class EltwiseMKLDNNKernel : public framework::OpKernel { + private: + dnnl::post_ops get_post_ops(const framework::ExecutionContext& ctx) const { + dnnl::post_ops post_operations; + if (ctx.HasAttr("activation_type")) { + const float scale = ctx.HasAttr("activation_scale") + ? ctx.Attr("activation_scale") + : 1.0f; + const float alpha = ctx.HasAttr("activation_alpha") + ? ctx.Attr("activation_alpha") + : 0.0f; + const float beta = ctx.HasAttr("activation_beta") + ? 
ctx.Attr("activation_beta") + : 0.0f; + + static std::unordered_map algo_map = { + {"relu", dnnl::algorithm::eltwise_relu}, + {"tanh", dnnl::algorithm::eltwise_tanh}, + {"leaky_relu", dnnl::algorithm::eltwise_relu}, + {"swish", dnnl::algorithm::eltwise_swish}, + {"hardswish", dnnl::algorithm::eltwise_hardswish}, + {"sqrt", dnnl::algorithm::eltwise_sqrt}, + {"abs", dnnl::algorithm::eltwise_abs}, + {"clip", dnnl::algorithm::eltwise_clip}, + {"gelu", dnnl::algorithm::eltwise_gelu_erf}, + {"gelu_tanh", dnnl::algorithm::eltwise_gelu_tanh}, + {"relu6", dnnl::algorithm::eltwise_bounded_relu}, + {"sigmoid", dnnl::algorithm::eltwise_logistic}}; + + const auto& activation_type = + algo_map.find(ctx.Attr("activation_type")); + + if (activation_type != algo_map.end()) { + post_operations.append_eltwise(scale, activation_type->second, alpha, + beta); + } + } + return post_operations; + } + public: void Compute(const framework::ExecutionContext& ctx) const override { const auto& dev_ctx = @@ -47,9 +101,9 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { float scale_o = ctx.Attr("Scale_out"); int axis = ctx.Attr("axis"); - platform::BinaryMKLDNNHandler handler(BINARY_OP, axis, mkldnn_engine, - ctx.GetPlace(), x, y, z, scale_x, - scale_y, scale_o); + platform::BinaryMKLDNNHandler handler( + BINARY_OP, axis, mkldnn_engine, ctx.GetPlace(), x, y, z, scale_x, + scale_y, scale_o, get_post_ops(ctx)); const auto src_x_memory = handler.AcquireSrcMemory(x); const auto src_y_memory = handler.AcquireSecondSrcMemory(y); @@ -64,7 +118,7 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { // operation. const bool reuse_x_memopry = x->numel() == z->numel() && x->IsSharedBufferWith(*z); - std::shared_ptr dst_memory = nullptr; + std::shared_ptr dst_memory; if (reuse_x_memopry) { dst_memory = src_x_memory; // NOTE(chenfeiyu): when the output reuses memory from other tensor rather @@ -96,19 +150,187 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { } }; -inline std::vector CalculateBroadcastedDims(const Tensor* x, - const Tensor* y) { - const auto src_tz = phi::vectorize(x->dims()); - const auto dst_tz = phi::vectorize(y->dims()); +template +class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + ElemwiseGradKernel::Compute(ctx); + using Tensor = framework::Tensor; - size_t j = 0; - std::vector dst_tz_ex(src_tz.size(), 1); - for (size_t i = 0; i < src_tz.size(); ++i) { - dst_tz_ex[i] = (src_tz[i] != dst_tz[j]) ? 
1 : dst_tz[j++]; - if (j == dst_tz.size()) break; - } + auto& dev_ctx = + ctx.template device_context(); + const auto& onednn_engine = dev_ctx.GetEngine(); - return dst_tz_ex; -} + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + auto* dout = ctx.Input(framework::GradVarName("Out")); + + int axis = ctx.Attr("axis"); + + auto tz = phi::vectorize(dout->dims()); + auto proto_type_dout = framework::TransToProtoVarType(dout->dtype()); + + platform::ReorderMKLDNNHandler reorder_handler( + tz, proto_type_dout, framework::ToMKLDNNDataType(proto_type_dout), + onednn_engine); + + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + dout->format(), platform::to_void_cast(dout->data())); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + + if (dx) { + std::shared_ptr dst_memory; + + // elementwise_add & elementwise_sub + if (BINARY_OP == dnnl::algorithm::binary_add || + BINARY_OP == dnnl::algorithm::binary_sub) { + dst_memory = reorder_handler.AcquireDstMemory(dx, dout->format(), + ctx.GetPlace()); + auto reorder_p = + reorder_handler.AcquireReorder(dst_memory, reorder_src_memory_p); + platform::RecordEvent record_reorder( + "int_reorder", platform::TracerEventType::UserDefined, 2, + platform::EventRole::kUniqueOp); + + reorder_p->execute(astream, *reorder_src_memory_p, *dst_memory); + } else { // elementwise_mul & elementwise_div + platform::BinaryMKLDNNHandler binary_handler( + BINARY_OP, axis, onednn_engine, ctx.GetPlace(), dout, y, dx, 1.0f, + 1.0f, 1.0f); + + const auto src_dout_memory = binary_handler.AcquireSrcMemory(dout); + const auto src_y_memory = binary_handler.AcquireSecondSrcMemory(y); + dst_memory = binary_handler.AcquireDstMemory(dx); + + const auto binary_prim = binary_handler.AcquireForwardPrimitive(); + + const std::unordered_map args = { + {DNNL_ARG_SRC_0, *src_dout_memory}, + {DNNL_ARG_SRC_1, *src_y_memory}, + {DNNL_ARG_DST, *dst_memory}}; + + binary_prim->execute(astream, args); + } + astream.wait(); + + dx->set_layout(framework::DataLayout::kMKLDNN); + dx->set_format(platform::GetMKLDNNFormat(*dst_memory)); + } + + if (dy) { + dnnl::primitive_attr broadcast_reduction_attr; + std::shared_ptr broadcast_src_memory; + std::shared_ptr dst_memory; + + // elementwise_add & elementwise_sub + if (BINARY_OP == dnnl::algorithm::binary_add || + BINARY_OP == dnnl::algorithm::binary_sub) { + if (dout->dims() == dy->dims()) { + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + dy, dout->format(), ctx.GetPlace()); + + dnnl::primitive_attr reorder_attr; + std::vector scales(1); + scales[0] = (BINARY_OP == dnnl::algorithm::binary_add) ? 
1 : -1; + reorder_attr.set_output_scales(0, scales); + auto reorder_p = std::make_shared( + *(reorder_src_memory_p), *(reorder_dst_memory_p), reorder_attr); + platform::RecordEvent record_reorder( + "int_reorder", platform::TracerEventType::UserDefined, 2, + platform::EventRole::kUniqueOp); + reorder_p->execute(astream, *reorder_src_memory_p, + *reorder_dst_memory_p); + + dst_memory = reorder_dst_memory_p; + } else { + broadcast_src_memory = reorder_src_memory_p; + } + } else { // elementwise_mul & elementwise_div + std::unordered_map args; + std::shared_ptr binary_prim; + std::shared_ptr post_op_memory; + std::shared_ptr src_0_memory; + std::shared_ptr src_1_memory; + + platform::BinaryMKLDNNHandler binary_handler( + dnnl::algorithm::binary_mul, axis, onednn_engine, ctx.GetPlace(), + dout, x, nullptr, 1.0f, 1.0f, 1.0f); + + src_1_memory = binary_handler.AcquireSecondSrcMemory(x); + + if (BINARY_OP == dnnl::algorithm::binary_div) { + platform::BinaryMKLDNNHandler post_op_binary_handler( + dnnl::algorithm::binary_div, axis, onednn_engine, ctx.GetPlace(), + y, y, nullptr, 1.0f, 1.0f, 1.0f); + + post_op_memory = post_op_binary_handler.AcquireSrcMemory(y); + + dnnl::post_ops po; + po.append_binary(dnnl::algorithm::binary_div, + post_op_memory->get_desc()); + + binary_handler = platform::BinaryMKLDNNHandler( + dnnl::algorithm::binary_mul, axis, onednn_engine, ctx.GetPlace(), + dout, out, nullptr, -1.0f, 1.0f, 1.0f, po); + + src_1_memory = binary_handler.AcquireSecondSrcMemory(out); + } + + src_0_memory = binary_handler.AcquireSrcMemory(dout); + + const auto dst_dy_memory = (dout->dims() == dy->dims()) + ? binary_handler.AcquireDstMemory(dy) + : binary_handler.AcquireDstMemory(); + + binary_prim = binary_handler.AcquireForwardPrimitive(); + args = {{DNNL_ARG_SRC_0, *src_0_memory}, + {DNNL_ARG_SRC_1, *src_1_memory}, + {DNNL_ARG_DST, *dst_dy_memory}}; + + if (BINARY_OP == dnnl::algorithm::binary_div) + args.insert({DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1, + *post_op_memory}); + + binary_prim->execute(astream, args); + broadcast_src_memory = dst_dy_memory; + dst_memory = dst_dy_memory; + } + astream.wait(); + dy->set_layout(DataLayout::kMKLDNN); + + if (dout->dims() != dy->dims()) { + // Broadcasting + if (BINARY_OP == dnnl::algorithm::binary_sub) { + dnnl::post_ops po; + po.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, -1.0f, 0); + broadcast_reduction_attr.set_post_ops(po); + } + + platform::ReductionMKLDNNHandler reduction_handler( + dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine, + ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy), + broadcast_reduction_attr); + dst_memory = reduction_handler.AcquireDstMemory(dy); + + auto reduction_p = reduction_handler.AcquireForwardPrimitive(); + + reduction_p->execute(astream, { + {DNNL_ARG_SRC, *broadcast_src_memory}, + {DNNL_ARG_DST, *dst_memory}, + }); + astream.wait(); + dy->set_format(platform::GetMKLDNNFormat(dst_memory->get_desc().reshape( + phi::vectorize(dy->dims())))); + } else { + dy->set_format(platform::GetMKLDNNFormat(*dst_memory)); + } + } + } +}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc index c03794012ff3b793684222c62f423edd6e8637f1..0ef5c5e628ce62084305fc95e66862a15822ecb3 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc @@ -1,127 
+1,19 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" -namespace paddle { -namespace framework { -class ExecutionContext; -} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -template -class EltwiseMulMKLDNNGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - - auto& dev_ctx = - ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - - if (dx) { - // dx = dout*y - platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(), - dout, y, dx, 1.0f, 1.0f, 1.0f); - - const auto src_dout_memory = handler.AcquireSrcMemory(dout); - const auto src_y_memory = handler.AcquireSecondSrcMemory(y); - const auto dst_dx_memory = handler.AcquireDstMemory(dx); - - const auto binary_prim = handler.AcquireForwardPrimitive(); - - const std::unordered_map args = { - {DNNL_ARG_SRC_0, *src_dout_memory}, - {DNNL_ARG_SRC_1, *src_y_memory}, - {DNNL_ARG_DST, *dst_dx_memory}}; - - binary_prim->execute(astream, args); - astream.wait(); - - dx->set_layout(framework::DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*dst_dx_memory)); - } - - if (dy) { - // dy = dout*x - // Handler is having nullptr passed instead of output tensor as - // we want Dst buffer to be allocated by oneDNN not to use Tensor - platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(), - dout, x, nullptr, 1.0f, 1.0f, 1.0f); - - const auto src_dout_memory = handler.AcquireSrcMemory(dout); - const auto src_x_memory = handler.AcquireSecondSrcMemory(x); - - // If broadcasting is in use then let's write to temporary - // buffer allocated by oneDNN - const auto dst_dy_memory = (dout->dims() == dy->dims()) - ? 
handler.AcquireDstMemory(dy) - : handler.AcquireDstMemory(); - - const auto binary_prim = handler.AcquireForwardPrimitive(); - - const std::unordered_map args = { - {DNNL_ARG_SRC_0, *src_dout_memory}, - {DNNL_ARG_SRC_1, *src_x_memory}, - {DNNL_ARG_DST, *dst_dy_memory}}; - - binary_prim->execute(astream, args); - astream.wait(); - - dy->set_layout(framework::DataLayout::kMKLDNN); - - // Reduction is needed for broadcasting scenario - if (dout->dims() != dy->dims()) { - platform::ReductionMKLDNNHandler handler_sum( - dnnl::algorithm::reduction_sum, 0.0f, 0.0f, mkldnn_engine, - ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy)); - auto dy_memory_p = handler_sum.AcquireDstMemory(dy); - auto reduction_p = handler_sum.AcquireForwardPrimitive(); - // As source we use mem object with results from binary operation - reduction_p->execute(astream, {{DNNL_ARG_SRC, *dst_dy_memory}, - {DNNL_ARG_DST, *dy_memory_p}}); - astream.wait(); - dy->set_format( - platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape( - phi::vectorize(dy->dims())))); - - } else { - dy->set_format(platform::GetMKLDNNFormat(*dst_dy_memory)); - } - } - } -}; - -} // namespace operators -} // namespace paddle - namespace ops = paddle::operators; REGISTER_OP_KERNEL( @@ -132,6 +24,8 @@ REGISTER_OP_KERNEL( ops::EltwiseMKLDNNKernel, ops::EltwiseMKLDNNKernel) -REGISTER_OP_KERNEL(elementwise_mul_grad, MKLDNN, ::paddle::platform::CPUPlace, - ops::EltwiseMulMKLDNNGradKernel, - ops::EltwiseMulMKLDNNGradKernel) +REGISTER_OP_KERNEL( + elementwise_mul_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::EltwiseMKLDNNGradKernel, + ops::EltwiseMKLDNNGradKernel) diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc index 3c799008a2abcf3fc59da7b759c9d43f3e940e8e..510373831eb6db5c7ffed6e8e58cbfb0ae268a50 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc @@ -1,5 +1,4 @@ - -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,113 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" -namespace paddle { -namespace framework { -class ExecutionContext; -} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -template -class EltwiseSubMKLDNNGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto& dev_ctx = - ctx.template device_context(); - const auto& onednn_engine = dev_ctx.GetEngine(); - - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - auto tz = phi::vectorize(dout->dims()); - memory::data_type dout_type = framework::ToMKLDNNDataType( - framework::TransToProtoVarType(dout->dtype())); - platform::ReorderMKLDNNHandler handler( - tz, framework::TransToProtoVarType(dout->dtype()), dout_type, - onednn_engine); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - auto reorder_src_memory_p = handler.AcquireSrcMemory( - dout->format(), platform::to_void_cast(dout->data())); - - if (dx) { - auto reorder_dst_memory_p = - handler.AcquireDstMemory(dx, dout->format(), ctx.GetPlace()); - auto reorder_p = - handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); - platform::RecordEvent record_reorder( - "int_reorder", platform::TracerEventType::UserDefined, 2, - platform::EventRole::kUniqueOp); - - reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); - astream.wait(); - - dx->set_layout(DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); - } - - if (dy) { - // Direct copy - if (dout->dims() == dy->dims()) { - auto reorder_dst_memory_p = - handler.AcquireDstMemory(dy, dout->format(), ctx.GetPlace()); - - dnnl::primitive_attr reorder_attr; - std::vector scales = {-1}; - reorder_attr.set_output_scales(0, scales); - auto reorder_p = std::make_shared( - *(reorder_src_memory_p), *(reorder_dst_memory_p), reorder_attr); - platform::RecordEvent record_reorder( - "int_reorder", platform::TracerEventType::UserDefined, 2, - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, *reorder_src_memory_p, - *reorder_dst_memory_p); - astream.wait(); - - dy->set_layout(DataLayout::kMKLDNN); - dy->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); - } else { - // Broadcasting - - dnnl::post_ops po; - po.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, -1.0f, 0); - dnnl::primitive_attr attr; - attr.set_post_ops(po); - - platform::ReductionMKLDNNHandler handler_sum( - dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine, - ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy), attr); - - auto dy_memory_p = handler_sum.AcquireDstMemory(dy); - auto reduction_p = handler_sum.AcquireForwardPrimitive(); - - reduction_p->execute(astream, { - {DNNL_ARG_SRC, *reorder_src_memory_p}, - {DNNL_ARG_DST, *dy_memory_p}, - }); - astream.wait(); - - dy->set_layout(DataLayout::kMKLDNN); - dy->set_format( - platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape( - phi::vectorize(dy->dims())))); - } - } - } -}; - -} // namespace operators -} // namespace paddle namespace ops = paddle::operators; @@ -131,6 +24,8 @@ REGISTER_OP_KERNEL( ops::EltwiseMKLDNNKernel, ops::EltwiseMKLDNNKernel) 
-REGISTER_OP_KERNEL(elementwise_sub_grad, MKLDNN, ::paddle::platform::CPUPlace, - ops::EltwiseSubMKLDNNGradKernel, - ops::EltwiseSubMKLDNNGradKernel) +REGISTER_OP_KERNEL( + elementwise_sub_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::EltwiseMKLDNNGradKernel, + ops::EltwiseMKLDNNGradKernel) diff --git a/paddle/fluid/operators/expand_as_v2_op.cc b/paddle/fluid/operators/expand_as_v2_op.cc index 97a35a34f23e96707269482e29da13a15538cdca..9361edd43bf15ac0eee4a4de618027af79b78b56 100755 --- a/paddle/fluid/operators/expand_as_v2_op.cc +++ b/paddle/fluid/operators/expand_as_v2_op.cc @@ -12,7 +12,9 @@ limitations under the License. */ #include "paddle/fluid/operators/expand_as_v2_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -22,27 +24,6 @@ using framework::Tensor; class ExpandAsV2Op : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ExpandAsV2"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ExpandAsV2"); - auto x_dims = ctx->GetInputDim("X"); - auto target_shape = ctx->Attrs().Get>("target_shape"); - PADDLE_ENFORCE_GE( - target_shape.size(), static_cast(x_dims.size()), - platform::errors::InvalidArgument( - "The rank of target_shape must be greater than or equal " - "to the rank of Input(X). But received Input(X): input " - "rank %u; received target_shape: rank %u.", - x_dims.size(), target_shape.size())); - PADDLE_ENFORCE_LE(target_shape.size(), MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of target_shape must be less than or equal " - "to %d. 
But received: rank %u.", - MAX_RANK_SUPPORTED, target_shape.size())); - ctx->SetOutputDim("Out", phi::make_ddim(target_shape)); - } }; class ExpandAsV2OpMaker : public framework::OpProtoAndCheckerMaker { @@ -116,9 +97,12 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandAsV2GradNoNeedBufVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(expand_as_v2, ExpandAsInferShapeFunctor, + PD_INFER_META(phi::ExpandAsInferMeta)); REGISTER_OPERATOR(expand_as_v2, ops::ExpandAsV2Op, ops::ExpandAsV2OpMaker, ops::ExpandAsV2GradOpMaker, - ops::ExpandAsV2GradOpMaker); + ops::ExpandAsV2GradOpMaker, + ExpandAsInferShapeFunctor); REGISTER_OPERATOR(expand_as_v2_grad, ops::ExpandAsV2GradOp, ops::ExpandAsV2GradNoNeedBufVarsInferer); diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index 9f7e4fb8d5749cf6bd54ed3e3bf9699199c0d3e6..70597be393c35e6939b83d86ce2f9be8f2c36805 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -28,13 +28,14 @@ __global__ void FindAbsMaxKernel(const T* in, const int n, T* out) { extern __shared__ char* shared_max_data_tmp[]; auto shared_max_data = reinterpret_cast(shared_max_data_tmp); if (gridDim.x > 1) { - shared_max_data[tid] = T(0); + T local_max_data = T(0); for (int i = bid; i < n; i += blockDim.x * gridDim.x) { T tmp = abs(in[i]); - if (tmp > shared_max_data[tid]) { - shared_max_data[tid] = tmp; + if (tmp > local_max_data) { + local_max_data = tmp; } } + shared_max_data[tid] = local_max_data; } else { if (bid < n) { shared_max_data[tid] = abs(in[bid]); @@ -83,13 +84,14 @@ __global__ void FindChannelAbsMaxKernelQuantAxis0(const T* in, const int n, int channel_size = n / c; const T* in_c = in + blockIdx.x * channel_size; extern __shared__ T shared_max_data[]; - shared_max_data[tid] = T(0); + T local_max_data = T(0); for (int i = tid; i < channel_size; i += blockDim.x) { T tmp = fabs(in_c[i]); - if (tmp > shared_max_data[tid]) { - shared_max_data[tid] = tmp; + if (tmp > local_max_data) { + local_max_data = tmp; } } + shared_max_data[tid] = local_max_data; __syncthreads(); for (int i = blockDim.x / 2; i > 0; i >>= 1) { if (tid < i && (shared_max_data[tid] < shared_max_data[tid + i])) { @@ -113,13 +115,14 @@ __global__ void FindChannelAbsMaxKernelQuantAxis1(const T* in, const int n, int tid = threadIdx.x; int bid = blockIdx.x; const T* in_current = in + tid * cout_wh_size + bid * wh_size; - shared_max_data[tid] = T(0); + T local_max_data = T(0); for (int i = 0; i < wh_size; i++) { T tmp = fabs(in_current[i]); - if (tmp > shared_max_data[tid]) { - shared_max_data[tid] = tmp; + if (tmp > local_max_data) { + local_max_data = tmp; } } + shared_max_data[tid] = local_max_data; __syncthreads(); int len = blockDim.x; @@ -404,6 +407,19 @@ struct FindRangeAbsMaxFunctor { } }; +template +__global__ void FindMovingAverageAbsMaxKernel(const T* in_state, + const T* in_accum, + const T* cur_scale, const T rate, + T* out_state, T* out_accum, + T* out_scale) { + T state = rate * (*in_state) + T(1.0f); + T accum = rate * (*in_accum) + (*cur_scale); + *out_state = state; + *out_accum = accum; + *out_scale = accum / state; +} + template struct FindRangeAbsMaxFunctor; template @@ -415,29 +431,14 @@ struct FindMovingAverageAbsMaxFunctor { framework::Tensor* out_accum, framework::Tensor* out_scale) { const auto gpu_place = ctx.GetPlace(); - T accum; - T state; - T scale; - memory::Copy(platform::CPUPlace(), &accum, gpu_place, in_accum.data(), - sizeof(T), 
ctx.stream()); - memory::Copy(platform::CPUPlace(), &state, gpu_place, in_state.data(), - sizeof(T), ctx.stream()); - memory::Copy(platform::CPUPlace(), &scale, gpu_place, cur_scale, sizeof(T), - ctx.stream()); - ctx.Wait(); - T rate_t = static_cast(rate); - state = rate_t * state + static_cast(1.0); - accum = rate_t * accum + scale; - scale = accum / state; - - memory::Copy(gpu_place, out_accum->mutable_data(gpu_place), - platform::CPUPlace(), &accum, sizeof(T), ctx.stream()); - memory::Copy(gpu_place, out_state->mutable_data(gpu_place), - platform::CPUPlace(), &state, sizeof(T), ctx.stream()); - memory::Copy(gpu_place, out_scale->mutable_data(gpu_place), - platform::CPUPlace(), &scale, sizeof(T), ctx.stream()); - ctx.Wait(); + T* out_state_data = out_state->mutable_data(gpu_place); + T* out_accum_data = out_accum->mutable_data(gpu_place); + T* out_scale_data = out_scale->mutable_data(gpu_place); + + FindMovingAverageAbsMaxKernel<<<1, 1, 0, ctx.stream()>>>( + in_state.data(), in_accum.data(), cur_scale, rate_t, + out_state_data, out_accum_data, out_scale_data); } }; diff --git a/paddle/fluid/operators/filter_by_instag_op.cu b/paddle/fluid/operators/filter_by_instag_op.cu index 508730c3c7335dbad8cf70417d2c19be4a8480a2..7870efba4e7a1a285bbd4b28b04c2b15f263c347 100644 --- a/paddle/fluid/operators/filter_by_instag_op.cu +++ b/paddle/fluid/operators/filter_by_instag_op.cu @@ -96,30 +96,6 @@ __global__ void filter_copy_fuse_kernel( if (N < ins_end) ins_end = N; - /* - if (!x1_lods_filled) { - for (int p = ins_start; p < ins_end; p++) { - x1_lods_data[p] = p; - } - if (idx == 0) { - x1_lods_data[N] = N; - } - } - - if (!x2_lods_filled) { - for (int p = ins_start; p < ins_end; p++) { - x2_lods_data[p] = p; - } - if (idx == 0) { - x2_lods_data[N] = N; - } - } - - if (!x1_lods_filled || !x2_lods_filled) { - b.sync(); - } - */ - int flag_data[5]; int prefix_sum_data[5]; int prefix_sum_data2[5]; @@ -173,8 +149,6 @@ __global__ void filter_copy_fuse_kernel( local_addr = prefix_sum_data[ins_end - 1 - ins_start]; sum_addr = local_addr; - // flag - // local_flag = 0; for (int p = ins_start; p < ins_end; p++) { local_flag += flag_data[p - ins_start]; } @@ -188,7 +162,6 @@ __global__ void filter_copy_fuse_kernel( sum_out_lods = local_out_lods; } - // 32 threads for (int i = 1; i < warp_thread_num; i *= 2) { int temp_addr = g.shfl_up(sum_addr, i); int temp_flag = g.shfl_up(sum_flag, i); @@ -266,27 +239,16 @@ __global__ void filter_copy_fuse_kernel( if (ins_start < ins_end) { int out_lods_idx = p_flag + 1; - - // ins_start = 1 - // BUG fix for (int p = ins_start; p < ins_end; p++) { if (flag_data[p - ins_start] == 1) { - // batch_len = 2 - // batch_len = 4 size_t batch_len = x1_lods_data[p + 1] - x1_lods_data[p]; - // t = 0 - // t = 1 int t = out_lods_idx - 1; - // out_lods_data[0] = 0; int previous; - if (out_lods_idx == p_flag + 1) { - // out_lods_data[t] = p_out_lods; previous = p_out_lods; } else { previous = out_lods_data[t]; } - map_data[t * 3] = (int64_t)previous; map_data[t * 3 + 1] = x1_lods_data[p]; map_lods_data[t] = t; @@ -300,7 +262,6 @@ __global__ void filter_copy_fuse_kernel( if (sum_out_lods4 > 1) { int out_data_num = sum_out_lods4 - 1; int out_start = ins_start; - if (out_start < out_data_num) { int out_end = ins_end >= out_data_num ? 
out_data_num : ins_end; for (int p = out_start; p < out_end; p++) { @@ -314,11 +275,8 @@ __global__ void filter_copy_fuse_kernel( if (flag_data[p - ins_start] == 1) { auto output_start_idx = prefix_sum_data2[p - ins_start]; T* dst = out_data + output_start_idx * x1_embed_size; - const T* src_start = x1_data + x1_lods_data[p] * x1_embed_size; const T* src_end = x1_data + x1_lods_data[p + 1] * x1_embed_size; - - // optimized for (const T *j = src_start; j != src_end; dst++, j++) { *dst = *j; } @@ -338,12 +296,10 @@ __global__ void copy_grad_kernel(const size_t N, const int ins_per_thread, int idx = blockIdx.x * blockDim.x + threadIdx.x; int ins_start = idx * ins_per_thread; int ins_end = (idx + 1) * ins_per_thread; - if (ins_start >= N) { return; } if (ins_end > N) ins_end = N; - for (int p = ins_start; p < ins_end; p++) { T* dst = x1_grad_data + map_data[p * 3 + 1] * x1_embed_size; const T* src_start = out_grad_data + map_data[p * 3] * x1_embed_size; @@ -394,21 +350,17 @@ class FilterByInstagGPUKernel : public framework::OpKernel { const Tensor* x3 = context.Input("Filter_tag"); const int64_t* x3_data = x3->data(); - // int x2_lods_filled = 1; - Vector x2_lods; - // Vector, in GPU if (x2->lod().size() != 0) { // lod_level = 1 x2_lods = x2->lod()[0]; - // x2_lods_filled = 1; - } else { // lod_level = 0 const size_t x2_lods_size = x2->dims()[0]; + const size_t instag_per_num = x2->dims()[1]; // x2_lods.resize(x2->dims()[0] + 1); // move to cuda x2_lods.push_back(0); for (size_t i = 0; i < x2_lods_size; i++) { - x2_lods.push_back(i + 1); + x2_lods.push_back(x2_lods.back() + instag_per_num); } } @@ -417,13 +369,8 @@ class FilterByInstagGPUKernel : public framework::OpKernel { size_t* x2_lods_data = mixv_x2_lods.CUDAMutableData(gpu_place); - // Vector, in GPU - // int x1_lods_filled = 1; Vector x1_lods; - if (!is_x1_lod) { - // move to cuda - // x1_lods.resize(x1->dims()[0] + 1); x1_lods.push_back(0); for (int i = 0; i < x1->dims()[0]; i++) { x1_lods.push_back(i + 1); @@ -432,7 +379,6 @@ class FilterByInstagGPUKernel : public framework::OpKernel { // x1_lods = context.Input("Ins")->lod()[0]; // new: lod_level=0 => lod() return {} if (x1->lod().size() != 0) { // lod_level = 1 - // x1_lods_filled = 1; x1_lods = x1->lod()[0]; } else { // lod_level = 0 // x1_lods.resize(x1->dims()[0] + 1); @@ -458,10 +404,6 @@ class FilterByInstagGPUKernel : public framework::OpKernel { LoDTensor* loss_weight = context.Output("LossWeight"); int out_first = x1_lods.back(); - // int out_first = x1->dims()[0]; - // if (x1_lods_filled) { - // out_first = x1_lods.back(); - // } out->Resize(phi::make_ddim({(int64_t)out_first, (int64_t)x1_embed_size})); map->Resize(phi::make_ddim({(int64_t)x2_lods_size, 3})); diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h index 5ef13b38c8a86e16cefdc97be6934b313fdb7bc4..feae954e355b85f5a18f8a48919770fd46a73f70 100644 --- a/paddle/fluid/operators/flatten_op.h +++ b/paddle/fluid/operators/flatten_op.h @@ -16,7 +16,6 @@ limitations under the License. 
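The x2_lods change above fixes the lod_level == 0 branch: each instance owns instag_per_num tags (x2->dims()[1]), so the offsets must advance by that stride instead of by one. A standalone sketch of the corrected offset construction, using std::vector in place of the framework's mixed vector type:

    #include <cstddef>
    #include <vector>

    // Offsets for a dense tag tensor of shape [num_instances, instag_per_num]:
    // instance i covers tag positions [lod[i], lod[i + 1]).
    std::vector<size_t> BuildTagOffsets(size_t num_instances,
                                        size_t instag_per_num) {
      std::vector<size_t> lod;
      lod.reserve(num_instances + 1);
      lod.push_back(0);
      for (size_t i = 0; i < num_instances; ++i) {
        // The previous code pushed i + 1, which is only correct when
        // instag_per_num == 1.
        lod.push_back(lod.back() + instag_per_num);
      }
      return lod;
    }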
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/phi_utils.h" -#include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/flatten_grad_kernel.h" diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cc b/paddle/fluid/operators/fused/conv_fusion_op.cc index c445a28c084f67f2688e17994cb622903b73c707..e60fc44e9a6ffc106a9c6957c2365e7b44c467b9 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cc +++ b/paddle/fluid/operators/fused/conv_fusion_op.cc @@ -120,6 +120,142 @@ class Conv2DFusionOp : public operators::ConvOp { ctx->SetOutputsDim("Outputs", output_shapes); } } + + std::vector ComputeOutputShape( + framework::InferShapeContext* ctx) const { + OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Conv"); + OP_INOUT_CHECK(ctx->HasInput("Filter"), "Input", "Filter", "Conv"); + + auto in_dims = ctx->GetInputDim("Input"); + auto filter_dims = ctx->GetInputDim("Filter"); + + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + std::string padding_algorithm = + ctx->Attrs().Get("padding_algorithm"); + int groups = ctx->Attrs().Get("groups"); + std::vector dilations = + ctx->Attrs().Get>("dilations"); + int dilation_size = dilations.size(); + for (int i = 0; i < dilation_size; ++i) { + PADDLE_ENFORCE_GT( + dilations[i], 0, + platform::errors::InvalidArgument( + "The dilation of Op(Conv) should be larget than 0, but received " + "dilation is %d.", + dilations[i])); + } + const std::string data_format = + ctx->Attrs().Get("data_format"); + + // MKL-DNN Kernels are using NCHW order of dims description + // so we ignore data_format consideration for MKL-DNN kernel + const bool channel_last = (ctx->IsRunMKLDNNKernel() == false) && + (data_format == "NHWC" || data_format == "NDHWC"); + + PADDLE_ENFORCE_EQ( + in_dims.size() == 4 || in_dims.size() == 5, true, + platform::errors::InvalidArgument( + "The input of Op(Conv) should be a 4-D or 5-D Tensor. But " + "received: input's dimension is %u, input's shape is [%s].", + in_dims.size(), in_dims)); + + PADDLE_ENFORCE_EQ( + in_dims.size(), filter_dims.size(), + platform::errors::InvalidArgument( + "The input's dimension and filter's dimension of " + "Op(Conv) should be equal. But received: the input's shape is " + "[%s], " + "the input's dimension is %d; the filter's shape is [%s], " + "the filter's dimension is %d.", + in_dims, in_dims.size(), filter_dims, filter_dims.size())); + + int stride_size = strides.size(); + for (int i = 0; i < stride_size; ++i) { + PADDLE_ENFORCE_GT( + strides[i], 0, + platform::errors::InvalidArgument( + "The stride of Op(Conv) should be larget than 0, but received " + "stride is %d.", + strides[i])); + } + + int in_sub_stride_size = in_dims.size() - stride_size; + PADDLE_ENFORCE_EQ( + in_dims.size(), strides.size() + 2U, + platform::errors::InvalidArgument( + "The difference of input's dimension and Attr(strides)'s " + "length must be euqal to 2 for Op(Conv). " + "But received: input's dimension is %d, input's shape is [%s]; " + "Attr(stride)'s length is %d, Attr(stride) is [%s]; " + "difference of input's dimention and Attr(strides)'s length = %u.", + in_dims.size(), in_dims, strides.size(), phi::make_ddim(strides), + in_sub_stride_size)); + + const auto input_channels = + channel_last ? 
in_dims[in_dims.size() - 1] : in_dims[1]; + + PADDLE_ENFORCE_EQ( + input_channels, filter_dims[1] * groups, + platform::errors::InvalidArgument( + "The number of input's channels should be equal to filter's " + "channels " + "* groups for Op(Conv). But received: the input's channels is %d, " + "the input's shape is [%s]; the filter's channels is %d, the " + "filter's shape is [%s]; the groups is %d, the data_format is %s. " + "The error may come from wrong data_format setting.", + input_channels, in_dims, filter_dims[1], filter_dims, groups, + data_format)); + PADDLE_ENFORCE_EQ( + filter_dims[0] % groups, 0, + platform::errors::InvalidArgument( + "The number of output's channels (filter's first dimension) of " + "Op(Conv) should be divided by groups. But received: " + "the output channels is %d, the filter's shape is [%s], " + "the groups is %d.", + filter_dims[0], filter_dims, groups)); + + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_GT( + filter_dims[0], 0, + platform::errors::InvalidArgument( + "the size of filter at axis 0 should be greater than 0")); + } + + framework::DDim in_data_dims; + if (channel_last) { + in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); + } else { + in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); + } + + framework::DDim filter_data_dims = + phi::slice_ddim(filter_dims, 2, filter_dims.size()); + + std::vector ksize = phi::vectorize(filter_data_dims); + UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, + in_data_dims, strides, ksize); + + std::vector output_shape({in_dims[0]}); + if (!channel_last) { + output_shape.push_back(filter_dims[0]); + } + for (int i = 0; i < in_data_dims.size(); ++i) { + if ((!ctx->IsRuntime()) && + (in_data_dims[i] <= 0 || filter_dims[i + 2] <= 0)) { + output_shape.push_back(-1); + } else { + output_shape.push_back( + ConvOutputSize(in_data_dims[i], filter_data_dims[i], dilations[i], + paddings[2 * i], paddings[2 * i + 1], strides[i])); + } + } + if (channel_last) { + output_shape.push_back(filter_dims[0]); + } + + return output_shape; + } }; // TODO(qingqing): add gradient operator for conv2d_fusion diff --git a/paddle/fluid/operators/fused/fused_dropout_test.h b/paddle/fluid/operators/fused/fused_dropout_test.h index 18c7187fc8e64c9fed8a86a984954b5420c1e5b5..a9b72a9cdf397f026f6ce24d83cc13066a3fd000 100644 --- a/paddle/fluid/operators/fused/fused_dropout_test.h +++ b/paddle/fluid/operators/fused/fused_dropout_test.h @@ -25,14 +25,16 @@ limitations under the License. 
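ComputeOutputShape above normalizes paddings and dilations with UpdatePaddingAndDilation and then derives each spatial extent via ConvOutputSize. For reference, a free-standing sketch of that arithmetic, the standard convolution output-size formula rather than the framework's exact helper:

    #include <cassert>

    int ConvOutputSizeSketch(int input_size, int filter_size, int dilation,
                             int pad_left, int pad_right, int stride) {
      const int dkernel = dilation * (filter_size - 1) + 1;  // dilated kernel extent
      const int output_size =
          (input_size + pad_left + pad_right - dkernel) / stride + 1;
      assert(output_size > 0);  // the operator raises a detailed error instead
      return output_size;
    }

    // Example: input 224, filter 3, dilation 1, padding 1/1, stride 2 gives
    // (224 + 1 + 1 - 3) / 2 + 1 = 112.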
*/ #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" #include "paddle/fluid/string/printf.h" +#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/layer_norm_kernel.h" namespace framework = paddle::framework; namespace platform = paddle::platform; namespace memory = paddle::memory; USE_OP_ITSELF(dropout); -USE_OP(layer_norm); +USE_OP_ITSELF(layer_norm); template using CudnnDataType = platform::CudnnDataType; @@ -136,18 +138,23 @@ void LayerNorm(const std::vector> &scale, const platform::CUDADeviceContext &ctx) { framework::Scope scope; auto place = ctx.GetPlace(); + paddle::optional scale_opt = paddle::none; if (scale.size() > 0) { auto var_scale = scope.Var("Scale"); auto tensor_scale = var_scale->GetMutable(); framework::TensorFromVector(scale, ctx, tensor_scale); tensor_scale->Resize({cols}); + scale_opt = *tensor_scale; } + paddle::optional bias_opt = paddle::none; if (bias.size() > 0) { auto var_bias = scope.Var("Bias"); auto tensor_bias = var_bias->GetMutable(); framework::TensorFromVector(bias, ctx, tensor_bias); tensor_bias->Resize({cols}); + + bias_opt = *tensor_bias; } auto var_x = scope.Var("X"); @@ -157,20 +164,19 @@ void LayerNorm(const std::vector> &scale, auto var_y = scope.Var("Y"); auto tensor_y = var_y->GetMutable(); + tensor_y->Resize({rows, cols}); auto var_mean = scope.Var("Mean"); auto tensor_mean = var_mean->GetMutable(); + tensor_mean->Resize({rows}); auto var_variance = scope.Var("Variance"); auto tensor_variance = var_variance->GetMutable(); - - framework::AttributeMap attrs; - attrs.insert({"epsilon", epsilon}); - - auto op = framework::OpRegistry::CreateOp( - "layer_norm", {{"X", {"X"}}, {"Scale", {"Scale"}}, {"Bias", {"Bias"}}}, - {{"Y", {"Y"}}, {"Mean", {"Mean"}}, {"Variance", {"Variance"}}}, attrs); - op->Run(scope, place); + tensor_variance->Resize({rows}); + ctx.Wait(); + phi::LayerNormKernel(static_cast(ctx), *tensor_x, + scale_opt, bias_opt, 1e-5, 1, false, tensor_y, + tensor_mean, tensor_variance); framework::TensorToVector(*tensor_y, ctx, y); framework::TensorToVector(*tensor_mean, ctx, means); framework::TensorToVector(*tensor_variance, ctx, vars); diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu index 032440d7f0478dc087e3ba38274f2a31a9a66a23..c7e1f4a5463fe11b9fa96f147b71004140130399 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu @@ -198,7 +198,6 @@ struct TestFusedLayernormResidualDropoutBias { residual_vec[i * cols + j] + out2[i * cols + j]; } } - LayerNorm(scale_vec, layernorm_bias_vec, correct_out, &correct_means, &correct_vars, &correct_layernorm_out, epsilon, rows, cols, *ctx); diff --git a/paddle/fluid/operators/gather_nd_op.cc b/paddle/fluid/operators/gather_nd_op.cc index e5ca15a39ef51f7807246c2ee1d473a0499b6463..7d7d6ae81a0935402f94cbc16e31fbba8009ce9c 100644 --- a/paddle/fluid/operators/gather_nd_op.cc +++ b/paddle/fluid/operators/gather_nd_op.cc @@ -16,7 +16,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/infermeta/backward.h" #include "paddle/phi/infermeta/binary.h" -#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index 8a405cc6fc1baefe997fb5b6133a56d6a2fc0438..9f2b48a24b44700dc93e9eba09ea2dd2a900bdfa 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -12,12 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/gather_op.h" #include #include #include + +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -26,58 +31,6 @@ class GatherOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of GatherOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true, - platform::errors::InvalidArgument( - "Input(Index) of GatherOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of GatherOp should not be null.")); - - auto index_dims = ctx->GetInputDim("Index"); - - if (index_dims.size() == 2) { - PADDLE_ENFORCE_EQ( - index_dims[1], 1, - platform::errors::InvalidArgument( - "The last dim of index should be 1 when it is 2D, but we get %d", - index_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - index_dims.size(), 1, - platform::errors::InvalidArgument( - "The index should be 1D, when it is not 2D, but we get %d", - index_dims.size())); - } - - auto axis = ctx->Attrs().Get("axis"); - auto input_dim = ctx->GetInputDim("X"); - if (ctx->HasInput("Axis") || axis == 0) { - // if HasInput("Axis"), we can not obtain correct shape of output - int batch_size = index_dims[0]; - framework::DDim output_dims(input_dim); - output_dims[0] = batch_size; - ctx->SetOutputDim("Out", output_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } else { - int index_size = index_dims[0]; - std::vector out_dim_vec; - for (int i = 0; i < axis; i++) { - out_dim_vec.push_back(input_dim[i]); - } - out_dim_vec.push_back(index_size); - for (int i = axis + 1; i < input_dim.size(); i++) { - out_dim_vec.push_back(input_dim[i]); - } - auto output_dims = phi::make_ddim(out_dim_vec); - ctx->SetOutputDim("Out", output_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -100,11 +53,6 @@ class GatherGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - ctx->ShareLoD("X", /*-->*/ framework::GradVarName("X")); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { 
@@ -193,22 +141,18 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(GatherGradNoNeedBufferVarInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(gather, GatherInferShapeFunctor, + PD_INFER_META(phi::GatherInferMeta)); REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker, ops::GatherGradOpMaker, - ops::GatherGradOpMaker); + ops::GatherGradOpMaker, + GatherInferShapeFunctor); +DECLARE_INFER_SHAPE_FUNCTOR(gather_grad, GatherGradInferShapeFunctor, + PD_INFER_META(phi::GeneralUnaryGradInferMeta)); REGISTER_OPERATOR(gather_grad, ops::GatherGradOp, - ops::GatherGradNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel, - ops::GatherOpKernel, ops::GatherOpKernel, - ops::GatherOpKernel, - ops::GatherOpKernel, - ops::GatherOpKernel); -REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel); + ops::GatherGradNoNeedBufferVarInferer, + GatherGradInferShapeFunctor); + REGISTER_OP_VERSION(gather) .AddCheckpoint(R"ROC(upgrad gather, add a new input [Axis])ROC", paddle::framework::compatible::OpVersionDesc().NewInput( diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu deleted file mode 100644 index e0db2f26d3e0534f924cc709b98689fb3f1a5cc6..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/gather_op.cu +++ /dev/null @@ -1,152 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
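The InferShape override deleted from gather_op.cc above is now provided through phi::GatherInferMeta; the shape rule it encodes is simple. A self-contained restatement mirroring the deleted code:

    #include <cstdint>
    #include <vector>

    // For axis == 0 (or when a runtime Axis input is present) only dim 0 of X
    // changes; otherwise dim `axis` is replaced by the number of indices.
    std::vector<int64_t> GatherOutputShape(const std::vector<int64_t>& x_dims,
                                           int64_t index_size, int axis) {
      std::vector<int64_t> out;
      if (axis == 0) {
        out = x_dims;
        out[0] = index_size;
        return out;
      }
      for (int i = 0; i < axis; ++i) out.push_back(x_dims[i]);
      out.push_back(index_size);
      for (size_t i = static_cast<size_t>(axis) + 1; i < x_dims.size(); ++i) {
        out.push_back(x_dims[i]);
      }
      return out;
    }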
*/ - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/gather_op.h" -#include "paddle/phi/kernels/funcs/gather.cu.h" -#include "paddle/phi/kernels/funcs/scatter.cu.h" - -namespace paddle { -namespace operators { - -template -class GatherOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); - auto *output = ctx.Output("Out"); - - int axis = ctx.Attr("axis"); - - // get axis from tensor - if (ctx.HasInput("Axis")) { - Tensor cpu_axis; - const Tensor *axis_tensor = ctx.Input("Axis"); - framework::TensorCopy(*axis_tensor, platform::CPUPlace(), &cpu_axis); - const auto &axis_type = - framework::TransToProtoVarType(axis_tensor->dtype()); - if (axis_type == framework::proto::VarType::INT32) { - axis = static_cast(cpu_axis.data()[0]); - } else if (axis_type == framework::proto::VarType::INT64) { - axis = static_cast(cpu_axis.data()[0]); - } else if (axis_type == framework::proto::VarType::INT16) { - axis = static_cast(cpu_axis.data()[0]); - } - } - const auto &place = ctx.GetPlace(); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - const auto &dev_ctx = ctx.cuda_device_context(); - if (axis != 0) { - if (index_type == framework::proto::VarType::INT32) { - phi::funcs::GatherV2CUDAFunction(x, index, axis, output, - dev_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - phi::funcs::GatherV2CUDAFunction(x, index, axis, output, - dev_ctx); - } else if (index_type == framework::proto::VarType::INT16) { - phi::funcs::GatherV2CUDAFunction(x, index, axis, output, - dev_ctx); - } - return; - } - - output->mutable_data(ctx.GetPlace()); - if (x->numel() == 0) return; - if (index_type == framework::proto::VarType::INT32) { - phi::funcs::GPUGather(dev_ctx, *x, *index, output); - } else if (index_type == framework::proto::VarType::INT64) { - phi::funcs::GPUGather(dev_ctx, *x, *index, output); - } else if (index_type == framework::proto::VarType::INT16) { - phi::funcs::GPUGather(dev_ctx, *x, *index, output); - } - } -}; - -template -class GatherGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *index = ctx.Input("Index"); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dO = ctx.Input(framework::GradVarName("Out")); - - int axis = ctx.Attr("axis"); - if (ctx.HasInput("Axis")) { - const Tensor *axis_tensor = ctx.Input("Axis"); - Tensor cpu_axis; - framework::TensorCopy(*axis_tensor, platform::CPUPlace(), &cpu_axis); - const auto &axis_type = - framework::TransToProtoVarType(axis_tensor->dtype()); - if (axis_type == framework::proto::VarType::INT32) { - axis = static_cast(cpu_axis.data()[0]); - } else if (axis_type == framework::proto::VarType::INT64) { - axis = static_cast(cpu_axis.data()[0]); - } - } - - const auto &dev_ctx = ctx.cuda_device_context(); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (axis != 0) { - if (index_type == framework::proto::VarType::INT32) { - phi::funcs::GatherV2GradCUDAFunction(dO, index, axis, dX, - 
dev_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - phi::funcs::GatherV2GradCUDAFunction(dO, index, axis, dX, - dev_ctx); - } - return; - } - - dX->mutable_data(ctx.GetPlace()); - auto dxt = framework::EigenVector::Flatten(*dX); - auto &place = *ctx.template device_context() - .eigen_device(); - dxt.device(place) = dxt.constant(static_cast(0)); - if (dO->numel() == 0) return; - if (index_type == framework::proto::VarType::INT32) { - phi::funcs::GPUScatterAssign(dev_ctx, *dO, *index, dX, - ctx.Attr("overwrite")); - } else if (index_type == framework::proto::VarType::INT64) { - phi::funcs::GPUScatterAssign(dev_ctx, *dO, *index, dX, - ctx.Attr("overwrite")); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel); diff --git a/paddle/fluid/operators/gather_op.h b/paddle/fluid/operators/gather_op.h deleted file mode 100644 index 94de694b2f9bc484cdb60298b60d5a9433dac181..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/gather_op.h +++ /dev/null @@ -1,133 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
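The deleted gather_op.cu and gather_op.h kernels ultimately dispatch to gather / scatter helpers, and the phi kernels that replace them keep the same semantics. A plain C++ sketch of what the axis-0 path computes (the backward buffer is assumed to be zero-initialized first, as the deleted code did):

    #include <cstdint>

    // Forward: out[i, :] = x[index[i], :], where slice_size is the product of
    // the trailing dimensions of x.
    void GatherRows(const float* x, const int64_t* index, float* out,
                    int64_t index_size, int64_t slice_size) {
      for (int64_t i = 0; i < index_size; ++i) {
        const float* src = x + index[i] * slice_size;
        float* dst = out + i * slice_size;
        for (int64_t j = 0; j < slice_size; ++j) dst[j] = src[j];
      }
    }

    // Backward with overwrite == false: gradients accumulate when an index
    // repeats, dx[index[i], :] += dout[i, :].
    void ScatterAddRows(const float* dout, const int64_t* index, float* dx,
                        int64_t index_size, int64_t slice_size) {
      for (int64_t i = 0; i < index_size; ++i) {
        const float* src = dout + i * slice_size;
        float* dst = dx + index[i] * slice_size;
        for (int64_t j = 0; j < slice_size; ++j) dst[j] += src[j];
      }
    }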
*/ - -#pragma once -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/gather.h" -#include "paddle/phi/kernels/funcs/scatter.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class GatherOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); - auto *output = ctx.Output("Out"); - - int axis = ctx.Attr("axis"); - // get axis from tensor - if (ctx.HasInput("Axis")) { - const Tensor *axis_tensor = ctx.Input("Axis"); - const auto &axis_type = axis_tensor->dtype(); - if (axis_type == phi::DataType::INT32) { - axis = static_cast(axis_tensor->data()[0]); - } else if (axis_type == phi::DataType::INT64) { - axis = static_cast(axis_tensor->data()[0]); - } - } - const auto &index_type = index->dtype(); - auto &dev_ctx = ctx.template device_context(); - if (axis != 0) { - if (index_type == phi::DataType::INT32) { - phi::funcs::GatherV2Function(dev_ctx, x, index, axis, - output); - } else if (index_type == phi::DataType::INT64) { - phi::funcs::GatherV2Function(dev_ctx, x, index, axis, - output); - } - return; - } - - output->mutable_data(ctx.GetPlace()); - if (x->numel() == 0) return; - if (index_type == phi::DataType::INT32) { - phi::funcs::CPUGather(dev_ctx, *x, *index, output); - } else if (index_type == phi::DataType::INT64) { - phi::funcs::CPUGather(dev_ctx, *x, *index, output); - } - } -}; - -template -class GatherGradientOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - - auto *index = ctx.Input("Index"); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dO = ctx.Input(framework::GradVarName("Out")); - - int axis = ctx.Attr("axis"); - if (ctx.HasInput("Axis")) { - const Tensor *axis_tensor = ctx.Input("Axis"); - const auto &axis_type = axis_tensor->dtype(); - if (axis_type == phi::DataType::INT32) { - axis = static_cast(axis_tensor->data()[0]); - } else if (axis_type == phi::DataType::INT64) { - axis = static_cast(axis_tensor->data()[0]); - } - } - const auto &index_type = index->dtype(); - auto &dev_ctx = ctx.template device_context(); - - if (axis != 0) { - if (index_type == phi::DataType::INT32) { - phi::funcs::GatherV2GradFunction(dev_ctx, dO, index, axis, - dX); - } else if (index_type == phi::DataType::INT64) { - phi::funcs::GatherV2GradFunction(dev_ctx, dO, index, axis, - dX); - } - return; - } - - dX->mutable_data(ctx.GetPlace()); - auto dxt = framework::EigenVector::Flatten(*dX); - auto &place = *dev_ctx.eigen_device(); - dxt.device(place) = dxt.constant(static_cast(0)); - if (dO->numel() == 0) return; - bool overwrite = ctx.Attr("overwrite"); - - if (index_type == phi::DataType::INT32) { - if (overwrite) { - phi::funcs::ScatterAssign(dev_ctx, *dO, *index, dX); - } else { - phi::funcs::ScatterAssignAdd(dev_ctx, *dO, *index, dX); - } - } else if (index_type == phi::DataType::INT64) { - if (overwrite) { - phi::funcs::ScatterAssign(dev_ctx, *dO, *index, dX); - } else { - phi::funcs::ScatterAssignAdd(dev_ctx, *dO, *index, 
dX); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/gather_op_npu.cc b/paddle/fluid/operators/gather_op_npu.cc index a83abb245224baf837296aa6be8f6ceb96ac700c..f996b1ede2f0fdbf7739d579380d71e9dc3448e7 100644 --- a/paddle/fluid/operators/gather_op_npu.cc +++ b/paddle/fluid/operators/gather_op_npu.cc @@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/gather_op.h" #include #include #include + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/kron_op.h" #include "paddle/fluid/platform/device/npu/npu_info.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/operators/gather_op_npu_test.cc b/paddle/fluid/operators/gather_op_npu_test.cc index 3dce380360815c292153ef2bfb1a447357c90acb..b42050eabe300bea59c95c50c356d9e115c0dddf 100644 --- a/paddle/fluid/operators/gather_op_npu_test.cc +++ b/paddle/fluid/operators/gather_op_npu_test.cc @@ -24,16 +24,15 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/gather_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(gather); +USE_OP_ITSELF(gather); USE_OP_DEVICE_KERNEL(gather, NPU); -USE_OP(gather_grad); +USE_OP_ITSELF(gather_grad); USE_OP_DEVICE_KERNEL(gather_grad, NPU); template diff --git a/paddle/fluid/operators/gather_op_xpu.cc b/paddle/fluid/operators/gather_op_xpu.cc index 28f2f7d473bef308f581266bdb1925864aca4b78..6c691aa14ae77acc3c4ebc2077ea9182e4354d54 100644 --- a/paddle/fluid/operators/gather_op_xpu.cc +++ b/paddle/fluid/operators/gather_op_xpu.cc @@ -13,15 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/gather_op.h" #include #include #include + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/ddim.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template class GatherOpXPUKernel : public framework::OpKernel { using XPUType = typename XPUTypeTrait::Type; diff --git a/paddle/fluid/operators/gelu_op.cc b/paddle/fluid/operators/gelu_op.cc index 3d338f00d4fcbf4be35b2392a10c275526dc5d4b..3be2606bfc93984f918adf595b522fe6bfca72be 100644 --- a/paddle/fluid/operators/gelu_op.cc +++ b/paddle/fluid/operators/gelu_op.cc @@ -14,10 +14,11 @@ limitations under the License. 
*/ #include #include -#include - -#include "paddle/fluid/operators/gelu_op.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -29,18 +30,6 @@ class GeluOp : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(%s) of GeluOp should not be null.", "X")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(%s) of GeluOp should not be null.", "Out")); - - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -156,13 +145,10 @@ class GeluGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(gelu, GeluInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(gelu, ops::GeluOp, ops::GeluOpMaker, ops::GeluGradOpMaker, - ops::GeluGradOpMaker); + ops::GeluGradOpMaker, + GeluInferShapeFunctor); REGISTER_OPERATOR(gelu_grad, ops::GeluGradOp); -REGISTER_OP_CPU_KERNEL( - gelu, ops::GeluKernel, - ops::GeluKernel); -REGISTER_OP_CPU_KERNEL( - gelu_grad, ops::GeluGradKernel, - ops::GeluGradKernel); diff --git a/paddle/fluid/operators/gelu_op.cu b/paddle/fluid/operators/gelu_op.cu deleted file mode 100644 index ef836ab72f001a540e081d7e9975ca5ee28758be..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/gelu_op.cu +++ /dev/null @@ -1,320 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
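gelu follows the same registration migration as expand_as_v2, gather, and grid_sampler in this diff: the per-device REGISTER_OP_*_KERNEL lists disappear and shape inference moves to a functor bound to a phi InferMeta function. Schematically, for a hypothetical my_op (the macro and meta-function names are those used above; the grad-op-maker template arguments follow the usual fluid convention and are an assumption here, since this is a registration fragment rather than runnable code):

    DECLARE_INFER_SHAPE_FUNCTOR(my_op, MyOpInferShapeFunctor,
                                PD_INFER_META(phi::UnchangedInferMeta));
    REGISTER_OPERATOR(my_op, ops::MyOp, ops::MyOpMaker,
                      ops::MyOpGradOpMaker<paddle::framework::OpDesc>,
                      ops::MyOpGradOpMaker<paddle::imperative::OpBase>,
                      MyOpInferShapeFunctor);

    DECLARE_INFER_SHAPE_FUNCTOR(my_op_grad, MyOpGradInferShapeFunctor,
                                PD_INFER_META(phi::GeneralUnaryGradInferMeta));
    REGISTER_OPERATOR(my_op_grad, ops::MyOpGrad, MyOpGradInferShapeFunctor);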
*/ - -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -#include "paddle/fluid/operators/gelu_op.h" - -DECLARE_bool(use_fast_math); - -namespace paddle { -namespace operators { - -#ifdef __NVCC__ -template -static __device__ __forceinline__ float FP32FastTanh(float x) { -#if __CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000 - if (FastMode) { - float y; - asm("tanh.approx.f32 %0,%1; \n\t" : "=f"(y) : "f"(x)); - return y; - } -#endif - return tanhf(x); -} - -template -static __device__ __forceinline__ float FP32GeluFwd(float x) { - auto tanh_out = - FP32FastTanh(0.79788456f * x * (1.0f + 0.044715f * x * x)); - return x * 0.5f * (1.0f + tanh_out); -} - -template -static __device__ __forceinline__ float FP32GeluBwd(float x, float y_g) { - auto tanh_out = - FP32FastTanh(0.79788456f * x * (1.0f + 0.044715f * x * x)); - auto tmp = 0.5f * x * ((1.0f - tanh_out * tanh_out) * - (0.79788456f + 0.1070322243f * x * x)) + - 0.5f * (1.0f + tanh_out); - return tmp * y_g; -} - -template -static __global__ void FP16FastGeluFwdCUDAKernel(const __half* x, __half* y, - size_t n) { - size_t offset = - static_cast(threadIdx.x + blockIdx.x * blockDim.x) * VecSize; - size_t stride = static_cast(blockDim.x * gridDim.x) * VecSize; - for (; offset < n; offset += stride) { - using ArrT = phi::AlignedVector<__half, VecSize>; - ArrT in_arr = *reinterpret_cast(x + offset); -#pragma unroll - for (int i = 0; i < VecSize; ++i) { - float tmp = __half2float(in_arr[i]); - in_arr[i] = __float2half(FP32GeluFwd(tmp)); - } - *reinterpret_cast(y + offset) = in_arr; - } -} - -template -static __global__ void FP16FastGeluBwdCUDAKernel(const __half* x, - const __half* y_g, __half* x_g, - size_t n) { - size_t offset = - static_cast(threadIdx.x + blockIdx.x * blockDim.x) * VecSize; - size_t stride = static_cast(blockDim.x * gridDim.x) * VecSize; - for (; offset < n; offset += stride) { - using ArrT = phi::AlignedVector<__half, VecSize>; - ArrT x_in_arr = *reinterpret_cast(x + offset); - ArrT y_g_in_arr = *reinterpret_cast(y_g + offset); -#pragma unroll - for (int i = 0; i < VecSize; ++i) { - __half2 tmp_fp16_2; - tmp_fp16_2.x = x_in_arr[i]; - tmp_fp16_2.y = y_g_in_arr[i]; - float2 tmp_fp32_2 = __half22float2(tmp_fp16_2); - x_in_arr[i] = - __float2half(FP32GeluBwd(tmp_fp32_2.x, tmp_fp32_2.y)); - } - *reinterpret_cast(x_g + offset) = x_in_arr; - } -} - -static bool TryLaunchFP16FastGeluFwdVectorizeCUDAKernel( - const platform::CUDADeviceContext& dev_ctx, const __half* x, __half* y, - size_t n) { - auto is_aligned = [](const void* p, size_t alignment) { - return reinterpret_cast(p) % alignment == 0; - }; - -#define PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(__vec_size, __use_fast_math) \ - do { \ - constexpr auto kAlignment = \ - alignof(phi::AlignedVector<__half, __vec_size>); \ - if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \ - is_aligned(y, kAlignment)) { \ - size_t thread = std::min(512, dev_ctx.GetMaxThreadsPerBlock()); \ - size_t block = (n / __vec_size + thread - 1) / thread; \ - block = std::min(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \ - VLOG(10) << "Use FP16 fast gelu fwd kernel, block = " << block \ - << " , thread = " << thread; \ - FP16FastGeluFwdCUDAKernel< \ - __vec_size, \ - __use_fast_math><<>>(x, y, n); \ - return true; \ - } \ - } while (0) - - if (FLAGS_use_fast_math) { - PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(8, true); - } else { - PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(8, false); - } - -#undef PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL - 
return false; -} - -static bool TryLaunchFP16FastGeluBwdVectorizeCUDAKernel( - const platform::CUDADeviceContext& dev_ctx, const __half* x, - const __half* y_g, __half* x_g, size_t n) { - auto is_aligned = [](const void* p, size_t alignment) { - return reinterpret_cast(p) % alignment == 0; - }; - -#define PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(__vec_size, __use_fast_math) \ - do { \ - constexpr auto kAlignment = \ - alignof(phi::AlignedVector<__half, __vec_size>); \ - if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \ - is_aligned(x, kAlignment) && is_aligned(y_g, kAlignment) && \ - is_aligned(x_g, kAlignment)) { \ - size_t thread = std::min(512, dev_ctx.GetMaxThreadsPerBlock()); \ - size_t block = (n / __vec_size + thread - 1) / thread; \ - block = std::min(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \ - VLOG(10) << "Use FP16 fast gelu bwd kernel, block = " << block \ - << " , thread = " << thread; \ - FP16FastGeluBwdCUDAKernel< \ - __vec_size, \ - __use_fast_math><<>>(x, y_g, \ - x_g, n); \ - return true; \ - } \ - } while (0) - - if (FLAGS_use_fast_math) { - PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(8, true); - } else { - PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(8, false); - } - -#undef PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL - return false; -} -#endif - -template -struct GeluWithApproximateFunctor { - using MPType = typename details::MPTypeTrait::Type; - inline HOSTDEVICE T operator()(T arg_x) { - // this function is tanh approximation of gelu - MPType x = static_cast(arg_x); - MPType one = static_cast(1); - MPType half = static_cast(0.5); - MPType kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); - auto tanh_out = - tanh(kAlpha * x * (one + static_cast(GELU_CONSTANT) * x * x)); - MPType out = x * half * (one + tanh_out); - return static_cast(out); - } -}; - -template -struct GeluWithoutApproximateFunctor { - using MPType = typename details::MPTypeTrait::Type; - inline HOSTDEVICE T operator()(T arg_x) { - // actual gelu with approximation = false - MPType x = static_cast(arg_x); - return static_cast(x * normcdf(x)); - } -}; - -template -class GeluKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* out = context.Output("Out"); - auto* in = context.Input("X"); - auto approximate = context.Attr("approximate"); - out->mutable_data(in->place()); - - std::vector ins = {in}; - std::vector outs = {out}; - const auto& dev_ctx = - context.template device_context(); - - if (approximate) { -#ifdef __NVCC__ - if (std::is_same::value) { - size_t n = in->numel(); - const auto* in_ptr = reinterpret_cast(in->data()); - auto* out_ptr = reinterpret_cast<__half*>(out->data()); - if (TryLaunchFP16FastGeluFwdVectorizeCUDAKernel(dev_ctx, in_ptr, - out_ptr, n)) { - return; - } - } -#endif - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, 0, GeluWithApproximateFunctor()); - } else { - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, 0, GeluWithoutApproximateFunctor()); - } - } -}; - -template -struct GeluWithApproximateGradFunctor { - using MPType = typename details::MPTypeTrait::Type; - inline HOSTDEVICE T operator()(T arg_x, T arg_dout) { - MPType x = static_cast(arg_x); - MPType dout = static_cast(arg_dout); - MPType one = static_cast(1); - MPType half = static_cast(0.5); - MPType kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); - MPType kBeta = - kAlpha * static_cast(GELU_CONSTANT) * static_cast(3); - auto cube_x = x * x * x; - auto tanh_out = - tanh(kAlpha * ((static_cast(GELU_CONSTANT) * cube_x) + 
x)); - auto ans = - half * (one + tanh_out + - (one - tanh_out * tanh_out) * (x * kAlpha + kBeta * cube_x)); - return static_cast(ans * dout); - } -}; - -template -struct GeluWithoutApproximateGradFunctor { - using MPType = typename details::MPTypeTrait::Type; - inline HOSTDEVICE T operator()(T arg_x, T arg_dout) { - MPType x = static_cast(arg_x); - MPType dout = static_cast(arg_dout); - constexpr MPType kBeta = M_2_SQRTPI * M_SQRT1_2 * static_cast(0.5); - const MPType cdf = normcdf(x); - const MPType pdf = exp(static_cast(-0.5) * x * x) * kBeta; - return static_cast(dout * (cdf + x * pdf)); - } -}; - -template -class GeluGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* dout = - context.Input(framework::GradVarName("Out")); - auto* dx = context.Output(framework::GradVarName("X")); - auto approximate = context.Attr("approximate"); - dx->mutable_data(dout->place()); - - std::vector ins = {x, dout}; - std::vector outs = {dx}; - const auto& dev_ctx = - context.template device_context(); - if (approximate) { -#ifdef __NVCC__ - if (std::is_same::value) { - size_t n = x->numel(); - const auto* x_ptr = reinterpret_cast(x->data()); - const auto* y_g_ptr = reinterpret_cast(dout->data()); - auto* x_g_ptr = reinterpret_cast<__half*>(dx->data()); - if (TryLaunchFP16FastGeluBwdVectorizeCUDAKernel(dev_ctx, x_ptr, y_g_ptr, - x_g_ptr, n)) { - return; - } - } -#endif - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, 0, GeluWithApproximateGradFunctor()); - } else { - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, 0, GeluWithoutApproximateGradFunctor()); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - gelu, ops::GeluKernel, - ops::GeluKernel, - ops::GeluKernel); -REGISTER_OP_CUDA_KERNEL( - gelu_grad, ops::GeluGradKernel, - ops::GeluGradKernel, - ops::GeluGradKernel); diff --git a/paddle/fluid/operators/gelu_op.h b/paddle/fluid/operators/gelu_op.h deleted file mode 100644 index d4fed8a868ff9e66f64c90ab9352e824ab673217..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/gelu_op.h +++ /dev/null @@ -1,233 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
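For reference, the mathematics behind the deleted GELU functors above (the phi kernels that replace gelu_op.cu and gelu_op.h compute the same quantities). With the code's constants c = 0.044715 (GELU_CONSTANT) and alpha = M_2_SQRTPI * M_SQRT1_2 = sqrt(2/pi):

    % tanh approximation (approximate == true)
    \mathrm{gelu}(x) \approx \tfrac{1}{2}\,x\,\bigl(1 + \tanh(\alpha (x + c\,x^{3}))\bigr)

    % its derivative, with t = \tanh(\alpha (x + c x^{3})); the code's factor
    % (kAlpha * x + kBeta * x^3) equals \alpha x (1 + 3 c x^{2}), kBeta = 3 c \alpha
    \frac{d}{dx}\,\mathrm{gelu}(x) \approx \tfrac{1}{2}(1 + t)
      + \tfrac{1}{2}\,x\,(1 - t^{2})\,\alpha\,(1 + 3 c\,x^{2})

    % exact form (approximate == false) and its derivative
    \mathrm{gelu}(x) = x\,\Phi(x), \qquad
    \frac{d}{dx}\,\mathrm{gelu}(x) = \Phi(x) + x\,\phi(x)

Here Phi is the standard normal CDF (normcdf in the code) and phi its density, matching the cdf + x * pdf expression in GeluWithoutApproximateGradFunctor.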
*/ - -#pragma once -#ifndef _USE_MATH_DEFINES -#define _USE_MATH_DEFINES -#endif -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -#ifdef PADDLE_WITH_MKLDNN -#include "paddle/fluid/platform/mkldnn_helper.h" -#endif - -namespace paddle { -namespace operators { - -#define GELU_CONSTANT 0.044715 - -template -struct GeluFunctor { - template - void operator()(Device d, X x, Out out, bool approximate) const { - if (approximate) { - // gelu(x) = 0.5 * x * (1 + tanh(sqrt(2 / \pi) * (x + 0.044715 * x^{3}))) - if (std::is_same::value) { - VLOG(4) << "cast from float16 to float before computing"; - auto casted_x = x.template cast(); - auto temp = - (static_cast(M_2_SQRTPI * M_SQRT1_2) * - (casted_x + static_cast(GELU_CONSTANT) * casted_x.cube())) - .tanh(); - out.device(d) = (casted_x * static_cast(0.5) * - (static_cast(1) + temp)) - .template cast(); - } else { - auto temp = (static_cast(M_2_SQRTPI * M_SQRT1_2) * - (x + static_cast(GELU_CONSTANT) * x.cube())) - .tanh(); - out.device(d) = x * static_cast(0.5) * (static_cast(1) + temp); - } - } else { -#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ - !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) - auto x_data = x.data(); - auto out_data = out.data(); - int n = std::min(x.size(), out.size()); - - std::memset(out_data, 0, n * sizeof(T)); - phi::funcs::CBlas::AXPY(n, static_cast(M_SQRT1_2), x_data, 1, - out_data, 1); - phi::funcs::CBlas::VMERF(n, out_data, out_data, VML_LA); - for (int i = 0; i < n; i++) { - out_data[i] += static_cast(1); - } - phi::funcs::CBlas::VMUL(n, x_data, out_data, out_data); - for (int i = 0; i < n; i++) { - out_data[i] *= static_cast(0.5); - } -#else - // gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2))) - if (std::is_same::value) { - VLOG(4) << "cast from float16 to float before computing"; - auto casted_x = x.template cast(); - auto temp = (casted_x * static_cast(M_SQRT1_2)).erf(); - out.device(d) = (casted_x * static_cast(0.5) * - (static_cast(1) + temp)) - .template cast(); - } else { - auto temp = (x * static_cast(M_SQRT1_2)).erf(); - out.device(d) = x * static_cast(0.5) * (static_cast(1) + temp); - } -#endif - } - } -}; - -template -struct GeluGradFunctor { - template - void operator()(Device d, X x, dOut dout, dX dx, bool approximate) const { - if (approximate) { - if (std::is_same::value) { - VLOG(4) << "cast from float16 to float before computing"; - auto casted_x = x.template cast(); - auto casted_dout = dout.template cast(); - - const float kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); - const float kBeta = - kAlpha * static_cast(GELU_CONSTANT) * static_cast(3); - const auto y = - (kAlpha * - ((static_cast(GELU_CONSTANT) * casted_x.cube()) + casted_x)) - .tanh(); - dx.device(d) = (static_cast(0.5) * casted_dout * - (static_cast(1) + y + - (casted_x - casted_x * y.square()) * - (kAlpha + kBeta * casted_x.square()))) - .template cast(); - } else { - const T kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); - const T kBeta = - kAlpha * static_cast(GELU_CONSTANT) * static_cast(3); - const auto y = - (kAlpha * ((static_cast(GELU_CONSTANT) * x.cube()) + x)).tanh(); - dx.device(d) = static_cast(0.5) * dout * - (static_cast(1) + y + - (x - x * y.square()) * (kAlpha + kBeta * x.square())); - } - } else { -#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ - !defined(__OSX__) && 
!defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) - auto x_data = x.data(); - auto dx_data = dx.data(); - auto dout_data = dout.data(); - int n = std::min(x.size(), dx.size()); - - auto first = static_cast(std::malloc(n * sizeof(T))); - std::memset(first, 0, n * sizeof(T)); - auto second = static_cast(std::malloc(n * sizeof(T))); - std::memset(second, 0, n * sizeof(T)); - - // first = (0.5 * (1 + erf(x / sqrt(2)))) - phi::funcs::CBlas::AXPY(n, static_cast(M_SQRT1_2), x_data, 1, first, - 1); - phi::funcs::CBlas::VMERF(n, first, first, VML_LA); - for (int i = 0; i < n; i++) { - first[i] += static_cast(1); - } - phi::funcs::CBlas::SCAL(n, static_cast(0.5), first, 1); - - // second = (0.5 * 2/sqrt(pi) * 1/sqrt(2) * x * exp(-0.5 * x^2)) - phi::funcs::CBlas::VSQUARE(n, x_data, second); - phi::funcs::CBlas::SCAL(n, -static_cast(0.5), second, 1); - phi::funcs::CBlas::VEXP(n, second, second); - phi::funcs::CBlas::VMUL(n, x_data, second, second); - phi::funcs::CBlas::SCAL( - n, static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2), second, 1); - - // dx = dout * (first + second); - phi::funcs::CBlas::VADD(n, first, second, first); - phi::funcs::CBlas::VMUL(n, dout_data, first, dx_data); - - std::free(first); - std::free(second); -#else - // gelu_grad(x) = dout * 0.5 * (1 + erf(x / sqrt(2)) + x * sqrt(2 / pi) * - // exp(- x^2 / 2) - if (std::is_same::value) { - VLOG(4) << "cast from float16 to float before computing"; - auto casted_x = x.template cast(); - auto casted_dout = dout.template cast(); - auto first = static_cast(0.5) * - (static_cast(1) + - ((casted_x * static_cast(M_SQRT1_2)).erf())); - auto second = static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2) * - casted_x * - (-static_cast(0.5) * casted_x.square()).exp(); - dx.device(d) = (casted_dout * (first + second)).template cast(); - } else { - auto first = - static_cast(0.5) * - (static_cast(1) + ((x * static_cast(M_SQRT1_2)).erf())); - - auto second = static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2) * x * - (-static_cast(0.5) * x.square()).exp(); - dx.device(d) = dout * (first + second); - } -#endif - } - } -}; - -template -class GeluKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* out = context.Output("Out"); - auto* in = context.Input("X"); - auto approximate = context.Attr("approximate"); - out->mutable_data(in->place()); - - auto eigen_out = framework::EigenVector::Flatten(*out); - auto eigen_in = framework::EigenVector::Flatten(*in); - auto& place = - *context.template device_context().eigen_device(); - - GeluFunctor functor; - functor(place, eigen_in, eigen_out, approximate); - } -}; - -template -class GeluGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* dout = - context.Input(framework::GradVarName("Out")); - auto* dx = context.Output(framework::GradVarName("X")); - auto approximate = context.Attr("approximate"); - dx->mutable_data(dout->place()); - - auto eigen_x = framework::EigenVector::Flatten(*x); - auto eigen_dout = framework::EigenVector::Flatten(*dout); - auto eigen_dx = framework::EigenVector::Flatten(*dx); - auto& place = - *context.template device_context().eigen_device(); - - GeluGradFunctor functor; - functor(place, eigen_x, eigen_dout, eigen_dx, approximate); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/gelu_op_npu.cc b/paddle/fluid/operators/gelu_op_npu.cc index 
18bbc7f4929c6493db9161d0415c0728eb8689c0..c5297dd9cd404b7637c2eec79dafcc027509ddcb 100644 --- a/paddle/fluid/operators/gelu_op_npu.cc +++ b/paddle/fluid/operators/gelu_op_npu.cc @@ -15,7 +15,9 @@ limitations under the License. */ #include #include -#include "paddle/fluid/operators/gelu_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/gelu_op_npu_test.cc b/paddle/fluid/operators/gelu_op_npu_test.cc index f3ac53138328dbfad12c6d530a6517f40c658677..b132b3170756d95adfde51e6d6ce7a5f0f25ca26 100644 --- a/paddle/fluid/operators/gelu_op_npu_test.cc +++ b/paddle/fluid/operators/gelu_op_npu_test.cc @@ -30,7 +30,7 @@ limitations under the License. */ namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(gelu); +USE_OP_ITSELF(gelu); USE_OP_DEVICE_KERNEL(gelu, NPU); template diff --git a/paddle/fluid/operators/gelu_op_xpu.cc b/paddle/fluid/operators/gelu_op_xpu.cc index b8c2e9becf2950d12f87ec5d61c05f3bf0010b12..559d2448ad94525d623e24fc8fb6c5e3881b58e3 100644 --- a/paddle/fluid/operators/gelu_op_xpu.cc +++ b/paddle/fluid/operators/gelu_op_xpu.cc @@ -14,9 +14,9 @@ limitations under the License. */ #include #include - -#include "paddle/fluid/operators/gelu_op.h" - +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc index 04aa6a3e10f6e3f55f9845d1b4b6bd6aa762c016..f6d3fd898469113dcffce76a84e4c292603707c6 100644 --- a/paddle/fluid/operators/grid_sampler_op.cc +++ b/paddle/fluid/operators/grid_sampler_op.cc @@ -12,12 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/grid_sampler_op.h" #include #include + +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -27,43 +31,6 @@ using Tensor = framework::Tensor; class GridSampleOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "GridSampler"); - OP_INOUT_CHECK(ctx->HasInput("Grid"), "Input", "Grid", "GridSampler"); - OP_INOUT_CHECK(ctx->HasOutput("Output"), "Output", "Output", "GridSampler"); - - auto x_dims = ctx->GetInputDim("X"); - auto grid_dims = ctx->GetInputDim("Grid"); - PADDLE_ENFORCE_EQ(x_dims.size(), 4, - platform::errors::InvalidArgument( - "Input(X) of GridSampleOp should be 4-D Tensor, but " - "received X dimension size(%d)", - x_dims.size())); - PADDLE_ENFORCE_EQ(grid_dims.size(), 4, - platform::errors::InvalidArgument( - "Input(Grid) of GridSampleOp should be 4-D Tensor, " - "but received X dimension size(%d)", - grid_dims.size())); - if (ctx->IsRuntime() || grid_dims[3] > 0) { - PADDLE_ENFORCE_EQ( - grid_dims[3], 2, - platform::errors::InvalidArgument( - "Input(Grid) dimension[3] should be 2, but received %d", - grid_dims[3])); - } - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ( - grid_dims[0], x_dims[0], - platform::errors::InvalidArgument( - "Input(X) and Input(Grid) dimension[0] should be equal, but " - "received X dimension[0](%d) != Grid dimension[0](%d)", - x_dims[0], grid_dims[0])); - } - - ctx->SetOutputDim("Output", - {x_dims[0], x_dims[1], grid_dims[1], grid_dims[2]}); - ctx->ShareLoD("X", "Output"); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -173,18 +140,6 @@ class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker { class GridSampleOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", - framework::GradVarName("X"), "grid_sampler"); - auto input_dims = ctx->GetInputDim("X"); - auto grid_dims = ctx->GetInputDim("Grid"); - if (ctx->HasOutput(framework::GradVarName("X"))) { - ctx->SetOutputDim(framework::GradVarName("X"), input_dims); - } - if (ctx->HasOutput(framework::GradVarName("Grid"))) { - ctx->SetOutputDim(framework::GradVarName("Grid"), grid_dims); - } - } protected: framework::OpKernelType GetExpectedKernelType( @@ -224,19 +179,16 @@ class GridSampleGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(grid_sampler, GridSamplerInferShapeFunctor, + PD_INFER_META(phi::GridSampleBaseInferMeta)); REGISTER_OPERATOR(grid_sampler, ops::GridSampleOp, ops::GridSampleOpMaker, ops::GridSampleGradMaker, - ops::GridSampleGradMaker); -REGISTER_OPERATOR(grid_sampler_grad, ops::GridSampleOpGrad); - -REGISTER_OP_CPU_KERNEL( - grid_sampler, - ops::GridSampleOpKernel, - ops::GridSampleOpKernel); -REGISTER_OP_CPU_KERNEL( - grid_sampler_grad, - ops::GridSampleGradOpKernel, - ops::GridSampleGradOpKernel); + ops::GridSampleGradMaker, + 
GridSamplerInferShapeFunctor); +DECLARE_INFER_SHAPE_FUNCTOR(grid_sampler_grad, GridSamplerGradInferShapeFunctor, + PD_INFER_META(phi::GeneralBinaryGradInferMeta)); +REGISTER_OPERATOR(grid_sampler_grad, ops::GridSampleOpGrad, + GridSamplerGradInferShapeFunctor); REGISTER_OP_VERSION(grid_sampler) .AddCheckpoint( diff --git a/paddle/fluid/operators/grid_sampler_op.cu b/paddle/fluid/operators/grid_sampler_op.cu deleted file mode 100644 index a227a8e312765b4311314ea884f2c32443924fbc..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/grid_sampler_op.cu +++ /dev/null @@ -1,492 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/grid_sampler_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -static __forceinline__ __device__ bool in_bounds(int h, int w, int H, int W) { - return h >= 0 && h < H && w >= 0 && w < W; -} - -template -static __forceinline__ __device__ void atomic_add(T* data, int h, int w, int sH, - int sW, int H, int W, - T delta) { - if (in_bounds(h, w, H, W)) { - platform::CudaAtomicAdd(data + h * sH + w * sW, delta); - } -} - -template -static __forceinline__ __device__ T _unnormalize(T coord, int size, - bool align_corners) { - if (align_corners) { - return ((coord + 1.f) / 2) * (size - 1); - } else { - return ((coord + 1.f) * size - 1) / 2; - } -} - -template -static __forceinline__ __device__ T clip_indexes(T in, int max_value) { - return min(static_cast(max_value), max(in, static_cast(0))); -} - -template -static __forceinline__ __device__ T reflect_indexes(T in, int twice_low, - int twice_high) { - if (twice_low == twice_high) { - return static_cast(0); - } - T min = static_cast(twice_low) / 2; - T span = static_cast(twice_high - twice_low) / 2; - in = fabs(in - min); - T extra = fmod(in, span); - int flips = static_cast(floor(in / span)); - if (flips % 2 == 0) { - return extra + min; - } else { - return span - extra + min; - } -} - -template -static __forceinline__ __device__ T compute_positions(T coord, int size, - PaddingMode padding_mode, - bool align_corners) { - coord = _unnormalize(coord, size, align_corners); - if (padding_mode == PaddingMode::border) { - coord = clip_indexes(coord, size - 1); - } else if (padding_mode == PaddingMode::reflect) { - if (align_corners) { - coord = reflect_indexes(coord, 0, 2 * (size - 1)); - } else { - coord = reflect_indexes(coord, -1, 2 * size - 1); - } - coord = clip_indexes(coord, size - 1); - } - return coord; -} - -template -static __forceinline__ __device__ T _unnormalize_with_mask(T coord, int size, - bool align_corners, - T* grad_in) { - if (align_corners) { - *grad_in = static_cast(size - 1) / 2; - return ((coord + 1.f) 
/ 2) * (size - 1); - } else { - *grad_in = static_cast(size) / 2; - return ((coord + 1.f) * size - 1) / 2; - } -} - -template -static __forceinline__ __device__ T clip_indexes_with_mask(T in, int clip_limit, - T* grad_in) { - if (in <= static_cast(0)) { - *grad_in = static_cast(0); - return static_cast(0); - } else { - T max = static_cast(clip_limit - 1); - if (in >= max) { - *grad_in = static_cast(0); - return max; - } else { - *grad_in = static_cast(1); - return in; - } - } -} - -template -static __forceinline__ __device__ T -reflect_indexes_with_mask(T in, int twice_low, int twice_high, T* grad_in) { - if (twice_low == twice_high) { - *grad_in = static_cast(0); - return static_cast(0); - } - int grad_in_mult_; - T min = static_cast(twice_low) / 2; - T span = static_cast(twice_high - twice_low) / 2; - in = in - min; - if (in < static_cast(0)) { - grad_in_mult_ = -1; - in = -in; - } else { - grad_in_mult_ = 1; - } - T extra = fmod(in, span); - int flips = static_cast(floor(in / span)); - if (flips % 2 == 0) { - *grad_in = static_cast(grad_in_mult_); - return extra + min; - } else { - *grad_in = static_cast(-grad_in_mult_); - return span - extra + min; - } -} - -template -static __forceinline__ __device__ T -compute_positions_with_mask(T coord, int size, PaddingMode padding_mode, - bool align_corners, T* grad_in) { - T grad_clip, grad_refl; - coord = _unnormalize_with_mask(coord, size, align_corners, grad_in); - if (padding_mode == PaddingMode::border) { - coord = clip_indexes_with_mask(coord, size, &grad_clip); - *grad_in = (*grad_in) * grad_clip; - } else if (padding_mode == PaddingMode::reflect) { - if (align_corners) { - coord = reflect_indexes_with_mask(coord, 0, 2 * (size - 1), &grad_refl); - } else { - coord = reflect_indexes_with_mask(coord, -1, 2 * size - 1, &grad_refl); - } - coord = clip_indexes_with_mask(coord, size, &grad_clip); - *grad_in = (*grad_in) * grad_refl * grad_clip; - } - - return coord; -} - -template -__global__ void grid_sample_cuda_kernel(const int nthreads, int n, int out_c, - int out_h, int out_w, int in_h, - int in_w, const T* input, const T* grid, - T* output, const Mode mode, - const PaddingMode padding_mode, - bool align_corners) { - int inp_sN = out_c * in_h * in_w; - - int inp_sC = in_h * in_w; - int inp_sH = in_w; - int inp_sW = 1; - int grid_sN = out_h * out_w * 2; - int grid_sH = out_w * 2; - int grid_sW = 2; - int grid_sCoor = 1; - int out_sN = out_c * out_h * out_w; - int out_sC = out_h * out_w; - int out_sH = out_w; - int out_sW = 1; - CUDA_KERNEL_LOOP(index, nthreads) { - const int w = index % out_w; - const int h = (index / out_w) % out_h; - const int n = index / (out_h * out_w); - const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; - - T ix = grid[grid_offset]; - T iy = grid[grid_offset + grid_sCoor]; - - ix = compute_positions(ix, in_w, padding_mode, align_corners); - iy = compute_positions(iy, in_h, padding_mode, align_corners); - if (mode == Mode::bilinear) { - int ix_nw = static_cast(floor(ix)); - int iy_nw = static_cast(floor(iy)); - int ix_ne = ix_nw + 1; - int iy_ne = iy_nw; - int ix_sw = ix_nw; - int iy_sw = iy_nw + 1; - int ix_se = ix_nw + 1; - int iy_se = iy_nw + 1; - - T nw = (ix_se - ix) * (iy_se - iy); - T ne = (ix - ix_sw) * (iy_sw - iy); - T sw = (ix_ne - ix) * (iy - iy_ne); - T se = (ix - ix_nw) * (iy - iy_nw); - - auto inp_offset_NC = n * inp_sN; - - auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW; - for (int c = 0; c < out_c; - ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) { - 
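// ---------------------------------------------------------------------------
// Reference sketch (illustrative only, not part of this patch): a scalar,
// host-side rendering of what the deleted forward path computes. All names
// below are invented for the example; the real logic lived in the device
// helpers above (_unnormalize, clip_indexes, reflect_indexes) and in this
// bilinear branch.
// ---------------------------------------------------------------------------
#include <cmath>

// Map a normalized grid coordinate in [-1, 1] to a pixel index.
static float UnnormalizeCoord(float coord, int size, bool align_corners) {
  return align_corners ? (coord + 1.f) / 2.f * (size - 1)
                       : ((coord + 1.f) * size - 1.f) / 2.f;
}

// "reflection" padding: fold a coordinate back into
// [twice_low / 2, twice_high / 2] by repeated mirroring.
static float ReflectCoord(float in, int twice_low, int twice_high) {
  if (twice_low == twice_high) return 0.f;
  const float low = twice_low / 2.f;
  const float span = (twice_high - twice_low) / 2.f;
  in = std::fabs(in - low);
  const float extra = std::fmod(in, span);
  const int flips = static_cast<int>(std::floor(in / span));
  return (flips % 2 == 0) ? extra + low : span - extra + low;
}

// Bilinear blend of the four neighbours of (ix, iy): each corner is weighted
// by the area spanned to the opposite corner -- exactly the nw/ne/sw/se
// weights computed in the kernel above -- and out-of-bounds corners count as 0.
static float BilinearSample(const float* img, int h, int w, float ix, float iy) {
  const int x0 = static_cast<int>(std::floor(ix)), x1 = x0 + 1;
  const int y0 = static_cast<int>(std::floor(iy)), y1 = y0 + 1;
  auto at = [&](int y, int x) -> float {
    return (y >= 0 && y < h && x >= 0 && x < w) ? img[y * w + x] : 0.f;
  };
  const float nw = (x1 - ix) * (y1 - iy), ne = (ix - x0) * (y1 - iy);
  const float sw = (x1 - ix) * (iy - y0), se = (ix - x0) * (iy - y0);
  return at(y0, x0) * nw + at(y0, x1) * ne + at(y1, x0) * sw + at(y1, x1) * se;
}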
*out_ptr_NCHW = static_cast(0); - if (in_bounds(iy_nw, ix_nw, in_h, in_w)) { - *out_ptr_NCHW += - input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW] * nw; - } - if (in_bounds(iy_ne, ix_ne, in_h, in_w)) { - *out_ptr_NCHW += - input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW] * ne; - } - if (in_bounds(iy_sw, ix_sw, in_h, in_w)) { - *out_ptr_NCHW += - input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW] * sw; - } - if (in_bounds(iy_se, ix_se, in_h, in_w)) { - *out_ptr_NCHW += - input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW] * se; - } - } - } else if (mode == Mode::nearest) { - int ix_nearest = static_cast(std::nearbyint(ix)); - int iy_nearest = static_cast(std::nearbyint(iy)); - auto inp_offset_NC = n * inp_sN; - auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW; - for (int c = 0; c < out_c; - ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) { - if (in_bounds(iy_nearest, ix_nearest, in_h, in_w)) { - *out_ptr_NCHW = - input[inp_offset_NC + iy_nearest * inp_sH + ix_nearest * inp_sW]; - } else { - *out_ptr_NCHW = static_cast(0); - } - } - } - } -} - -template -class GridSampleOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.cuda_device_context(); - auto align_corners = ctx.Attr("align_corners"); - auto padding_mode_s = ctx.Attr("padding_mode"); - auto mode_s = ctx.Attr("mode"); - PaddingMode padding_mode; - Mode mode; - if (padding_mode_s == "border") { - padding_mode = PaddingMode::border; - } else if (padding_mode_s == "reflection") { - padding_mode = PaddingMode::reflect; - } else { - padding_mode = PaddingMode::zeros; - } - - if (mode_s == "nearest") { - mode = Mode::nearest; - } else { - mode = Mode::bilinear; - } - - auto* input = ctx.Input("X"); - auto* grid = ctx.Input("Grid"); - const int n = grid->dims()[0]; - const int out_h = grid->dims()[1]; - const int out_w = grid->dims()[2]; - const int c = input->dims()[1]; - const int in_h = input->dims()[2]; - const int in_w = input->dims()[3]; - VLOG(3) << "n: " << n << "; c: " << c << "; out_h: " << out_h - << "; out_w: " << out_w; - auto* output = ctx.Output("Output"); - auto* output_data = output->mutable_data(ctx.GetPlace()); - VLOG(3) << "out dims: " << output->dims()[0] << "; " << output->dims()[1] - << "; " << output->dims()[2] << "; " << output->dims()[3]; - int count = static_cast(n * out_h * out_w); - auto cu_stream = dev_ctx.stream(); - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(dev_ctx, count); - grid_sample_cuda_kernel< - T><<>>( - count, n, c, out_h, out_w, in_h, in_w, input->data(), - grid->data(), output_data, mode, padding_mode, align_corners); - } -}; - -template -__global__ void grid_sampler_cuda_backward_kernel( - const int nthreads, const T* grad_output, const T* input, const T* grid, - int n, int out_c, int out_h, int out_w, int in_h, int in_w, T* grad_input, - T* grad_grid, const Mode mode, const PaddingMode padding_mode, - bool align_corners) { - int inp_sN = out_c * in_h * in_w; - int inp_sC = in_h * in_w; - int inp_sH = in_w; - int inp_sW = 1; - int grid_sN = out_h * out_w * 2; - int grid_sH = out_w * 2; - int grid_sW = 2; - int grid_sCoor = 1; - - int gOut_sN = out_c * out_h * out_w; - int gOut_sC = out_h * out_w; - int gOut_sH = out_w; - int gOut_sW = 1; - - CUDA_KERNEL_LOOP(index, nthreads) { - const int w = index % out_w; - const int h = (index / out_w) % out_h; - const int n = index / (out_h * out_w); - const int grid_offset = n * grid_sN + h * 
grid_sH + w * grid_sW; - - T ix = grid[grid_offset]; - T iy = grid[grid_offset + grid_sCoor]; - - T gix_mult, giy_mult; - ix = compute_positions_with_mask(ix, in_w, padding_mode, align_corners, - &gix_mult); - iy = compute_positions_with_mask(iy, in_h, padding_mode, align_corners, - &giy_mult); - - if (mode == Mode::bilinear) { - int ix_nw = static_cast(floor(ix)); - int iy_nw = static_cast(floor(iy)); - int ix_ne = ix_nw + 1; - int iy_ne = iy_nw; - int ix_sw = ix_nw; - int iy_sw = iy_nw + 1; - int ix_se = ix_nw + 1; - int iy_se = iy_nw + 1; - - T nw = (ix_se - ix) * (iy_se - iy); - T ne = (ix - ix_sw) * (iy_sw - iy); - T sw = (ix_ne - ix) * (iy - iy_ne); - T se = (ix - ix_nw) * (iy - iy_nw); - - T gix = static_cast(0), giy = static_cast(0); - int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW; - T* gInp_ptr_NC = grad_input + n * inp_sN; - int inp_offset_NC = n * inp_sN; - for (int c = 0; c < out_c; ++c, inp_offset_NC += inp_sC, - gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) { - T gOut = grad_output[gOut_offset]; - - atomic_add(gInp_ptr_NC, iy_nw, ix_nw, inp_sH, inp_sW, in_h, in_w, - nw * gOut); - atomic_add(gInp_ptr_NC, iy_ne, ix_ne, inp_sH, inp_sW, in_h, in_w, - ne * gOut); - atomic_add(gInp_ptr_NC, iy_sw, ix_sw, inp_sH, inp_sW, in_h, in_w, - sw * gOut); - atomic_add(gInp_ptr_NC, iy_se, ix_se, inp_sH, inp_sW, in_h, in_w, - se * gOut); - - if (in_bounds(iy_nw, ix_nw, in_h, in_w)) { - T nw_val = input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW]; - gix -= nw_val * (iy_se - iy) * gOut; - giy -= nw_val * (ix_se - ix) * gOut; - } - if (in_bounds(iy_ne, ix_ne, in_h, in_w)) { - T ne_val = input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW]; - gix += ne_val * (iy_sw - iy) * gOut; - giy -= ne_val * (ix - ix_sw) * gOut; - } - if (in_bounds(iy_sw, ix_sw, in_h, in_w)) { - T sw_val = input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW]; - gix -= sw_val * (iy - iy_ne) * gOut; - giy += sw_val * (ix_ne - ix) * gOut; - } - if (in_bounds(iy_se, ix_se, in_h, in_w)) { - T se_val = input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW]; - gix += se_val * (iy - iy_nw) * gOut; - giy += se_val * (ix - ix_nw) * gOut; - } - } - - if (grad_grid != nullptr) { - T* gGrid_ptr_NHW = grad_grid + index * grid_sW; - gGrid_ptr_NHW[0] = gix_mult * gix; - gGrid_ptr_NHW[1] = giy_mult * giy; - } - } else if (mode == Mode::nearest) { - int ix_nearest = static_cast(std::nearbyint(ix)); - int iy_nearest = static_cast(std::nearbyint(iy)); - - int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW; - T* gInp_ptr_NC = grad_input + n * inp_sN; - for (int c = 0; c < out_c; - ++c, gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) { - atomic_add(gInp_ptr_NC, iy_nearest, ix_nearest, inp_sH, inp_sW, in_h, - in_w, grad_output[gOut_offset]); - } - - if (grad_grid != nullptr) { - T* gGrid_ptr_NHW = grad_grid + index * grid_sW; - gGrid_ptr_NHW[0] = static_cast(0); - gGrid_ptr_NHW[1] = static_cast(0); - } - } - } -} - -template -class GridSampleGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.cuda_device_context(); - auto align_corners = ctx.Attr("align_corners"); - auto padding_mode_s = ctx.Attr("padding_mode"); - auto mode_s = ctx.Attr("mode"); - - PaddingMode padding_mode; - Mode mode; - if (padding_mode_s == "border") { - padding_mode = PaddingMode::border; - } else if (padding_mode_s == "reflection") { - padding_mode = PaddingMode::reflect; - } else { - padding_mode = PaddingMode::zeros; - } - - if (mode_s == 
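// For reference, the bilinear branch above applies the chain rule of the
// forward blend: each corner receives weight * d_out through atomic adds into
// d_x, while the grid gradient accumulates
//   gix += corner_value * (d weight / d ix) * d_out,
//   giy += corner_value * (d weight / d iy) * d_out,
// and is finally scaled by gix_mult / giy_mult, the derivatives of the
// unnormalize + padding coordinate transform computed by
// compute_positions_with_mask. Equivalent kernels are expected to live in phi
// after this deletion.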
"nearest") { - mode = Mode::nearest; - } else { - mode = Mode::bilinear; - } - - auto* input = ctx.Input("X"); - auto* grid = ctx.Input("Grid"); - auto* output_grad = ctx.Input(framework::GradVarName("Output")); - - const int n = grid->dims()[0]; - const int out_h = grid->dims()[1]; - const int out_w = grid->dims()[2]; - const int c = input->dims()[1]; - const int in_h = input->dims()[2]; - const int in_w = input->dims()[3]; - - auto* input_grad = ctx.Output(framework::GradVarName("X")); - input_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant()( - ctx.template device_context(), - input_grad, static_cast(0)); - - T* grid_grad_data = nullptr; - if (ctx.HasOutput(framework::GradVarName("Grid"))) { - auto* grid_grad = ctx.Output(framework::GradVarName("Grid")); - grid_grad_data = grid_grad->mutable_data(ctx.GetPlace()); - } - - int count = static_cast(n * out_h * out_w); - auto cu_stream = dev_ctx.stream(); - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(dev_ctx, count); - grid_sampler_cuda_backward_kernel< - T><<>>( - count, output_grad->data(), input->data(), grid->data(), n, c, - out_h, out_w, in_h, in_w, input_grad->data(), grid_grad_data, mode, - padding_mode, align_corners); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL(grid_sampler, ops::GridSampleOpCUDAKernel, - ops::GridSampleOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL(grid_sampler_grad, - ops::GridSampleGradOpCUDAKernel, - ops::GridSampleGradOpCUDAKernel); diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h deleted file mode 100644 index 93e96694270a458844bbcabf78f2559975902c2f..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/grid_sampler_op.h +++ /dev/null @@ -1,600 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/hostdevice.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -enum class Mode { - bilinear, - nearest, -}; - -enum class PaddingMode { zeros, border, reflect }; - -using Tensor = framework::Tensor; -template -using EigenTensor = framework::EigenTensor; - -using Array3 = Eigen::DSizes; -using Array4 = Eigen::DSizes; - -template -static inline bool isInBound(T x, T y, T x_max, T y_max) { - if (x < 0 || x > x_max || y < 0 || y > y_max) { - return false; - } - return true; -} - -template -static inline void unnormalize(const platform::CPUDeviceContext& ctx, - Tensor* grid_slice, - const int max_val, // height-1 or width-1 - bool align_corners) { - auto& place = *ctx.eigen_device(); - auto grid_slice_t = EigenTensor::From(*grid_slice); - - if (!align_corners) { - auto factor = static_cast((max_val + 1) * 0.5); - grid_slice_t.device(place) = - (grid_slice_t + static_cast(1)) * factor - static_cast(0.5); - } else { - auto factor = static_cast(max_val * 0.5); - grid_slice_t.device(place) = (grid_slice_t + static_cast(1)) * factor; - } -} - -template -static inline void clip(const platform::CPUDeviceContext& ctx, - Tensor* grid_slice, - const int max_val, // height-1 or width-1 - bool align_corners, std::string padding_mode) { - auto& place = *ctx.eigen_device(); - auto grid_slice_t = EigenTensor::From(*grid_slice); - if (padding_mode == "border") { - grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast(0)) - .cwiseMin(static_cast(max_val)); - } else if (padding_mode == "reflection") { - if (align_corners) { - auto double_range = static_cast(max_val * 2); - auto grid_abs = grid_slice_t.abs(); - auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; - grid_slice_t.device(place) = extra.cwiseMin(double_range - extra); - if (max_val == 0) { - grid_slice_t.device(place) = grid_slice_t.constant(static_cast(0)); - } - } else { - auto double_range = static_cast((max_val + 1) * 2); - auto grid_abs = (grid_slice_t + static_cast(0.5)).abs(); - auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; - grid_slice_t.device(place) = - extra.cwiseMin(double_range - extra) - static_cast(0.5); - grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast(0)) - .cwiseMin(static_cast(max_val)); - } - } -} - -template -static inline void clipWithMask(const platform::CPUDeviceContext& ctx, - const int max_val, // height-1 or width-1 - bool align_corners, std::string padding_mode, - Tensor* grid_slice, Tensor* grid_scale) { - auto& place = *ctx.eigen_device(); - grid_scale->mutable_data(grid_slice->dims(), ctx.GetPlace()); - - auto grid_slice_t = EigenTensor::From(*grid_slice); - auto factor = static_cast(max_val * 0.5); - if (!align_corners) { - factor = static_cast((max_val + 1) * 0.5); - } - auto grid_scale_t = EigenTensor::From(*grid_scale).setConstant(factor); - - if (padding_mode == "border") { - // auto bounded_lo = grid_slice_t.cwiseMax(static_cast(0)); - auto res = grid_slice_t.cwiseMax(static_cast(0)) - .cwiseMin(static_cast(max_val)); - - auto in_bound = (res == grid_slice_t); - grid_scale_t.device(place) = grid_scale_t * in_bound.template cast(); - grid_slice_t.device(place) = res; - } else if (padding_mode == "reflection") { - if (align_corners) { - auto double_range = static_cast(max_val * 2); - auto is_neg = (grid_slice_t < 
static_cast(0)); - auto grid_abs = grid_slice_t.abs(); - auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; - auto one_more_flip = (extra > (double_range - extra)); - grid_scale_t.device(place) = - grid_scale_t * ((is_neg == one_more_flip).template cast() - - (is_neg != one_more_flip).template cast()); - grid_slice_t.device(place) = extra.cwiseMin(double_range - extra); - if (max_val == 0) { - grid_slice_t.device(place) = grid_slice_t.constant(static_cast(0)); - } - } else { - auto double_range = static_cast((max_val + 1) * 2); - auto grid_abs = (grid_slice_t + static_cast(0.5)).abs(); - auto is_neg = ((grid_slice_t + static_cast(0.5)) < static_cast(0)); - auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; - auto one_more_flip = (extra > (double_range - extra)); - auto reflected = - extra.cwiseMin(double_range - extra) - static_cast(0.5); - auto clipped = reflected.cwiseMax(static_cast(0)) - .cwiseMin(static_cast(max_val)); - auto in_bound = (clipped == reflected).template cast(); - grid_scale_t.device(place) = - grid_scale_t * ((is_neg == one_more_flip).template cast() - - (is_neg != one_more_flip).template cast()) * - in_bound; - grid_slice_t.device(place) = clipped; - } - } -} - -template -static void calcGridLocations(const platform::CPUDeviceContext& ctx, - const Tensor& grid, const int in_h, - const int in_w, bool align_corners, - std::string padding_mode, Tensor* grid_x, - Tensor* grid_y) { - const int n = grid.dims()[0]; - const int out_h = grid.dims()[1]; - const int out_w = grid.dims()[2]; - - // split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim - T* grid_x_data = grid_x->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - T* grid_y_data = grid_y->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - const T* grid_data = grid.data(); - for (int i = 0; i < n * out_h * out_w; i++) { - grid_x_data[i] = grid_data[2 * i]; - grid_y_data[i] = grid_data[(2 * i) + 1]; - } - - unnormalize(ctx, grid_x, in_w - 1, align_corners); - unnormalize(ctx, grid_y, in_h - 1, align_corners); - - clip(ctx, grid_x, in_w - 1, align_corners, padding_mode); - clip(ctx, grid_y, in_h - 1, align_corners, padding_mode); -} - -template -static void calcGridLocationsWithGrad(const platform::CPUDeviceContext& ctx, - const Tensor& grid, const int in_h, - const int in_w, bool align_corners, - std::string padding_mode, Tensor* grid_x, - Tensor* grid_y, Tensor* grid_x_scale, - Tensor* grid_y_scale) { - const int n = grid.dims()[0]; - const int out_h = grid.dims()[1]; - const int out_w = grid.dims()[2]; - - // split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim - T* grid_x_data = grid_x->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - T* grid_y_data = grid_y->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - - const T* grid_data = grid.data(); - for (int i = 0; i < n * out_h * out_w; i++) { - grid_x_data[i] = grid_data[2 * i]; - grid_y_data[i] = grid_data[(2 * i) + 1]; - } - - unnormalize(ctx, grid_x, in_w - 1, align_corners); - unnormalize(ctx, grid_y, in_h - 1, align_corners); - - clipWithMask(ctx, in_w - 1, align_corners, padding_mode, grid_x, - grid_x_scale); - clipWithMask(ctx, in_h - 1, align_corners, padding_mode, grid_y, - grid_y_scale); -} - -template -static void getGridPointValue(const Tensor& input, Tensor* output, - const Tensor& x, const Tensor& y) { - const int n = input.dims()[0]; - const int c = input.dims()[1]; - const int in_h = input.dims()[2]; - const int in_w = input.dims()[3]; - const int out_h = x.dims()[1]; - const int out_w = 
x.dims()[2]; - auto x_t = EigenTensor::From(x); - auto y_t = EigenTensor::From(y); - auto output_t = EigenTensor::From(*output).setConstant((T)0); - auto input_t = EigenTensor::From(input); - - for (int i = 0; i < n; i++) { - for (int k = 0; k < out_h; k++) { - for (int l = 0; l < out_w; l++) { - if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), - (T)(in_h - 1))) { - for (int j = 0; j < c; j++) { - output_t(i, j, k, l) = - input_t(i, j, static_cast(round(y_t(i, k, l))), - static_cast(round(x_t(i, k, l)))); - } - } - } - } - } -} - -template -static void allNeigbors(const platform::CPUDeviceContext& ctx, - const Tensor& input, Tensor* grid_x, Tensor* grid_y, - Tensor* x_w, Tensor* x_e, Tensor* y_n, - Tensor* y_s, // positions - Tensor* d_w, Tensor* d_e, Tensor* d_n, - Tensor* d_s, // distance - Tensor* v_wn, Tensor* v_en, Tensor* v_ws, - Tensor* v_es) { // values - auto& place = *ctx.eigen_device(); - - const int c = input.dims()[1]; - const int n = grid_x->dims()[0]; - const int out_h = grid_x->dims()[1]; - const int out_w = grid_x->dims()[2]; - // calculate coords of 4 corner points - x_w->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - x_e->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - y_n->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - y_s->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - auto x_w_t = EigenTensor::From(*x_w); - auto x_e_t = EigenTensor::From(*x_e); - auto y_n_t = EigenTensor::From(*y_n); - auto y_s_t = EigenTensor::From(*y_s); - - auto grid_x_t = EigenTensor::From(*grid_x); - auto grid_y_t = EigenTensor::From(*grid_y); - - x_w_t.device(place) = grid_x_t.floor(); - x_e_t.device(place) = x_w_t + static_cast(1); - y_n_t.device(place) = grid_y_t.floor(); - y_s_t.device(place) = y_n_t + static_cast(1); - - // calculate distances to 4 sides - d_w->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - d_e->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - d_n->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - d_s->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - auto d_w_t = EigenTensor::From(*d_w); - auto d_e_t = EigenTensor::From(*d_e); - auto d_n_t = EigenTensor::From(*d_n); - auto d_s_t = EigenTensor::From(*d_s); - d_w_t.device(place) = grid_x_t - x_w_t; - d_e_t.device(place) = x_e_t - grid_x_t; - d_n_t.device(place) = grid_y_t - y_n_t; - d_s_t.device(place) = y_s_t - grid_y_t; - - // calc 4 corner points value - v_wn->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); - v_en->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); - v_ws->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); - v_es->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); - getGridPointValue(input, v_wn, *x_w, *y_n); - getGridPointValue(input, v_en, *x_e, *y_n); - getGridPointValue(input, v_ws, *x_w, *y_s); - getGridPointValue(input, v_es, *x_e, *y_s); -} - -template -static void bilinearInter(const platform::CPUDeviceContext& ctx, - const Tensor& input, Tensor* grid_x, Tensor* grid_y, - Tensor* out) { - auto& place = *ctx.eigen_device(); - const int n = grid_x->dims()[0]; - const int out_h = grid_x->dims()[1]; - const int out_w = grid_x->dims()[2]; - const int c = input.dims()[1]; - - Tensor x_w, x_e, y_n, y_s; - Tensor d_w, d_e, d_n, d_s; - Tensor v_wn, v_en, v_ws, v_es; - - allNeigbors(ctx, input, grid_x, grid_y, &x_w, &x_e, &y_n, &y_s, &d_w, &d_e, - &d_n, &d_s, &v_wn, &v_en, &v_ws, &v_es); - - auto d_w_t = EigenTensor::From(d_w); - auto d_e_t = EigenTensor::From(d_e); - auto d_n_t = EigenTensor::From(d_n); - auto d_s_t = EigenTensor::From(d_s); - - auto d_w_scaled_t = - 
d_w_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); - auto d_e_scaled_t = - d_e_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); - auto d_n_scaled_t = - d_n_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); - auto d_s_scaled_t = - d_s_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); - auto v_wn_t = EigenTensor::From(v_wn); - auto v_en_t = EigenTensor::From(v_en); - auto v_ws_t = EigenTensor::From(v_ws); - auto v_es_t = EigenTensor::From(v_es); - auto output_t = EigenTensor::From(*out); - // bilinear interpolaetion by 4 corner points - output_t.device(place) = v_wn_t * d_e_scaled_t * d_s_scaled_t + - v_en_t * d_w_scaled_t * d_s_scaled_t + - v_ws_t * d_e_scaled_t * d_n_scaled_t + - v_es_t * d_w_scaled_t * d_n_scaled_t; -} - -template -static void nearestInter(const platform::CPUDeviceContext& ctx, - const Tensor& input, Tensor* grid_x, Tensor* grid_y, - Tensor* out) { - auto& place = *ctx.eigen_device(); - - auto grid_x_t = EigenTensor::From(*grid_x); - auto grid_y_t = EigenTensor::From(*grid_y); - grid_x_t = grid_x_t.round(); - grid_y_t = grid_y_t.round(); - getGridPointValue(input, out, *grid_x, *grid_y); -} - -template -static void gatherOutputGradToInputGrad(const Tensor& output_grad, - Tensor* input_grad, const Tensor& x, - const Tensor& y, const Tensor& d1, - const Tensor& d2) { - const int n = output_grad.dims()[0]; - const int c = output_grad.dims()[1]; - const int out_h = output_grad.dims()[2]; - const int out_w = output_grad.dims()[3]; - const int in_h = input_grad->dims()[2]; - const int in_w = input_grad->dims()[3]; - auto x_t = EigenTensor::From(x); - auto y_t = EigenTensor::From(y); - auto d1_t = EigenTensor::From(d1); - auto d2_t = EigenTensor::From(d2); - auto input_grad_t = EigenTensor::From(*input_grad); - auto output_grad_t = EigenTensor::From(output_grad); - - for (int i = 0; i < n; i++) { - for (int k = 0; k < out_h; k++) { - for (int l = 0; l < out_w; l++) { - if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), - (T)(in_h - 1))) { - for (int j = 0; j < c; j++) { - input_grad_t(i, j, static_cast(round(y_t(i, k, l))), - static_cast(round(x_t(i, k, l)))) += - output_grad_t(i, j, k, l) * d1_t(i, k, l) * d2_t(i, k, l); - } - } - } - } - } -} - -template -static void gatherOutputGradToInputGrad(const Tensor& output_grad, - Tensor* input_grad, const Tensor& x, - const Tensor& y) { - const int n = output_grad.dims()[0]; - const int c = output_grad.dims()[1]; - const int out_h = output_grad.dims()[2]; - const int out_w = output_grad.dims()[3]; - const int in_h = input_grad->dims()[2]; - const int in_w = input_grad->dims()[3]; - auto x_t = EigenTensor::From(x); - auto y_t = EigenTensor::From(y); - auto input_grad_t = EigenTensor::From(*input_grad); - auto output_grad_t = EigenTensor::From(output_grad); - for (int i = 0; i < n; i++) { - for (int k = 0; k < out_h; k++) { - for (int l = 0; l < out_w; l++) { - if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), - (T)(in_h - 1))) { - for (int j = 0; j < c; j++) { - input_grad_t(i, j, static_cast(round(y_t(i, k, l))), - static_cast(round(x_t(i, k, l)))) += - output_grad_t(i, j, k, l); - } - } - } - } - } -} - -template -static void gatherBilinearGrad(const platform::CPUDeviceContext& ctx, - const Tensor& input, const Tensor& output_grad, - Tensor* grid_x, Tensor* grid_y, - Tensor* grid_x_scale, Tensor* grid_y_scale, - Tensor* input_grad, Tensor* grid_grad) { - const int n = grid_x->dims()[0]; - const int out_h = grid_x->dims()[1]; - const int 
out_w = grid_x->dims()[2]; - const int c = input.dims()[1]; - - Tensor x_w, x_e, y_n, y_s; - Tensor d_w, d_e, d_n, d_s; - Tensor v_wn, v_en, v_ws, v_es; - - allNeigbors(ctx, input, - grid_x, // grid_x - grid_y, // grid_y - &x_w, &x_e, &y_n, &y_s, &d_w, &d_e, &d_n, &d_s, &v_wn, &v_en, - &v_ws, &v_es); - - // gather output grad value to input grad by corner point coords and weight - gatherOutputGradToInputGrad(output_grad, input_grad, x_w, y_n, d_e, d_s); - gatherOutputGradToInputGrad(output_grad, input_grad, x_w, y_s, d_e, d_n); - gatherOutputGradToInputGrad(output_grad, input_grad, x_e, y_n, d_w, d_s); - gatherOutputGradToInputGrad(output_grad, input_grad, x_e, y_s, d_w, d_n); - - auto v_wn_t = EigenTensor::From(v_wn); - auto v_en_t = EigenTensor::From(v_en); - auto v_ws_t = EigenTensor::From(v_ws); - auto v_es_t = EigenTensor::From(v_es); - - auto d_w_t = EigenTensor::From(d_w); - auto d_e_t = EigenTensor::From(d_e); - auto d_n_t = EigenTensor::From(d_n); - auto d_s_t = EigenTensor::From(d_s); - - auto output_grad_t = EigenTensor::From(output_grad); - - if (grid_grad != nullptr) { - Tensor grid_grad_x, grid_grad_y; - grid_grad_x.mutable_data({n, out_h, out_w}, ctx.GetPlace()); - grid_grad_y.mutable_data({n, out_h, out_w}, ctx.GetPlace()); - auto grid_grad_x_t = - EigenTensor::From(grid_grad_x).setConstant(static_cast(0.0)); - auto grid_grad_y_t = - EigenTensor::From(grid_grad_y).setConstant(static_cast(0.0)); - for (int i = 0; i < n; i++) { - for (int j = 0; j < c; j++) { - for (int k = 0; k < out_h; k++) { - for (int l = 0; l < out_w; l++) { - grid_grad_x_t(i, k, l) += - ((v_en_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_s_t(i, k, l) + - (v_es_t(i, j, k, l) - v_ws_t(i, j, k, l)) * d_n_t(i, k, l)) * - output_grad_t(i, j, k, l); - grid_grad_y_t(i, k, l) += - ((v_ws_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_e_t(i, k, l) + - (v_es_t(i, j, k, l) - v_en_t(i, j, k, l)) * d_w_t(i, k, l)) * - output_grad_t(i, j, k, l); - } - } - } - } - - // const T x_max = static_cast(in_w - 1); - // const T y_max = static_cast(in_h - 1); - - auto grid_x_scale_t = EigenTensor::From(*grid_x_scale); - auto grid_y_scale_t = EigenTensor::From(*grid_y_scale); - grid_grad_x_t = grid_grad_x_t * grid_x_scale_t; - grid_grad_y_t = grid_grad_y_t * grid_y_scale_t; - - // gather grid_grad [x, y] in 3rd Dim - T* grid_grad_data = grid_grad->data(); - T* grid_grad_x_data = grid_grad_x.data(); - T* grid_grad_y_data = grid_grad_y.data(); - for (int i = 0; i < n * out_h * out_w; i++) { - grid_grad_data[2 * i] = grid_grad_x_data[i]; - grid_grad_data[2 * i + 1] = grid_grad_y_data[i]; - } - } -} - -template -class GridSampleOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto align_corners = ctx.Attr("align_corners"); - auto padding_mode = ctx.Attr("padding_mode"); - auto mode = ctx.Attr("mode"); - - auto* input = ctx.Input("X"); - auto* grid = ctx.Input("Grid"); - - const int n = grid->dims()[0]; - const int out_h = grid->dims()[1]; - const int out_w = grid->dims()[2]; - const int c = input->dims()[1]; - const int in_h = input->dims()[2]; - const int in_w = input->dims()[3]; - - auto* output = ctx.Output("Output"); - output->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); - phi::funcs::SetConstant()( - ctx.template device_context(), output, - static_cast(0)); - - Tensor grid_x, grid_y; - calcGridLocations( - ctx.template device_context(), *grid, in_h, - in_w, align_corners, padding_mode, &grid_x, &grid_y); - if (mode == "bilinear") { - bilinearInter( - 
ctx.template device_context(), *input, - &grid_x, &grid_y, output); - } else if (mode == "nearest") { - auto grid_x_t = EigenTensor::From(grid_x); - auto grid_y_t = EigenTensor::From(grid_y); - grid_x_t = grid_x_t.round(); - grid_y_t = grid_y_t.round(); - getGridPointValue(*input, output, grid_x, grid_y); - } - } -}; - -template -class GridSampleGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto align_corners = ctx.Attr("align_corners"); - auto padding_mode = ctx.Attr("padding_mode"); - auto mode = ctx.Attr("mode"); - - auto* input = ctx.Input("X"); - auto* grid = ctx.Input("Grid"); - auto* output_grad = ctx.Input(framework::GradVarName("Output")); - - const int n = grid->dims()[0]; - const int out_h = grid->dims()[1]; - const int out_w = grid->dims()[2]; - const int c = input->dims()[1]; - const int in_h = input->dims()[2]; - const int in_w = input->dims()[3]; - - auto* input_grad = ctx.Output(framework::GradVarName("X")); - input_grad->mutable_data({n, c, in_h, in_w}, ctx.GetPlace()); - phi::funcs::SetConstant()( - ctx.template device_context(), input_grad, - static_cast(0)); - - Tensor* grid_grad = nullptr; - if (ctx.HasOutput(framework::GradVarName("Grid"))) { - grid_grad = ctx.Output(framework::GradVarName("Grid")); - grid_grad->mutable_data({n, out_h, out_w, 2}, ctx.GetPlace()); - phi::funcs::SetConstant()( - ctx.template device_context(), grid_grad, - static_cast(0)); - } - - Tensor grid_x, grid_y; - Tensor grid_x_scale, grid_y_scale; - calcGridLocationsWithGrad( - ctx.template device_context(), *grid, in_h, - in_w, align_corners, padding_mode, &grid_x, &grid_y, &grid_x_scale, - &grid_y_scale); - if (mode == "bilinear") { - gatherBilinearGrad(ctx.template device_context(), - *input, *output_grad, &grid_x, &grid_y, - &grid_x_scale, &grid_y_scale, input_grad, - grid_grad); - } else { - auto grid_x_t = EigenTensor::From(grid_x); - auto grid_y_t = EigenTensor::From(grid_y); - grid_x_t = grid_x_t.round(); - grid_y_t = grid_y_t.round(); - gatherOutputGradToInputGrad(*output_grad, input_grad, grid_x, grid_y); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc index 2d284fb516e62b08fb48ab96d2478675c495c6f6..4331523d26edc1012ff67e4a08f69d682753bb7a 100644 --- a/paddle/fluid/operators/group_norm_op.cc +++ b/paddle/fluid/operators/group_norm_op.cc @@ -167,9 +167,11 @@ class GroupNormGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { // check input + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "GroupNormGrad"); OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "GroupNormGrad"); OP_INOUT_CHECK(ctx->HasInput("Variance"), "Input", "Variance", "GroupNormGrad"); + OP_INOUT_CHECK(ctx->HasInput("Mean"), "Input", "Mean", "GroupNormGrad"); OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), "Input", framework::GradVarName("Y"), "GroupNormGrad"); @@ -216,10 +218,12 @@ class GroupNormGradMaker : public framework::SingleGradOpMaker { void Apply(GradOpPtr op) const override { op->SetType("group_norm_grad"); + op->SetInput("X", this->Input("X")); op->SetInput("Scale", this->Input("Scale")); op->SetInput("Bias", this->Input("Bias")); op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y")); op->SetInput("Y", this->Output("Y")); + op->SetInput("Mean", this->Output("Mean")); op->SetInput("Variance", this->Output("Variance")); 
op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index b376334f1e93cc3be9e716d808525011edb29b94..ab8c50d90b8ece68b8e4e05d46cecd13fa84d7e0 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -81,46 +81,74 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W, CudaAtomicAddWithWarp(&var[bid * groups + gid], x_var); } -template -__device__ __forceinline__ void ThreadReduce(const T* input, int size, - const int offset, AccT* mean, - AccT* var) { +template +__device__ __forceinline__ void ThreadReduce(phi::Array arrs, + int size, const int offset, + AccT* out_mean, AccT* out_var) { + const T* x = arrs[0]; + const T* y; + if (Num == 2) { + y = arrs[1]; + } using VecT = kps::details::VectorType; int tid = threadIdx.x; if (offset > 0) { - input -= offset; + x -= offset; + if (Num == 2) { + y -= offset; + } size += offset; if (tid >= offset) { - AccT temp = input[tid]; - *mean += temp; - *var += temp * temp; + if (Num == 1) { + *out_mean += x[tid]; + *out_var += x[tid] * x[tid]; + } else if (Num == 2) { + *out_mean += y[tid]; + *out_var += y[tid] * x[tid]; + } } size -= blockDim.x; - input += blockDim.x; + x += blockDim.x; + if (Num == 2) { + y += blockDim.x; + } } int remain = size % (VecSize * blockDim.x); - T ins[VecSize]; - VecT* ins_vec = reinterpret_cast(&ins); + T ins_x[VecSize]; + T ins_y[VecSize]; + VecT* ins_vec_x = reinterpret_cast(&ins_x); + VecT* ins_vec_y = reinterpret_cast(&ins_y); // vector part for (; VecSize * tid < (size - remain); tid += blockDim.x) { - *ins_vec = reinterpret_cast(input)[tid]; + *ins_vec_x = reinterpret_cast(x)[tid]; + if (Num == 2) { + *ins_vec_y = reinterpret_cast(y)[tid]; + } #pragma unroll for (int i = 0; i < VecSize; ++i) { - AccT temp = ins[i]; - *mean += temp; - *var += temp * temp; + if (Num == 1) { + *out_mean += ins_x[i]; + *out_var += ins_x[i] * ins_x[i]; + } else if (Num == 2) { + *out_mean += ins_y[i]; + *out_var += ins_y[i] * ins_x[i]; + } } } // scalar part tid = size - remain + threadIdx.x; for (; tid < size; tid += blockDim.x) { - AccT temp = input[tid]; - *mean += temp; - *var += temp * temp; + if (Num == 1) { + *out_mean += x[tid]; + *out_var += x[tid] * x[tid]; + } else if (Num == 2) { + *out_mean += y[tid]; + *out_var += y[tid] * x[tid]; + } } } @@ -148,7 +176,10 @@ __global__ void VectorizedGetMeanAndVarNCHW(const T* x, T* mean, T* var, AccT x_var = static_cast(0); const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T); x += i * size; - ThreadReduce(x, size, input_offset, &x_mean, &x_var); + phi::Array ins; + ins[0] = x; + ThreadReduce(ins, size, input_offset, &x_mean, &x_var); + x_mean = kps::details::BlockXReduce>( x_mean, kps::AddFunctor()); x_var = kps::details::BlockXReduce>( @@ -310,10 +341,12 @@ class GroupNormKernel }; template -__global__ void GroupNormBackwardGetMeanAndVar( - const T* x, const T* scale, const T* bias, const T* d_y, int N, int C, - int W, int imsize, int groups, int group_size, T epsilon, T* d_mean, - T* d_var, T* d_scale, T* d_bias, const DataLayout data_layout) { +__global__ void GroupNormBackwardGetMeanAndVar(const T* x, const T* scale, + const T* bias, const T* d_y, + int N, int C, int W, int imsize, + int groups, int group_size, + T epsilon, T* d_mean, T* d_var, + T* d_scale, T* d_bias) { int gid = blockIdx.y; int cid = blockIdx.x; int bid = blockIdx.z; @@ -329,15 +362,11 @@ __global__ void 
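// Summary of the reduction rework above: ThreadReduce is generalized from a
// single input pointer to a phi::Array of one or two pointers, so the same
// vectorized block reduction serves both directions: with one input it
// accumulates (sum of x, sum of x * x) for the forward mean/variance, and with
// two inputs it accumulates (sum of dy, sum of dy * x), i.e. the db/ds
// statistics consumed by the new NCHW backward kernels below.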
GroupNormBackwardGetMeanAndVar( for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { T val, dval; - if (data_layout == DataLayout::kNCHW) { - val = x[(bid * C + ccid) * imsize + imid] - x_bias; - dval = d_y[(bid * C + ccid) * imsize + imid]; - } else { - int hid = imid / W; - int wid = imid % W; - val = x[(bid * H + hid) * W * C + wid * C + ccid] - x_bias; - dval = d_y[(bid * H + hid) * W * C + wid * C + ccid]; - } + + int hid = imid / W; + int wid = imid % W; + val = x[(bid * H + hid) * W * C + wid * C + ccid] - x_bias; + dval = d_y[(bid * H + hid) * W * C + wid * C + ccid]; d_var_data += val * dval; d_mean_data += dval * x_scale; @@ -357,8 +386,7 @@ __global__ void GroupNormBackward(const T* x, const T* d_y, const T* scale, const T* bias, const T* var, const T* d_mean, const T* d_var, int N, int C, int W, int imsize, int groups, int group_size, - T epsilon, T* d_x, - const DataLayout data_layout) { + T epsilon, T* d_x) { int gid = blockIdx.y; int cid = blockIdx.x; int bid = blockIdx.z; @@ -379,26 +407,142 @@ __global__ void GroupNormBackward(const T* x, const T* d_y, const T* scale, if (x_scale != 0) x_scale_inv = 1.0 / x_scale; for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { - if (data_layout == DataLayout::kNCHW) { - T tmp = x[(bid * C + ccid) * imsize + imid]; - T v_y = (tmp - x_bias) * x_scale_inv; - T dly = d_y[(bid * C + ccid) * imsize + imid]; - d_x[(bid * C + ccid) * imsize + imid] = - x_var_inv * - (dly * x_scale - number_inv * d_x_var * v_y - number_inv * d_x_mean); - } else { - int hid = imid / W; - int wid = imid % W; - T tmp = x[(bid * H + hid) * W * C + wid * C + ccid]; - T v_y = (tmp - x_bias) * x_scale_inv; - T dly = d_y[(bid * H + hid) * W * C + wid * C + ccid]; - d_x[(bid * H + hid) * W * C + wid * C + ccid] = - x_var_inv * - (dly * x_scale - number_inv * d_x_var * v_y - number_inv * d_x_mean); + int hid = imid / W; + int wid = imid % W; + T tmp = x[(bid * H + hid) * W * C + wid * C + ccid]; + T v_y = (tmp - x_bias) * x_scale_inv; + T dly = d_y[(bid * H + hid) * W * C + wid * C + ccid]; + d_x[(bid * H + hid) * W * C + wid * C + ccid] = + x_var_inv * + (dly * x_scale - number_inv * d_x_var * v_y - number_inv * d_x_mean); + } +} + +template +__global__ void VectorizedGetDsDbCUDAKernel(int imsize, const T* x, const T* dy, + T* ds, T* db) { + int i = blockIdx.x; + AccT ds_sum = static_cast(0); + AccT db_sum = static_cast(0); + const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T); + x += i * imsize; + + phi::Array ins; + ins[0] = x; + ins[1] = dy; + ThreadReduce(ins, imsize, input_offset, &db_sum, + &ds_sum); + + ds_sum = kps::details::BlockXReduce>( + ds_sum, kps::AddFunctor()); + db_sum = kps::details::BlockXReduce>( + db_sum, kps::AddFunctor()); + __syncthreads(); + if (threadIdx.x == 0) { + ds[i] = ds_sum; + db[i] = db_sum; + } +} + +template +__global__ void ScalarGetDsDbCUDAKernel(int imsize, const T* x, const T* dy, + T* ds, T* db) { + const int nc = blockIdx.x; + T ds_sum = 0; + T db_sum = 0; + for (int i = threadIdx.x; i < imsize; i += blockDim.x) { + const int index = nc * imsize + i; + ds_sum += dy[index] * x[index]; + db_sum += dy[index]; + } + CudaAtomicAddWithWarp(&ds[nc], ds_sum); + CudaAtomicAddWithWarp(&db[nc], db_sum); +} + +template +__global__ void GetScaleBiasGradientCUDAKernel(int N, int C, int group, + T epsilon, const T* mean, + const T* var, const T* ds, + const T* db, T* d_scale, + T* d_bias) { + const int c = blockIdx.x * blockDim.x + threadIdx.x; + if (c < C) { + const int G = group; + const int D = C / 
G; + T sum1 = 0; + T sum2 = 0; + for (int n = 0; n < N; ++n) { + const int nc = n * C + c; + const int ng = n * G + c / D; + sum1 += (d_scale == nullptr) + ? T(0) + : ((ds[nc] - db[nc] * static_cast(mean[ng])) * + static_cast(rsqrt(var[ng] + epsilon))); + sum2 += (d_bias == nullptr) ? T(0) : db[nc]; + } + if (d_scale != nullptr) { + d_scale[c] = sum1; + } + if (d_bias != nullptr) { + d_bias[c] = sum2; } } } +template +__global__ void GetBackwardParamsCUDAKernel(int imsize, int groups, + int group_size, T epsilon, + const T* mean, const T* var, + const T* scale, const T* ds, + const T* db, T* p1, T* p2, T* p3) { + const int n = blockIdx.x; + const int g = blockIdx.y; + const int ng = n * groups + g; + T sum1 = 0; + T sum2 = 0; + T var_inv = rsqrt(var[ng] + epsilon); + for (int64_t i = threadIdx.x; i < group_size; i += blockDim.x) { + const int64_t index = ng * group_size + i; + const int64_t c = g * group_size + i; + const T scale_v = scale == nullptr ? T(1) : static_cast(scale[c]); + sum1 += ds[index] * scale_v; + sum2 += db[index] * scale_v; + const T scale_c = scale == nullptr ? T(0) : static_cast(scale[c]); + p1[index] = scale_c * var_inv; + } + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage ds_storage; + __shared__ typename BlockReduce::TempStorage db_storage; + sum1 = BlockReduce(ds_storage).Reduce(sum1, cub::Sum()); + sum2 = BlockReduce(db_storage).Reduce(sum2, cub::Sum()); + + if (threadIdx.x == 0) { + const T s = T(1) / static_cast(group_size * imsize); + const T x = (sum2 * static_cast(mean[ng]) - sum1) * + static_cast(var_inv) * static_cast(var_inv) * + static_cast(var_inv) * s; + p2[ng] = x; + p3[ng] = -x * static_cast(mean[ng]) - sum2 * static_cast(var_inv) * s; + } +} + +template +__global__ void GetXGradientCUDAKernel(int imsize, int C, int group_size, + int groups, T* p1, T* p2, T* p3, + const T* x, const T* dy, T* dx) { + int cid = blockIdx.x; + int gid = blockIdx.y; + int bid = blockIdx.z; + int ccid = bid * C + gid * group_size + cid; + int ng = bid * groups + gid; + int nc = gid * group_size + cid; + for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { + int index = (bid * C + nc) * imsize + imid; + dx[index] = p1[ccid] * dy[index] + p2[ng] * x[index] + p3[ng]; + } +} + template class GroupNormGradKernel : public framework::OpKernel { @@ -408,7 +552,9 @@ class GroupNormGradKernel const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); const float epsilon = ctx.Attr("epsilon"); - auto* x = ctx.Input("Y"); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* mean = ctx.Input("Mean"); auto* var = ctx.Input("Variance"); auto* scale = ctx.Input("Scale"); auto* bias = ctx.Input("Bias"); @@ -433,31 +579,27 @@ class GroupNormGradKernel phi::funcs::SetConstant set_zero; auto& dev_ctx = ctx.template device_context(); - Tensor temp_var; - temp_var.mutable_data(var->dims(), ctx.GetPlace()); - set_zero(dev_ctx, &temp_var, static_cast(0)); - T* temp_var_data = temp_var.data(); - - Tensor temp_mean; - temp_mean.mutable_data(var->dims(), ctx.GetPlace()); - set_zero(dev_ctx, &temp_mean, static_cast(0)); - T* temp_mean_data = temp_mean.data(); + Tensor ds, db; + ds.mutable_data({x_dims[0], C}, ctx.GetPlace()); + db.mutable_data({x_dims[0], C}, ctx.GetPlace()); + T* ds_data = ds.data(); + T* db_data = db.data(); + auto* y_data = y->data(); auto* x_data = x->data(); T* d_x_data = nullptr; if (d_x) d_x_data = d_x->data(); - auto* y_data = d_y->data(); + auto* dy_data = d_y->data(); auto* var_data = 
var->data(); + auto* mean_data = mean->data(); T* d_scale_data = nullptr; if (d_scale) { d_scale->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, d_scale, static_cast(0)); d_scale_data = d_scale->data(); } T* d_bias_data = nullptr; if (d_bias) { d_bias->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, d_bias, static_cast(0)); d_bias_data = d_bias->data(); } @@ -479,22 +621,103 @@ class GroupNormGradKernel #ifdef __HIPCC__ int block_size = std::max(std::min(256, imsize), 64); + const int block_dims = 256; #else int block_size = std::min(1024, imsize); + const int block_dims = 1024; #endif dim3 grid(group_size, groups, x_dims[0]); dim3 threads(block_size, 1, 1); int flags = (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias; - UNROLL_ALL_CASES(flags, GroupNormBackwardGetMeanAndVar, x_data, scale_data, - bias_data, y_data, x_dims[0], C, W, imsize, groups, - group_size, epsilon, temp_mean_data, temp_var_data, - d_scale_data, d_bias_data, data_layout); - if (d_x_data != nullptr) { - UNROLL_ALL_CASES(flags, GroupNormBackward, x_data, y_data, scale_data, - bias_data, var_data, temp_mean_data, temp_var_data, - x_dims[0], C, W, imsize, groups, group_size, epsilon, - d_x_data, data_layout); + if (data_layout == DataLayout::kNCHW) { + using AccT = typename details::MPTypeTrait::Type; + constexpr int vec_size = sizeof(float4) / sizeof(T); + const int max_num_threads = 1024; + int max_block_size = std::min(imsize / vec_size, max_num_threads); + int block_size_nchw = 1; + while (block_size_nchw < max_block_size) { + block_size_nchw *= 2; + } + block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize); + dim3 blocks(block_size_nchw); + if (imsize < vec_size) { + if (d_scale) { + set_zero(dev_ctx, d_scale, static_cast(0)); + } + if (d_bias) { + set_zero(dev_ctx, d_bias, static_cast(0)); + } + ScalarGetDsDbCUDAKernel< + T><<>>( + imsize, x_data, dy_data, ds_data, db_data); + } else { + VectorizedGetDsDbCUDAKernel< + T, AccT, vec_size><<>>( + imsize, x_data, dy_data, ds_data, db_data); + } + + if (d_scale || d_bias) { + const int block = 256; + GetScaleBiasGradientCUDAKernel< + T><<<(C + block - 1) / block, block, 0, dev_ctx.stream()>>>( + x_dims[0], C, groups, epsilon, mean_data, var_data, ds_data, + db_data, d_scale_data, d_bias_data); + } + + if (d_x_data != nullptr) { + // p1 * dy + p2 * x + p3, + // p1, p2, p3 represent the reverse calculation of temporary variables + // p1 = scale * var_inv + // p2 = (db * scale * mean - ds * scale) * pow(var_inv, 3) * (1/n) + // p3 = -p2 * mean[ng] - db * scale * var_inv * (1/n); + Tensor p1, p2, p3; + p1.mutable_data({x_dims[0] * C}, ctx.GetPlace()); + p2.mutable_data({x_dims[0], groups}, ctx.GetPlace()); + p3.mutable_data({x_dims[0], groups}, ctx.GetPlace()); + T* p1_data = p1.data(); + T* p2_data = p2.data(); + T* p3_data = p3.data(); + + GetBackwardParamsCUDAKernel<<< + dim3(x_dims[0], groups), block_dims, 0, dev_ctx.stream()>>>( + imsize, groups, group_size, epsilon, mean_data, var_data, + scale_data, ds_data, db_data, p1_data, p2_data, p3_data); + GetXGradientCUDAKernel<<>>( + imsize, C, group_size, groups, p1_data, p2_data, p3_data, x_data, + dy_data, d_x_data); + } + + } else { + if (d_scale) { + set_zero(dev_ctx, d_scale, static_cast(0)); + } + if (d_bias) { + set_zero(dev_ctx, d_bias, static_cast(0)); + } + + Tensor temp_var; + temp_var.mutable_data(var->dims(), ctx.GetPlace()); + set_zero(dev_ctx, &temp_var, static_cast(0)); + T* temp_var_data = temp_var.data(); + + Tensor temp_mean; + temp_mean.mutable_data(var->dims(), 
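// ---------------------------------------------------------------------------
// Reference sketch (illustrative only, not part of this patch): a scalar CPU
// version of the p1/p2/p3 factorization produced by GetBackwardParamsCUDAKernel
// for the NCHW fast path, assuming a Scale input is present. Here
// ds[n * C + c] = sum over H*W of dy * x and db[n * C + c] = sum over H*W of dy
// (the outputs of the GetDsDb kernels), D = C / groups and imsize = H * W.
// The input gradient then follows elementwise as dx = p1 * dy + p2 * x + p3.
// ---------------------------------------------------------------------------
#include <cmath>
#include <vector>

void GetBackwardParamsRef(int N, int C, int groups, int imsize, float epsilon,
                          const std::vector<float>& mean,   // [N * groups]
                          const std::vector<float>& var,    // [N * groups]
                          const std::vector<float>& scale,  // [C]
                          const std::vector<float>& ds,     // [N * C]
                          const std::vector<float>& db,     // [N * C]
                          std::vector<float>* p1,           // [N * C]
                          std::vector<float>* p2,           // [N * groups]
                          std::vector<float>* p3) {         // [N * groups]
  const int D = C / groups;
  for (int n = 0; n < N; ++n) {
    for (int g = 0; g < groups; ++g) {
      const int ng = n * groups + g;
      const float var_inv = 1.f / std::sqrt(var[ng] + epsilon);
      float sum1 = 0.f, sum2 = 0.f;
      for (int i = 0; i < D; ++i) {
        const int c = g * D + i;
        sum1 += ds[n * C + c] * scale[c];  // scale-weighted sum of dy * x
        sum2 += db[n * C + c] * scale[c];  // scale-weighted sum of dy
        (*p1)[n * C + c] = scale[c] * var_inv;
      }
      const float s = 1.f / static_cast<float>(D * imsize);
      const float t = (sum2 * mean[ng] - sum1) * var_inv * var_inv * var_inv * s;
      (*p2)[ng] = t;
      (*p3)[ng] = -t * mean[ng] - sum2 * var_inv * s;
    }
  }
}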
ctx.GetPlace()); + set_zero(dev_ctx, &temp_mean, static_cast(0)); + T* temp_mean_data = temp_mean.data(); + + int flags = (scale_data != nullptr) * kHasScale + + (bias_data != nullptr) * kHasBias; + UNROLL_ALL_CASES(flags, GroupNormBackwardGetMeanAndVar, y_data, + scale_data, bias_data, dy_data, x_dims[0], C, W, imsize, + groups, group_size, epsilon, temp_mean_data, + temp_var_data, d_scale_data, d_bias_data); + if (d_x_data != nullptr) { + UNROLL_ALL_CASES(flags, GroupNormBackward, y_data, dy_data, scale_data, + bias_data, var_data, temp_mean_data, temp_var_data, + x_dims[0], C, W, imsize, groups, group_size, epsilon, + d_x_data); + } } } }; diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index 9575ab54b32bda9292e5d266010484a34eae3e54..93f0d3d334f271ec7e40e38e9d654ad7f8ba3c59 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -12,9 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/hierarchical_sigmoid_op.h" #include #include + +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/multiary.h" + namespace paddle { namespace operators { @@ -60,31 +64,6 @@ namespace operators { class HierarchicalSigmoidOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "hsigmoid"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "hsigmoid"); - OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", "hsigmoid"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "hsigmoid"); - OP_INOUT_CHECK(ctx->HasOutput("PreOut"), "Output", "PreOut", "hsigmoid"); - - auto with_prefetch = ctx->Attrs().Get("remote_prefetch"); - if (with_prefetch) { - OP_INOUT_CHECK(ctx->HasOutput("W_Out"), "Output", "W_Out", "hsigmoid"); - } - const int64_t input_dims = ctx->GetInputDim("X")[0]; - const int64_t label_dims = ctx->GetInputDim("Label")[0]; - PADDLE_ENFORCE_EQ(input_dims, label_dims, - platform::errors::InvalidArgument( - "The first dimension of " - "input and label is expected to be the same. 
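// As with grid_sampler above, the hand-written InferShape removed here is
// replaced below by DECLARE_INFER_SHAPE_FUNCTOR + PD_INFER_META, which wraps a
// phi InferMeta function (phi::HierarchicalSigmoidInferMeta) into a functor
// passed to REGISTER_OPERATOR; the dimension checks and output-shape logic
// deleted from this operator are expected to be carried by that InferMeta
// function instead.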
" - "But received input's first dimension is %d; " - "label's first dimension is %d.", - input_dims, label_dims)); - - std::vector output_shape({input_dims, 1}); - ctx->SetOutputDim("Out", phi::make_ddim(output_shape)); - ctx->ShareLoD("X", /*->*/ "Out"); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -272,22 +251,14 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER( } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR( - hierarchical_sigmoid, ops::HierarchicalSigmoidOp, - ops::HierarchicalSigmoidOpMaker, - ops::HierarchicalSigmoidGradMaker, - ops::HierarchicalSigmoidGradMaker); +DECLARE_INFER_SHAPE_FUNCTOR(hierarchical_sigmoid, + HierarchicalSigmoidInferShapeFunctor, + PD_INFER_META(phi::HierarchicalSigmoidInferMeta)); +REGISTER_OPERATOR(hierarchical_sigmoid, ops::HierarchicalSigmoidOp, + ops::HierarchicalSigmoidOpMaker, + ops::HierarchicalSigmoidGradMaker, + ops::HierarchicalSigmoidGradMaker, + HierarchicalSigmoidInferShapeFunctor); REGISTER_OPERATOR(hierarchical_sigmoid_grad, ops::HierarchicalSigmoidGradOp, ops::HierarchicalSigmoidGradOpGradVarTypeInference, ops::HierarchicalSigmoidGradOpNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL( - hierarchical_sigmoid, - ops::HierarchicalSigmoidOpKernel, - ops::HierarchicalSigmoidOpKernel); -REGISTER_OP_CPU_KERNEL( - hierarchical_sigmoid_grad, - ops::HierarchicalSigmoidGradOpKernel, - ops::HierarchicalSigmoidGradOpKernel); diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h deleted file mode 100644 index f11b28cfefb071182eb99cce3d8c2b7f2343cdf6..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ /dev/null @@ -1,222 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/mixed_vector.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/clip_op.h" -#include "paddle/fluid/operators/math/matrix_bit_code.h" -#include "paddle/fluid/platform/transform.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -using EigenMatrix = framework::EigenMatrix; -using platform::Transform; -using framework::LoDTensor; - -static std::vector PathToRows(const LoDTensor& path) { - std::set rows; - const int64_t* paths = path.data(); - for (int64_t i = 0; i < path.numel(); ++i) { - int64_t row = paths[i]; - if (row < 0) { - continue; - } - rows.emplace(row); - } - return std::vector(rows.begin(), rows.end()); -} -template -class HierarchicalSigmoidOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& in = GET_DATA_SAFELY(ctx.Input("X"), "Input", "X", - "HierarchicalSigmoid"); - auto& w = GET_DATA_SAFELY(ctx.Input("W"), "Input", "W", - "HierarchicalSigmoid"); - auto* path = ctx.Input("PathTable"); - auto* code = ctx.Input("PathCode"); - auto& label = GET_DATA_SAFELY(ctx.Input("Label"), "Input", - "Label", "HierarchicalSigmoid"); - auto* bias = ctx.Input("Bias"); - auto* out = ctx.Output("Out"); - auto* pre_out = ctx.Output("PreOut"); - size_t num_classes = static_cast(ctx.Attr("num_classes")); - // for remote prefetch - - bool is_custom = false; - if (path) { - is_custom = true; - } - int64_t code_length = - path ? path->dims()[1] : math::FindLastSet(num_classes - 1); - int64_t batch_size = in.dims()[0]; - LoDTensor sum; - auto& dev_ctx = ctx.template device_context(); - auto* pre_out_data = pre_out->mutable_data( - phi::make_ddim({batch_size, code_length}), ctx.GetPlace()); - auto pre_out_mat = EigenMatrix::From(*pre_out); - // Not all class(leaf) nodes' path lengths equal code_length, thus init as - // 0s can avoid out of path's loss. - phi::funcs::SetConstant zero; - zero(dev_ctx, pre_out, static_cast(0.0)); - auto& place = *ctx.template device_context().eigen_device(); - phi::funcs::RowwiseSum row_sum; - - std::unique_ptr> bit_code; - if (!is_custom) { - bit_code.reset(new math::MatrixBitCodeFunctor( - num_classes, label.template data())); - } else { - bit_code.reset(new math::MatrixBitCodeFunctor( - *path, *code, label.template data())); - } - - std::vector sum_dims({batch_size, 1UL}); - sum.mutable_data(phi::make_ddim(sum_dims), ctx.GetPlace()); - auto sum_mat = EigenMatrix::From(sum); - out->mutable_data(ctx.GetPlace()); - auto out_mat = framework::EigenMatrix::From(*out); - if (bias) { - bit_code->Add(*bias, pre_out); - } - bit_code->Mul(pre_out, w, in); - // clip to [-40, 40] - Transform trans; - trans(ctx.template device_context(), pre_out_data, - pre_out_data + pre_out->numel(), pre_out_data, - ClipFunctor(static_cast(-40.0), static_cast(40.0))); - bit_code->Sum(*pre_out, out, static_cast(-1)); - // use softrelu to calculate cross entropy - pre_out_mat.device(place) = (static_cast(1.0) + pre_out_mat.exp()).log(); - row_sum(dev_ctx, *pre_out, &sum); - // TODO(guosheng): Subtract the out of path's loss, since not all - // class(leaf) nodes' path lengths equal code_length. But it won't break the - // gradient check since both have the out of path's loss and will cancel out - // each other. 
- out_mat.device(place) = sum_mat + out_mat; - } -}; - -template -class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& in = GET_DATA_SAFELY(ctx.Input("X"), "Input", "X", - "HierarchicalSigmoidGrad"); - auto& w = GET_DATA_SAFELY(ctx.Input("W"), "Input", "W", - "HierarchicalSigmoidGrad"); - auto* path = ctx.Input("PathTable"); - auto* code = ctx.Input("PathCode"); - auto* in_grad = ctx.Output(framework::GradVarName("X")); - bool is_sparse = ctx.Attr("is_sparse"); - auto& dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; - auto& label = GET_DATA_SAFELY(ctx.Input("Label"), "Input", - "Label", "HierarchicalSigmoidGrad"); - auto& pre_out = GET_DATA_SAFELY(ctx.Input("PreOut"), "Input", - "PreOut", "HierarchicalSigmoidGrad"); - auto& out_grad = GET_DATA_SAFELY( - ctx.Input(framework::GradVarName("Out")), "Input", - framework::GradVarName("Out"), "HierarchicalSigmoidGrad"); - LoDTensor pre_out_grad; - - pre_out_grad.mutable_data(pre_out.dims(), ctx.GetPlace()); - in_grad->mutable_data(ctx.GetPlace()); - zero(dev_ctx, in_grad, static_cast(0.0)); - - size_t num_classes = static_cast(ctx.Attr("num_classes")); - - bool is_custom = false; - if (path) { - is_custom = true; - } - - std::unique_ptr> bit_code; - if (!is_custom) { - bit_code.reset(new math::MatrixBitCodeFunctor( - num_classes, label.template data())); - } else { - bit_code.reset(new math::MatrixBitCodeFunctor( - *path, *code, label.template data())); - } - - // softrelu derivative - - auto blas = phi::funcs::GetBlas(ctx); - - auto* pre_out_grad_data = pre_out_grad.data(); - auto* pre_out_data = pre_out.template data(); - auto n = pre_out.numel(); - blas.VEXP(n, pre_out_data, pre_out_grad_data); - blas.VINV(n, pre_out_grad_data, pre_out_grad_data); - for (int64_t i = 0; i < n; ++i) { - pre_out_grad_data[i] = 1.0 - pre_out_grad_data[i]; - } - bit_code->Sub(&pre_out_grad); // the gradient of clip(w * x + b) - auto* out_grad_data = out_grad.template data(); - - int64_t dim0 = pre_out_grad.dims()[0]; - int64_t dim1 = pre_out_grad.dims()[1]; - for (int64_t i = 0; i < dim0; ++i) { - T tmp = out_grad_data[i]; - blas.SCAL(dim1, tmp, pre_out_grad_data + i * dim1); - } - // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to - // be consistent with the clipping in forward. 
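// [Editor's illustrative sketch - not part of this patch, hypothetical helper name.]
// The VEXP/VINV calls plus the "1.0 - x" loop above recover the softrelu
// derivative from the saved forward activation: with s = log(1 + exp(z))
// stored in PreOut, ds/dz = sigmoid(z) = 1 - exp(-s). A minimal standalone
// version of that step, using plain std::exp instead of the BLAS calls:
#include <cmath>
#include <vector>
// Given s[i] = log(1 + exp(z[i])), return the per-element factors 1 - exp(-s[i]).
std::vector<double> SoftreluGradFromSaved(const std::vector<double>& s) {
  std::vector<double> g;
  g.reserve(s.size());
  for (double v : s) {
    g.push_back(1.0 - 1.0 / std::exp(v));  // == sigmoid(z) when v = log(1 + exp(z))
  }
  return g;
}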
- auto* bias_grad = ctx.Output(framework::GradVarName("Bias")); - if (bias_grad) { - bias_grad->mutable_data(ctx.GetPlace()); - zero(dev_ctx, bias_grad, static_cast(0.0)); - bit_code->AddGrad(pre_out_grad, bias_grad); - } - if (!is_sparse) { - auto* w_grad = ctx.Output(framework::GradVarName("W")); - w_grad->mutable_data(ctx.GetPlace()); - zero(dev_ctx, w_grad, static_cast(0.0)); - bit_code->MulGradWeight(pre_out_grad, w_grad, in); - } else { - PADDLE_ENFORCE_NOT_NULL(path, - platform::errors::NotFound( - "Custom tree must be set for sparse mode!")); - framework::Vector real_rows = PathToRows(*path); - auto* w_grad = ctx.Output(framework::GradVarName("W")); - w_grad->set_rows(real_rows); - // Build a map of id -> row_index to speed up finding the index of one id - w_grad->set_height(w.dims()[0]); - auto* w_grad_value = w_grad->mutable_value(); - framework::DDim temp_dim(w.dims()); - temp_dim[0] = real_rows.size(); - w_grad_value->mutable_data(temp_dim, ctx.GetPlace()); - zero(dev_ctx, w_grad_value, static_cast(0.0)); - bit_code->MulGradWeight(pre_out_grad, w_grad, in); - } - bit_code->MulGradError(pre_out_grad, w, in_grad); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/histogram_op.cc b/paddle/fluid/operators/histogram_op.cc index 92cc6077defcd3f2b27c1b45875014742bc792ae..c9fd75651b5892beffa3b2aad7c21a0805facfce 100644 --- a/paddle/fluid/operators/histogram_op.cc +++ b/paddle/fluid/operators/histogram_op.cc @@ -16,7 +16,9 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -28,27 +30,6 @@ class HistogramOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "histogram"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "histogram"); - const auto &nbins = ctx->Attrs().Get("bins"); - const auto &minval = ctx->Attrs().Get("min"); - const auto &maxval = ctx->Attrs().Get("max"); - - PADDLE_ENFORCE_GE(nbins, 1, - platform::errors::InvalidArgument( - "The bins should be greater than or equal to 1." - "But received nbins is %d", - nbins)); - PADDLE_ENFORCE_GE(maxval, minval, platform::errors::InvalidArgument( - "max must be larger or equal to min." 
- "But received max is %d, min is %d", - maxval, minval)); - - ctx->SetOutputDim("Out", phi::make_ddim({nbins})); - ctx->ShareLoD("X", /*->*/ "Out"); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); @@ -81,7 +62,12 @@ class HistogramOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(histogram, HistogramInferShapeFunctor, + PD_INFER_META(phi::HistogramInferMeta)); + REGISTER_OPERATOR( histogram, ops::HistogramOp, ops::HistogramOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + HistogramInferShapeFunctor); diff --git a/paddle/fluid/operators/index_select_op.cc b/paddle/fluid/operators/index_select_op.cc index fea71edf41313f9a93c3a2a0311d0db69db3b41c..069cc9416a620cec987f6463841ecd677db8c7b4 100644 --- a/paddle/fluid/operators/index_select_op.cc +++ b/paddle/fluid/operators/index_select_op.cc @@ -13,8 +13,13 @@ // limitations under the License. #include "paddle/fluid/operators/index_select_op.h" + #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { @@ -24,52 +29,6 @@ class IndexSelectOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of IndexSelectOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true, - platform::errors::InvalidArgument( - "Input(Index) of IndexSelectOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of IndexSelectOp should not be null.")); - - auto input_dim = ctx->GetInputDim("X"); - auto index_dim = ctx->GetInputDim("Index"); - auto dim = ctx->Attrs().Get("dim"); - - PADDLE_ENFORCE_EQ( - dim < input_dim.size() && dim >= (0 - input_dim.size()), true, - platform::errors::OutOfRange( - "Attr(dim) is out of range, It's expected " - "to be in range of [-%d, %d]. But received Attr(dim) = %d.", - input_dim.size(), input_dim.size() - 1, dim)); - - PADDLE_ENFORCE_EQ( - index_dim.size() == 1 || (index_dim.size() == 2 && index_dim[1] == 1), - true, platform::errors::InvalidArgument( - "The 'shape' of Input(Index) must be 1-D tensor. 
" - "But received: the 'shape' of Input(Index) is [%s], " - "the dimension of Input(Index) is [%d].", - index_dim, index_dim.size())); - - PADDLE_ENFORCE_EQ(index_dim[0] != 0, true, - platform::errors::InvalidArgument( - "The length of Input(Index) can't be 0.")); - - auto output_dim = phi::vectorize(input_dim); - if (dim < 0) { - dim += input_dim.size(); - } - output_dim[dim] = index_dim[0]; - ctx->SetOutputDim("Out", phi::make_ddim(output_dim)); - auto type = ctx->GetInputsVarType("X")[0]; - if (type == framework::proto::VarType::LOD_TENSOR) { - ctx->ShareLoD("X", /*->*/ "Out"); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -148,20 +107,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(IndexSelectGradNoNeedBufferVarsInferer, } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(index_select, IndexSelectInferShapeFunctor, + PD_INFER_META(phi::IndexSelectInferMeta)); REGISTER_OPERATOR(index_select, ops::IndexSelectOp, ops::IndexSelectOpMaker, ops::IndexSelectGradMaker, - ops::IndexSelectGradMaker); + ops::IndexSelectGradMaker, + IndexSelectInferShapeFunctor); REGISTER_OPERATOR(index_select_grad, ops::IndexSelectGradOp, ops::IndexSelectGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - index_select, - ops::IndexSelectKernel, - ops::IndexSelectKernel, - ops::IndexSelectKernel, - ops::IndexSelectKernel); -REGISTER_OP_CPU_KERNEL( - index_select_grad, - ops::IndexSelectGradKernel, - ops::IndexSelectGradKernel, - ops::IndexSelectGradKernel, - ops::IndexSelectGradKernel); diff --git a/paddle/fluid/operators/index_select_op.cu b/paddle/fluid/operators/index_select_op.cu deleted file mode 100644 index f810aee2adea540f1ffb6999ce38380ee05d0901..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/index_select_op.cu +++ /dev/null @@ -1,209 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/index_select_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using platform::PADDLE_CUDA_NUM_THREADS; -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -template -__global__ void index_select_cuda_kernel(const T* input, T* output, - const IndexT* index, int64_t N, - int64_t stride, int64_t size, - int64_t delta) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N) { - return; - } - - int64_t pre_idx = idx / (stride * size); - int64_t dim_idx = idx % (stride * size) / stride; - IndexT src_dim_idx = index[dim_idx]; - int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride; - output[idx] = input[input_idx]; -} - -template -__global__ void index_select_grad_cuda_kernel(const T* output_grad, - T* input_grad, - const IndexT* index, int64_t nums, - int64_t N, int64_t stride, - int64_t size, int64_t delta) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N) { - return; - } - - int64_t pre_idx = idx / (stride * size); - int64_t dim_idx = idx % (stride * size) / stride; - IndexT src_dim_idx = index[dim_idx]; - int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride; - paddle::platform::CudaAtomicAdd(&input_grad[input_idx], output_grad[idx]); -} - -template -__global__ void index_select_grad_init(T* input_grad, int64_t N) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N) { - return; - } - input_grad[idx] = 0.0; -} - -template -class IndexSelectCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* index = context.Input("Index"); - auto* out = context.Output("Out"); - int dim = context.Attr("dim"); - auto input_dim = in->dims(); - auto output_dim = out->dims(); - dim = dim >= 0 ? 
dim : dim + input_dim.size(); - auto stride_dim = phi::stride(input_dim); - int64_t stride = stride_dim[dim]; - int64_t size = output_dim[dim]; - int64_t delta = input_dim[dim] - size; - - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT64 || - index_type == framework::proto::VarType::INT32; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - auto* in_data = in->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - int64_t numel = out->numel(); - - auto stream = - context.template device_context().stream(); - - if (index_type == framework::proto::VarType::INT64) { - const int64_t* index_data = index->data(); - index_select_cuda_kernel<<< - (numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, index_data, - numel, stride, size, delta); - platform::GpuStreamSync(stream); - } else { - const int* index_data = index->data(); - index_select_cuda_kernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - in_data, out_data, index_data, numel, stride, size, delta); - platform::GpuStreamSync(stream); - } - } -}; - -template -class IndexSelectGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* output_grad = context.Input(framework::GradVarName("Out")); - auto* in_grad = context.Output(framework::GradVarName("X")); - auto* index = context.Input("Index"); - - auto* output_grad_data = output_grad->data(); - auto* in_grad_data = in_grad->mutable_data(context.GetPlace()); - - int dim = context.Attr("dim"); - auto input_dim = in_grad->dims(); - auto output_dim = output_grad->dims(); - dim = dim >= 0 ? 
dim : dim + input_dim.size(); - auto stride_dim = phi::stride(input_dim); - int64_t stride = stride_dim[dim]; - int64_t size = output_dim[dim]; - int64_t delta = input_dim[dim] - size; - - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT64 || - index_type == framework::proto::VarType::INT32; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - int64_t numel = in_grad->numel(); - int64_t index_nums = index->numel(); - int64_t out_nums = output_grad->numel(); - - auto stream = - context.template device_context().stream(); - - index_select_grad_init< - T><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_grad_data, numel); - - if (index_type == framework::proto::VarType::INT64) { - const int64_t* index_data = index->data(); - index_select_grad_cuda_kernel<<< - (out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(output_grad_data, in_grad_data, - index_data, index_nums, - out_nums, stride, size, delta); - platform::GpuStreamSync(stream); - } else { - const int* index_data = index->data(); - index_select_grad_cuda_kernel<<< - (out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(output_grad_data, in_grad_data, - index_data, index_nums, - out_nums, stride, size, delta); - platform::GpuStreamSync(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - index_select, - ops::IndexSelectCUDAKernel, - ops::IndexSelectCUDAKernel, - ops::IndexSelectCUDAKernel, - ops::IndexSelectCUDAKernel, - ops::IndexSelectCUDAKernel); -REGISTER_OP_CUDA_KERNEL( - index_select_grad, - ops::IndexSelectGradCUDAKernel, - ops::IndexSelectGradCUDAKernel, - ops::IndexSelectGradCUDAKernel, - ops::IndexSelectGradCUDAKernel, - ops::IndexSelectGradCUDAKernel); diff --git a/paddle/fluid/operators/index_select_op.h b/paddle/fluid/operators/index_select_op.h index 04b4f69add78513bf716ab03d3bc2ba86dfbad2d..684829be2697cdc1676e8b80e15b2d600d922f3b 100644 --- a/paddle/fluid/operators/index_select_op.h +++ b/paddle/fluid/operators/index_select_op.h @@ -91,41 +91,6 @@ void IndexSelectInner(const framework::ExecutionContext& context, output->Resize(output_dim); } -template -class IndexSelectKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto inputs = *context.Input("X"); - auto* index = context.Input("Index"); - auto* output = context.Output("Out"); - - int dim = context.Attr("dim"); - if (dim < 0) { - dim += inputs.dims().size(); - } - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - 
framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - if (index_type == framework::proto::VarType::INT32) { - IndexSelectInner(context, &inputs, *index, output, - dim); - } else if (index_type == framework::proto::VarType::INT64) { - IndexSelectInner(context, &inputs, *index, - output, dim); - } - } -}; - template struct IndexSelectAdd { void operator()(const framework::ExecutionContext& ctx, int slice_size, @@ -197,43 +162,5 @@ void IndexSelectGradInner(const framework::ExecutionContext& context, x_grad->Resize(output_dim); } -template -class IndexSelectGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x_grad = - context.Output(framework::GradVarName("X")); - auto* index = context.Input("Index"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - - int dim = context.Attr("dim"); - if (dim < 0) { - dim += out_grad->dims().size(); - } - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - if (index_type == framework::proto::VarType::INT32) { - IndexSelectGradInner(context, *out_grad, *index, - x_grad, dim); - } else if (index_type == framework::proto::VarType::INT64) { - IndexSelectGradInner(context, *out_grad, - *index, x_grad, dim); - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/index_select_op_npu.cc b/paddle/fluid/operators/index_select_op_npu.cc index bce7a3c1caae39d21c9324b0f927401317284cc5..a232fba7e28d68c2df8394caa6bc5d93397f1f37 100644 --- a/paddle/fluid/operators/index_select_op_npu.cc +++ b/paddle/fluid/operators/index_select_op_npu.cc @@ -12,12 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/index_select_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template class IndexSelectNPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc index 7f5136969980b887bb7bbe013690898e66abeac1..77951ff394e7491569746c89ac45826f23fdf313 100644 --- a/paddle/fluid/operators/inplace_abn_op.cc +++ b/paddle/fluid/operators/inplace_abn_op.cc @@ -323,6 +323,7 @@ class InplaceABNGradKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; + REGISTER_OPERATOR(inplace_abn, ops::InplaceABNOp, ops::InplaceABNOpMaker, ops::BatchNormOpInferVarType, ops::InplaceABNOpGradMaker, diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu index d61eb46d97e98972963f5871a4c6e7b06468337c..cd297c53f89a0f7efc622de7c385b9f75dc7462b 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cu +++ b/paddle/fluid/operators/interpolate_v2_op.cu @@ -61,13 +61,13 @@ inline platform::GpuLaunchConfig GetGpuLaunchConfig3D( template __forceinline__ __device__ void PreCalculatorForLinearInterpInputIndex( - int* in_img_idx, int* w_id, T* w1lambda, T* w2lambda, T src_w, - const int in_img_w) { - src_w = (src_w > 0) ? src_w : 0.f; - *in_img_idx = static_cast(src_w); - *w_id = (*in_img_idx < in_img_w - 1) ? 1 : 0; - *w1lambda = src_w - *in_img_idx; - *w2lambda = 1.f - *w1lambda; + int* in_img_idx, int* x_id, T* lambda1, T* lambda2, T src_x, + const int in_img_x) { + src_x = (src_x > 0) ? src_x : 0.f; + *in_img_idx = static_cast(src_x); + *x_id = (*in_img_idx < in_img_x - 1) ? 
1 : 0; + *lambda1 = src_x - *in_img_idx; + *lambda2 = 1.f - *lambda1; } struct FastDivModForInterpolate { @@ -670,83 +670,102 @@ __global__ void KeBilinearInterpBwShareMemory( } } +__device__ __forceinline__ int GetInputIndex(const size_t nc, const int height, + const int width, const int h, + const int w) { + return (nc * height + h) * width + w; +} + +template +__global__ void KeBilinearInterpNCHWBw(T* in, const int in_h, const int in_w, + const int out_h, const int out_w, + const int n, const int num_channels, + float ratio_h, float ratio_w, + const T* __restrict__ out, + const T align_type_value) { + int index = threadIdx.x + blockDim.x * blockIdx.x; + int stride = blockDim.x * gridDim.x; + int num_out = n * num_channels * out_h * out_w; + int num_in = n * num_channels * in_h * in_w; + + for (; index < num_out; index += stride) { + int index_tmp = index; + int w2 = index_tmp % out_w; + index_tmp /= out_w; + int h2 = index_tmp % out_h; + int nc = index_tmp / out_h; + + int h1, y_id; + T h1lambda, h0lambda; + T src_y = ratio_h * (h2 + align_type_value) - align_type_value; + + PreCalculatorForLinearInterpInputIndex(&h1, &y_id, &h1lambda, &h0lambda, + src_y, in_h); + int w1, x_id; + T w1lambda, w0lambda; + T src_x = ratio_w * (w2 + align_type_value) - align_type_value; + PreCalculatorForLinearInterpInputIndex(&w1, &x_id, &w1lambda, &w0lambda, + src_x, in_w); + + T d2val = out[index]; + + platform::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1, w1), + h0lambda * w0lambda * d2val); + platform::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1, w1 + x_id), + h0lambda * w1lambda * d2val); + platform::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1 + y_id, w1), + h1lambda * w0lambda * d2val); + platform::CudaAtomicAdd( + in + GetInputIndex(nc, in_h, in_w, h1 + y_id, w1 + x_id), + h1lambda * w1lambda * d2val); + } +} + template __global__ void KeBilinearInterpBw(T* in, const int in_h, const int in_w, const T* __restrict__ out, const int out_h, const int out_w, const int n, - const int num_channels, float ratio_h, - float ratio_w, const T align_type_value, - bool is_nchw) { + const int out_chw, const int num_channels, + float ratio_h, float ratio_w, + const T align_type_value, + FastDivModForInterpolate divmods) { int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; int in_chw = in_h * in_w * num_channels; - int out_chw = num_channels * out_h * out_w; int nthreads = n * out_chw; - if (is_nchw) { - for (; tid < nthreads; tid += stride) { - int out_id_h = tid / out_chw; - int out_id_w = tid % out_chw; - const int in_img_size = in_h * in_w; - const int out_img_size = out_h * out_w; - T value = out[out_id_h * out_chw + out_id_w]; - - int channel_id = out_id_w / out_img_size; - int out_img_idy = (out_id_w % out_img_size) / out_w; - int out_img_idx = tid % out_w; - int in_img_idx, in_img_idy, w_id, h_id; - T w1lambda, h1lambda, w2lambda, h2lambda; - - T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; - T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; - - PreCalculatorForLinearInterpInputIndex(&in_img_idx, &w_id, &w1lambda, - &w2lambda, src_w, in_w); - PreCalculatorForLinearInterpInputIndex(&in_img_idy, &h_id, &h1lambda, - &h2lambda, src_h, in_h); - - T* in_pos = &in[out_id_h * in_chw + channel_id * in_img_size + - in_img_idy * in_w + in_img_idx]; - platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * value); - platform::CudaAtomicAdd(&in_pos[w_id], h2lambda * w1lambda * value); - 
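// [Editor's illustrative sketch - not part of this patch, hypothetical names.]
// Both the NCHW branch being removed here and the new KeBilinearInterpNCHWBw
// kernel added above derive the four bilinear scatter weights through
// PreCalculatorForLinearInterpInputIndex. A standalone CPU version of that
// per-axis index/weight computation:
#include <algorithm>
struct LinearInterpIndex {
  int idx;        // lower input index along the axis
  int next;       // 0 or 1: offset of the second neighbour (0 at the border)
  float lambda1;  // weight of the upper neighbour
  float lambda2;  // weight of the lower neighbour
};
LinearInterpIndex ComputeLinearInterpIndex(int out_i, float ratio,
                                           float align_offset, int in_size) {
  float src = ratio * (out_i + align_offset) - align_offset;
  src = std::max(src, 0.0f);  // clamp like (src > 0) ? src : 0.f
  LinearInterpIndex r;
  r.idx = static_cast<int>(src);
  r.next = (r.idx < in_size - 1) ? 1 : 0;
  r.lambda1 = src - r.idx;
  r.lambda2 = 1.0f - r.lambda1;
  return r;
}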
platform::CudaAtomicAdd(&in_pos[h_id * in_w], - h1lambda * w2lambda * value); - platform::CudaAtomicAdd(&in_pos[h_id * in_w + w_id], - h1lambda * w1lambda * value); - } - } else { - for (; tid < nthreads; tid += stride) { - int out_id_h = tid / out_chw; - int out_id_w = tid % out_chw; - const int in_img_size = in_h * in_w; - const int out_img_size = out_h * out_w; - T value = out[out_id_h * out_chw + out_id_w]; - - int out_img_idy = out_id_w / (out_w * num_channels); - int out_img_idx = out_id_w % (out_w * num_channels) / num_channels; - int channel_id = tid % num_channels; - - int in_img_idx, in_img_idy, w_id, h_id; - T w1lambda, h1lambda, w2lambda, h2lambda; - T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; - T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; - - PreCalculatorForLinearInterpInputIndex(&in_img_idx, &w_id, &w1lambda, - &w2lambda, src_w, in_w); - PreCalculatorForLinearInterpInputIndex(&in_img_idy, &h_id, &h1lambda, - &h2lambda, src_h, in_h); - - T* in_pos = &in[out_id_h * in_chw + in_img_idy * in_w * num_channels + - in_img_idx * num_channels + channel_id]; - platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * value); - platform::CudaAtomicAdd(&in_pos[w_id * num_channels], - h2lambda * w1lambda * value); - platform::CudaAtomicAdd(&in_pos[h_id * in_w * num_channels], - h1lambda * w2lambda * value); - platform::CudaAtomicAdd( - &in_pos[h_id * in_w * num_channels + w_id * num_channels], - h1lambda * w1lambda * value); - } + for (; tid < nthreads; tid += stride) { + auto out_id_divmod = divmods.output_w_div.Divmod(tid); + int out_id_h = out_id_divmod.val[0]; + int out_id_w = out_id_divmod.val[1]; + + int channel_id = divmods.channels_div.Divmod(tid).val[1]; + auto outimg_id_divmod = divmods.output_wc_div.Divmod(out_id_w); + int out_img_idy = outimg_id_divmod.val[0]; + int out_img_idx = + divmods.channels_div.Divmod(outimg_id_divmod.val[1]).val[0]; + + int in_img_idx, in_img_idy, w_id, h_id; + T w1lambda, h1lambda, w2lambda, h2lambda; + T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; + T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; + + PreCalculatorForLinearInterpInputIndex(&in_img_idx, &w_id, &w1lambda, + &w2lambda, src_w, in_w); + PreCalculatorForLinearInterpInputIndex(&in_img_idy, &h_id, &h1lambda, + &h2lambda, src_h, in_h); + + T value = out[tid]; + T* in_pos = &in[out_id_h * in_chw + in_img_idy * in_w * num_channels + + in_img_idx * num_channels + channel_id]; + platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * value); + platform::CudaAtomicAdd(&in_pos[w_id * num_channels], + h2lambda * w1lambda * value); + platform::CudaAtomicAdd(&in_pos[h_id * in_w * num_channels], + h1lambda * w2lambda * value); + platform::CudaAtomicAdd( + &in_pos[h_id * in_w * num_channels + w_id * num_channels], + h1lambda * w1lambda * value); } } @@ -1907,11 +1926,23 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, ctx.cuda_device_context().stream()>>>( input_grad_data, in_h, in_w, output_grad_data, out_h, out_w, n, c, ratio_h, ratio_w, align_type_value, is_nchw); + } else if (!optimize_flag & is_nchw) { + // + const int num_kernels = n * c * out_h * out_w; + const int num_threads = + std::min(ctx.cuda_device_context().GetMaxThreadsPerBlock(), 1024); + KeBilinearInterpNCHWBw< + T><<>>( + input_grad_data, in_h, in_w, out_h, out_w, n, c, ratio_h, ratio_w, + output_grad_data, align_type_value); } else { + int64_t cw = c * out_w; + auto interp_divmods = 
FastDivModForInterpolate(c, out_chw, cw); KeBilinearInterpBw<<>>( - input_grad_data, in_h, in_w, output_grad_data, out_h, out_w, n, c, - ratio_h, ratio_w, align_type_value, is_nchw); + input_grad_data, in_h, in_w, output_grad_data, out_h, out_w, n, + out_chw, c, ratio_h, ratio_w, align_type_value, interp_divmods); } } else if ("bicubic" == interp_method) { #ifdef __HIPCC__ diff --git a/paddle/fluid/operators/isclose_op.cc b/paddle/fluid/operators/isclose_op.cc index 0ae7a9fa02f1fb217555ae41d8b25cbba0e43d19..1c79213757fdfa8d9ef0d7c7ab315d03f94b0c57 100644 --- a/paddle/fluid/operators/isclose_op.cc +++ b/paddle/fluid/operators/isclose_op.cc @@ -12,56 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/isclose_op.h" #include #include + +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { -template -struct GetTensorValue { - T operator()(const platform::CPUDeviceContext& dev_ctx, - const framework::Tensor& tensor) const { - return *(tensor.data()); - } -}; - -template -struct IscloseFunctor { - void operator()(const platform::CPUDeviceContext& ctx, - const framework::Tensor& in, const framework::Tensor& other, - const double rtol, const double atol, bool equal_nan, - framework::Tensor* output) { - auto* in_a = in.data(); - auto* in_b = other.data(); - auto* out_data = output->mutable_data(ctx.GetPlace()); - auto num = in.numel(); - // *out_data = true; - for (int i = 0; i < num; i++) { - out_data[i] = true; - } - for (int i = 0; i < num; i++) { - const T a = in_a[i], b = in_b[i]; - bool val; - if (std::isnan(a) || std::isnan(b)) { - val = equal_nan && std::isnan(a) == std::isnan(b); - } else { - T left = (a > b ? a - b : b - a); - T right = atol + (b > 0 ? rtol * b : (-rtol) * b); - T diff = (left > right ? 
left - right : right - left); - val = a == b || left <= right || diff <= 1e-15; - } - // *out_data &= val; - out_data[i] = val; - } - } -}; - class IscloseOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -100,40 +63,6 @@ class IscloseOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Isclose"); - OP_INOUT_CHECK(ctx->HasInput("Other"), "Input", "Other", "Isclose"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Isclose"); - - auto input_dim = ctx->GetInputDim("Input"); - auto other_dim = ctx->GetInputDim("Other"); - PADDLE_ENFORCE_EQ(input_dim.size(), other_dim.size(), - platform::errors::PreconditionNotMet( - "Input(Input) and Input(Other) must have the same " - "dimension size.")); - int n = input_dim.size(); - bool is_runtime = ctx->IsRuntime(); - for (int i = 0; i < n; i++) { - if (is_runtime) { - PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i], - platform::errors::PreconditionNotMet( - "The value at dim %d of Input(Input) is not " - "equal to the Input(Other): %ld != %ld.", - i, input_dim[i], other_dim[i])); - } else { - if (!(input_dim[i] < 0 || other_dim[i] < 0)) { - PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i], - platform::errors::PreconditionNotMet( - "The value at dim %d of Input(Input) is not " - "equal to the Input(Other): %ld != %ld.", - i, input_dim[i], other_dim[i])); - } - } - } - - ctx->SetOutputDim("Out", input_dim); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -154,12 +83,11 @@ class IscloseOpVarTypeInference : public framework::VarTypeInference { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +DECLARE_INFER_SHAPE_FUNCTOR(isclose, IscloseInferShapeFunctor, + PD_INFER_META(phi::ValueCompareInferMeta)); REGISTER_OPERATOR( isclose, ops::IscloseOp, ops::IscloseOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker, - ops::IscloseOpVarTypeInference); -REGISTER_OP_CPU_KERNEL(isclose, ops::IscloseKernel, - ops::IscloseKernel); + ops::IscloseOpVarTypeInference, IscloseInferShapeFunctor); diff --git a/paddle/fluid/operators/isclose_op.cu b/paddle/fluid/operators/isclose_op.cu deleted file mode 100644 index 09710ba0c6957d39318abfc24113d4b9db11622d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/isclose_op.cu +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
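[Editor's note] The CPU functor removed above and the CUDA kernel removed below implement the same element-wise comparison. A minimal standalone restatement of that per-element logic (hypothetical helper name, double precision only), for reference:

    #include <cmath>
    // NaNs compare equal only when equal_nan is set; otherwise |a - b| must be
    // within atol + rtol * |b| (signs handled exactly as in the deleted code).
    bool IsCloseScalar(double a, double b, double rtol, double atol,
                       bool equal_nan) {
      if (std::isnan(a) || std::isnan(b)) {
        return equal_nan && (std::isnan(a) == std::isnan(b));
      }
      double left = (a > b) ? a - b : b - a;                    // |a - b|
      double right = atol + ((b > 0) ? rtol * b : -rtol * b);   // atol + rtol * |b|
      double diff = (left > right) ? left - right : right - left;
      return a == b || left <= right || diff <= 1e-15;
    }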
- -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/isclose_op.h" - -namespace paddle { -namespace operators { - -template -struct GetTensorValue { - T operator()(const platform::CUDADeviceContext& dev_ctx, - const framework::Tensor& tensor) const { - const T* data = tensor.data(); - T value; - const auto gpu_place = dev_ctx.GetPlace(); - memory::Copy(platform::CPUPlace(), &value, gpu_place, data, sizeof(T), - dev_ctx.stream()); - return value; - } -}; - -template -__global__ void IscloseCUDAKernel(const T* in_data, const T* other_data, - const double rtol, const double atol, - bool equal_nan, int num, bool* out_data) { - unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; - bool val; - for (int i = idx; i < num; i += blockDim.x * gridDim.x) { - const T a = in_data[i], b = other_data[i]; - if (isnan(a) || isnan(b)) { - val = equal_nan && isnan(a) == isnan(b); - } else { - T left = (a > b ? a - b : b - a); - T right = atol + (b > 0 ? rtol * b : (-rtol) * b); - T diff = (left > right ? left - right : right - left); - val = a == b || left <= right || diff <= 1e-15; - } - out_data[i] = val; - // if (!val) *out_data = false; - } -} - -template -struct IscloseFunctor { - void operator()(const platform::CUDADeviceContext& dev_ctx, - const framework::Tensor& in, const framework::Tensor& other, - const double rtol, const double atol, bool equal_nan, - framework::Tensor* output) { - int num = in.numel(); - const T* in_data = in.data(); - const T* other_data = other.data(); - bool* out_data = output->mutable_data(dev_ctx.GetPlace()); - int block = 1024; - int grid = (block - 1 + num) / block; - grid = (grid > block) ? block : grid; -#ifdef PADDLE_WITH_HIP - hipMemset(out_data, true, num * sizeof(bool)); -#else - cudaMemset(out_data, true, num * sizeof(bool)); -#endif - IscloseCUDAKernel<<>>( - in_data, other_data, rtol, atol, equal_nan, num, out_data); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; -REGISTER_OP_CUDA_KERNEL(isclose, ops::IscloseKernel, - ops::IscloseKernel); diff --git a/paddle/fluid/operators/isclose_op.h b/paddle/fluid/operators/isclose_op.h deleted file mode 100644 index cde5d2afbf009a16a3d0c3601697703d8ec8eb7d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/isclose_op.h +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { -using Tensor = framework::Tensor; - -template -struct GetTensorValue { - T operator()(const platform::DeviceContext& ctx, - const framework::Tensor& tensor) const; -}; - -template -struct IscloseFunctor { - void operator()(const DeviceContext& ctx, const framework::Tensor& in, - const framework::Tensor& other, const float rtol, - const float atol, bool equal_nan, framework::Tensor* output); -}; - -template -class IscloseKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - // get attrs - bool equal_nan = ctx.Attr("equal_nan"); - // get input/output - const auto* input = ctx.Input("Input"); - const auto* other = ctx.Input("Other"); - auto* out = ctx.Output("Out"); - - double rtol_v = std::stod(ctx.Attr("rtol")); - double atol_v = std::stod(ctx.Attr("atol")); - - auto& dev_ctx = ctx.template device_context(); - GetTensorValue get_tensor_value; - if (ctx.HasInput("Rtol")) { - const auto* rtol = ctx.Input("Rtol"); - PADDLE_ENFORCE_EQ( - rtol->numel(), 1, - platform::errors::InvalidArgument( - "Input(Rtol) size must be 1, but get %d.", rtol->numel())); - PADDLE_ENFORCE_EQ( - framework::TransToProtoVarType(rtol->dtype()), - framework::proto::VarType::FP64, - platform::errors::InvalidArgument( - "Input(Rtol) type must be double, but get %s.", - framework::DataTypeToString( - framework::TransToProtoVarType(rtol->dtype())))); - rtol_v = get_tensor_value(dev_ctx, *rtol); - } - if (ctx.HasInput("Atol")) { - const auto* atol = ctx.Input("Atol"); - PADDLE_ENFORCE_EQ( - atol->numel(), 1, - platform::errors::InvalidArgument( - "Input(Atol) size must be 1, but get %d", atol->numel())); - PADDLE_ENFORCE_EQ( - framework::TransToProtoVarType(atol->dtype()), - framework::proto::VarType::FP64, - platform::errors::InvalidArgument( - "Input(Atol) type must be double, but get %s", - framework::DataTypeToString( - framework::TransToProtoVarType(atol->dtype())))); - atol_v = get_tensor_value(dev_ctx, *atol); - } - - IscloseFunctor()(dev_ctx, *input, *other, rtol_v, atol_v, - equal_nan, out); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/kldiv_loss_op.cc b/paddle/fluid/operators/kldiv_loss_op.cc index a78d8ec10149db5a1f8d585cb06bb08ea6ca5a5f..67c1942ea0b41e480c524f9c188b2a82649ba44e 100644 --- a/paddle/fluid/operators/kldiv_loss_op.cc +++ b/paddle/fluid/operators/kldiv_loss_op.cc @@ -9,10 +9,11 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/kldiv_loss_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -22,44 +23,6 @@ using framework::Tensor; class KLDivLossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "KLDivLoss"); - OP_INOUT_CHECK(ctx->HasInput("Target"), "Input", "Target", "KLDivLoss"); - OP_INOUT_CHECK(ctx->HasOutput("Loss"), "Output", "Loss", "KLDivLoss"); - - auto dim_x = ctx->GetInputDim("X"); - auto dim_target = ctx->GetInputDim("Target"); - PADDLE_ENFORCE_EQ(dim_x.size(), dim_target.size(), - platform::errors::InvalidArgument( - "Input(X) rank and Input(Target) rank should be " - "same, but received X rank(%d) != Target rank(%d)", - dim_x.size(), dim_target.size())); - for (int i = 0; i < dim_x.size(); i++) { - if (ctx->IsRuntime() || (dim_x[i] > 0 && dim_target[i] > 0)) { - PADDLE_ENFORCE_EQ( - dim_x[i], dim_target[i], - platform::errors::InvalidArgument( - "Input(X) and Input(Target) should in same shape. but received " - "X dimension[%d](%d) != Target dimension[%d](%d)", - i, dim_x[i], i, dim_target[i])); - } - } - - auto reduction = ctx->Attrs().Get("reduction"); - - auto reduction_valid = "mean" == reduction || "sum" == reduction || - "batchmean" == reduction || "none" == reduction; - PADDLE_ENFORCE_EQ( - reduction_valid, true, - platform::errors::InvalidArgument( - "Attr(reduction) can only be 'none'|'batchmean'|'sum'|'mean'.")); - - if ("none" == reduction) { - ctx->SetOutputDim("Loss", dim_x); - } else { - ctx->SetOutputDim("Loss", {1}); - } - } protected: framework::OpKernelType GetExpectedKernelType( @@ -172,15 +135,12 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(KLDivLossGradNoNeedBufferVarInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(kldiv_loss, KLDivInferShapeFunctor, + PD_INFER_META(phi::KLDivInferMeta)); + REGISTER_OPERATOR(kldiv_loss, ops::KLDivLossOp, ops::KLDivLossOpMaker, ops::KLDivLossOpGradMaker, - ops::KLDivLossOpGradMaker); + ops::KLDivLossOpGradMaker, + KLDivInferShapeFunctor); REGISTER_OPERATOR(kldiv_loss_grad, ops::KLDivLossOpGrad, ops::KLDivLossGradNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL( - kldiv_loss, ops::KLDivLossKernel, - ops::KLDivLossKernel); -REGISTER_OP_CPU_KERNEL( - kldiv_loss_grad, - ops::KLDivLossGradKernel, - ops::KLDivLossGradKernel); diff --git a/paddle/fluid/operators/kldiv_loss_op.cu b/paddle/fluid/operators/kldiv_loss_op.cu deleted file mode 100644 index 5226cb8c08e3db4a0bfbbe4440c27264903f06e3..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/kldiv_loss_op.cu +++ /dev/null @@ -1,22 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include "paddle/fluid/operators/kldiv_loss_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - kldiv_loss, - ops::KLDivLossKernel, - ops::KLDivLossKernel); -REGISTER_OP_CUDA_KERNEL( - kldiv_loss_grad, - ops::KLDivLossGradKernel, - ops::KLDivLossGradKernel); diff --git a/paddle/fluid/operators/kldiv_loss_op.h b/paddle/fluid/operators/kldiv_loss_op.h deleted file mode 100644 index 5a6ef06f5eb1e855c8a528664528c9919304c7b9..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/kldiv_loss_op.h +++ /dev/null @@ -1,119 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using Array1 = Eigen::DSizes; - -template -struct KLDivLossForward { - HOSTDEVICE KLDivLossForward() {} - - HOSTDEVICE T operator()(const T& target, const T& input) const { - if (target <= 0) { - return 0; - } else { - return target * (std::log(target) - input); - } - } -}; - -template -struct KLDivLossBackward { - HOSTDEVICE KLDivLossBackward() {} - - HOSTDEVICE T operator()(const T& target, const T& grad) const { - if (target <= 0) { - return 0; - } else { - return static_cast(-1.) 
* grad; - } - } -}; - -template -class KLDivLossKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& place = *ctx.template device_context().eigen_device(); - auto* input = ctx.Input("X"); - auto* target = ctx.Input("Target"); - auto* loss = ctx.Output("Loss"); - auto reduction = ctx.Attr("reduction"); - - const int n = input->dims()[0]; - - loss->mutable_data(ctx.GetPlace()); - auto input_t = framework::EigenVector::Flatten(*input); - auto target_t = framework::EigenVector::Flatten(*target); - auto loss_t = framework::EigenVector::Flatten(*loss); - auto output = target_t.binaryExpr(input_t, KLDivLossForward()); - if ("none" == reduction) { - loss_t.device(place) = output; - } else if ("batchmean" == reduction) { - auto output_sum = output.sum(); - if (n > 0) { - loss_t.device(place) = output_sum / output_sum.constant(n); - } else { - loss_t.device(place) = output_sum; - } - } else if ("mean" == reduction) { - loss_t.device(place) = output.mean(); - } else if ("sum" == reduction) { - loss_t.device(place) = output.sum(); - } - } -}; - -template -class KLDivLossGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& place = *ctx.template device_context().eigen_device(); - auto* target = ctx.Input("Target"); - auto reduction = ctx.Attr("reduction"); - auto* input_grad = ctx.Output(framework::GradVarName("X")); - auto* loss_grad = ctx.Input(framework::GradVarName("Loss")); - - const int n = input_grad->dims()[0]; - const int numel = input_grad->numel(); - const int expand = numel / loss_grad->numel(); - - input_grad->mutable_data(ctx.GetPlace()); - - auto target_t = framework::EigenVector::Flatten(*target); - - auto input_grad_t = framework::EigenVector::Flatten(*input_grad); - auto loss_grad_t = framework::EigenVector::Flatten(*loss_grad); - - auto loss_grad_expand = loss_grad_t.broadcast(Array1(expand)); - auto grad_t = target_t * loss_grad_expand; - input_grad_t.device(place) = - target_t.binaryExpr(grad_t, KLDivLossBackward()); - - if ("mean" == reduction) { - input_grad_t.device(place) = input_grad_t / static_cast(numel); - } else if ("batchmean" == reduction) { - input_grad_t.device(place) = input_grad_t / static_cast(n); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/kldiv_loss_op_npu.cc b/paddle/fluid/operators/kldiv_loss_op_npu.cc index 322ae5df4cb877b4dde022e6c203a32cd8dd001d..eac181489aa9d09f4661c898b13e77570ad928a8 100644 --- a/paddle/fluid/operators/kldiv_loss_op_npu.cc +++ b/paddle/fluid/operators/kldiv_loss_op_npu.cc @@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the Licnse. */ -#include "paddle/fluid/operators/kldiv_loss_op.h" +#include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/kron_op.cc b/paddle/fluid/operators/kron_op.cc index 58d51ab1c723f296d3728a23de95a116acbb4df3..60390016d66e3addf0ead14f6b9209511324961c 100644 --- a/paddle/fluid/operators/kron_op.cc +++ b/paddle/fluid/operators/kron_op.cc @@ -17,9 +17,9 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/operators/kron_op.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -28,27 +28,6 @@ class KronOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "kron"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "kron"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "kron"); - - auto dim_x = ctx->GetInputDim("X"); - auto dim_y = ctx->GetInputDim("Y"); - auto rank_x = dim_x.size(); - auto rank_y = dim_y.size(); - auto rank = (rank_x > rank_y) ? rank_x : rank_y; - - std::vector dim_out; - dim_out.reserve(rank); - for (int i = 0; i < rank; i++) { - int64_t dim_xi = (i < rank - rank_x) ? 1 : dim_x.at(i - (rank - rank_x)); - int64_t dim_yi = (i < rank - rank_y) ? 1 : dim_y.at(i - (rank - rank_y)); - dim_out.push_back(dim_xi == -1 || dim_yi == -1 ? -1 : dim_xi * dim_yi); - } - ctx->SetOutputDim("Out", phi::make_ddim(dim_out)); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -175,30 +154,10 @@ class KronGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(kron, KronInferShapeFunctor, + PD_INFER_META(phi::KronInferMeta)); REGISTER_OPERATOR(kron, ops::KronOp, ops::KronOpMaker, ops::KronGradOpMaker, - ops::KronGradOpMaker); -REGISTER_OP_CPU_KERNEL( - kron, ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel>, - ops::KronKernel>); - + ops::KronGradOpMaker, + KronInferShapeFunctor); REGISTER_OPERATOR(kron_grad, ops::KronGradOp); -REGISTER_OP_CPU_KERNEL( - kron_grad, ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel>, - ops::KronGradKernel>); diff --git a/paddle/fluid/operators/kron_op.cu b/paddle/fluid/operators/kron_op.cu deleted file mode 100644 index e5124e65007509568ae8cd8ab65b33c504a12fe9..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/kron_op.cu +++ /dev/null @@ -1,42 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/kron_op.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/float16.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - kron, ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel>, - ops::KronKernel>); - -REGISTER_OP_CUDA_KERNEL( - kron_grad, ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel>, - ops::KronGradKernel>); diff --git a/paddle/fluid/operators/kron_op.h b/paddle/fluid/operators/kron_op.h deleted file mode 100644 index 274b47c03a4d3d381dceda43d502a6e2d14669a5..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/kron_op.h +++ /dev/null @@ -1,415 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/for_range.h" -#if defined(__NVCC__) || defined(__HIPCC__) -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "thrust/device_vector.h" -#endif - -namespace paddle { -namespace operators { - -// Process an element in the output, used with a parallel-for -template -struct KronElemFunctor { - KronElemFunctor(const T* a, const T* b, T* out, const int64_t* shape_b, - const int64_t* stride_a, const int64_t* stride_b, - const int64_t* stride_out, int ndims) - : a_(a), - b_(b), - out_(out), - shape_b_(shape_b), - stride_a_(stride_a), - stride_b_(stride_b), - stride_out_(stride_out), - ndims_(ndims) {} - - HOSTDEVICE void operator()(int64_t idx) const { - // it computes 1 element in the output - int64_t index = idx; - int64_t index_a = 0; - int64_t index_b = 0; - for (int i = 0; i < ndims_; i++) { - auto pos_i = index / stride_out_[i]; - index = index % stride_out_[i]; - auto pos_ai = pos_i / shape_b_[i]; - auto pos_bi = pos_i % shape_b_[i]; - index_a += stride_a_[i] * pos_ai; - index_b += stride_b_[i] * pos_bi; - } - out_[idx] = a_[index_a] * b_[index_b]; - } - - private: - const T* a_; - const T* b_; - T* out_; - const int64_t* shape_b_; - const int64_t* stride_a_; - const int64_t* stride_b_; - const int64_t* stride_out_; - const int ndims_; -}; - -template -struct KronOpFunctor { - void operator()(const DeviceContext& dev_ctx, const framework::Tensor& x, - const framework::Tensor& y, framework::Tensor* out) { - int ndims = out->dims().size(); - int64_t numel = out->numel(); - - const framework::DDim& dim_x = x.dims(); - const framework::DDim& dim_y = y.dims(); - const framework::DDim& dim_out = out->dims(); - const framework::DDim stride_x = phi::stride(dim_x); - const framework::DDim stride_y = phi::stride(dim_y); - const framework::DDim stride_out = phi::stride(dim_out); - - const int64_t *p_stride_x = nullptr, *p_stride_y = nullptr, - *p_stride_out = nullptr, *p_shape_y = nullptr; -#if defined(__NVCC__) || defined(__HIPCC__) - 
thrust::device_vector d_stride_x(ndims); - thrust::device_vector d_stride_y(ndims); - thrust::device_vector d_stride_out(ndims); - thrust::device_vector d_shape_y(ndims); - thrust::copy(stride_x.Get(), stride_x.Get() + ndims, d_stride_x.begin()); - thrust::copy(stride_y.Get(), stride_y.Get() + ndims, d_stride_y.begin()); - thrust::copy(stride_out.Get(), stride_out.Get() + ndims, - d_stride_out.begin()); - thrust::copy(dim_y.Get(), dim_y.Get() + ndims, d_shape_y.begin()); - - p_stride_x = thrust::raw_pointer_cast(d_stride_x.data()); - p_stride_y = thrust::raw_pointer_cast(d_stride_y.data()); - p_stride_out = thrust::raw_pointer_cast(d_stride_out.data()); - p_shape_y = thrust::raw_pointer_cast(d_shape_y.data()); -#else - p_stride_x = stride_x.Get(); - p_stride_y = stride_y.Get(); - p_stride_out = stride_out.Get(); - p_shape_y = dim_y.Get(); -#endif - - platform::ForRange for_range(dev_ctx, numel); - KronElemFunctor functor(x.data(), y.data(), out->data(), - p_shape_y, p_stride_x, p_stride_y, p_stride_out, - ndims); - for_range(functor); - } -}; - -template -struct KronGradElemFunctor { - KronGradElemFunctor(const T* dout, const T* A, const T* B, T* dout_a, - T* dout_b, const int64_t* stride_dout, - const int64_t* stride_a, const int64_t* stride_b, - const int64_t* shape_b, const int64_t numel_a, - const int64_t numel_b, const int ndims) - : dout_(dout), - A_(A), - B_(B), - dout_a_(dout_a), - dout_b_(dout_b), - stride_dout_(stride_dout), - stride_a_(stride_a), - stride_b_(stride_b), - shape_b_(shape_b), - numel_a_(numel_a), - numel_b_(numel_b), - ndims_(ndims) {} - - HOSTDEVICE void operator()(int64_t idx) { - int64_t index = idx; - int64_t index_a = 0; - int64_t index_b = 0; - for (int i = 0; i < ndims_; i++) { - auto pos_i = index / stride_dout_[i]; - index = index % stride_dout_[i]; - auto pos_ai = pos_i / shape_b_[i]; - auto pos_bi = pos_i % shape_b_[i]; - index_a += stride_a_[i] * pos_ai; - index_b += stride_b_[i] * pos_bi; - } - - if (dout_a_) { - size_t index_out_a = index_a * numel_b_ + index_b; - dout_a_[index_out_a] = dout_[idx] * B_[index_b]; - } - if (dout_b_) { - size_t index_out_b = index_b * numel_a_ + index_a; - dout_b_[index_out_b] = dout_[idx] * A_[index_a]; - } - } - - private: - const T* dout_; - const T* A_; - const T* B_; - T* dout_a_; - T* dout_b_; - const int64_t* stride_dout_; - const int64_t* stride_a_; - const int64_t* stride_b_; - const int64_t* shape_b_; - const int64_t numel_a_; - const int64_t numel_b_; - const int ndims_; -}; - -template -struct KronGradElemFunctor> { - KronGradElemFunctor(const platform::complex* dout, - const platform::complex* A, - const platform::complex* B, - platform::complex* dout_a, - platform::complex* dout_b, const int64_t* stride_dout, - const int64_t* stride_a, const int64_t* stride_b, - const int64_t* shape_b, const int64_t numel_a, - const int64_t numel_b, const int ndims) - : dout_(dout), - A_(A), - B_(B), - dout_a_(dout_a), - dout_b_(dout_b), - stride_dout_(stride_dout), - stride_a_(stride_a), - stride_b_(stride_b), - shape_b_(shape_b), - numel_a_(numel_a), - numel_b_(numel_b), - ndims_(ndims) {} - - HOSTDEVICE void operator()(int64_t idx) { - int64_t index = idx; - int64_t index_a = 0; - int64_t index_b = 0; - for (int i = 0; i < ndims_; i++) { - auto pos_i = index / stride_dout_[i]; - index = index % stride_dout_[i]; - auto pos_ai = pos_i / shape_b_[i]; - auto pos_bi = pos_i % shape_b_[i]; - index_a += stride_a_[i] * pos_ai; - index_b += stride_b_[i] * pos_bi; - } - - if (dout_a_) { - size_t index_out_a = index_a * numel_b_ + 
index_b; - dout_a_[index_out_a] = - dout_[idx] * - platform::complex(B_[index_b].real, -B_[index_b].imag); - } - if (dout_b_) { - size_t index_out_b = index_b * numel_a_ + index_a; - dout_b_[index_out_b] = - dout_[idx] * - platform::complex(A_[index_a].real, -A_[index_a].imag); - } - } - - private: - const platform::complex* dout_; - const platform::complex* A_; - const platform::complex* B_; - platform::complex* dout_a_; - platform::complex* dout_b_; - const int64_t* stride_dout_; - const int64_t* stride_a_; - const int64_t* stride_b_; - const int64_t* shape_b_; - const int64_t numel_a_; - const int64_t numel_b_; - const int ndims_; -}; - -template -struct KronGradOpFunctor { - void operator()(const DeviceContext& dev_ctx, const framework::Tensor& dout, - const framework::Tensor& x, const framework::Tensor& y, - framework::Tensor* dx, framework::Tensor* dy) { - int ndims = dout.dims().size(); - int64_t numel = dout.numel(); - int64_t numel_x = x.numel(); - int64_t numel_y = y.numel(); - - const framework::DDim& dim_x = x.dims(); - const framework::DDim& dim_y = y.dims(); - const framework::DDim& dim_dout = dout.dims(); - - const framework::DDim stride_x = phi::stride(dim_x); - const framework::DDim stride_y = phi::stride(dim_y); - const framework::DDim stride_dout = phi::stride(dim_dout); - - const int64_t* p_stride_x = nullptr; - const int64_t* p_stride_y = nullptr; - const int64_t* p_stride_dout = nullptr; - const int64_t* p_shape_y = nullptr; -#if defined(__NVCC__) || defined(__HIPCC__) - thrust::device_vector d_stride_x(ndims); - thrust::device_vector d_stride_y(ndims); - thrust::device_vector d_stride_dout(ndims); - thrust::device_vector d_shape_y(ndims); - thrust::copy(stride_x.Get(), stride_x.Get() + ndims, d_stride_x.begin()); - thrust::copy(stride_y.Get(), stride_y.Get() + ndims, d_stride_y.begin()); - thrust::copy(stride_dout.Get(), stride_dout.Get() + ndims, - d_stride_dout.begin()); - thrust::copy(dim_y.Get(), dim_y.Get() + ndims, d_shape_y.begin()); - - p_stride_x = thrust::raw_pointer_cast(d_stride_x.data()); - p_stride_y = thrust::raw_pointer_cast(d_stride_y.data()); - p_stride_dout = thrust::raw_pointer_cast(d_stride_dout.data()); - p_shape_y = thrust::raw_pointer_cast(d_shape_y.data()); -#else - p_stride_x = stride_x.Get(); - p_stride_y = stride_y.Get(); - p_stride_dout = stride_dout.Get(); - p_shape_y = dim_y.Get(); -#endif - // dout_x: dout * kron(ones(X), Y) re-aranged in shape (numel_x, numel_y) - // dout_y: dout * kron(X, ones(Y)) re-aranged in shaoe (numel_y, numel_x) - framework::Tensor dout_x; - T* p_dout_x = nullptr; - if (dx) { - dout_x.mutable_data({numel_x, numel_y}, dev_ctx.GetPlace()); - p_dout_x = dout_x.data(); - } - framework::Tensor dout_y; - T* p_dout_y = nullptr; - if (dy) { - dout_y.mutable_data({numel_y, numel_x}, dev_ctx.GetPlace()); - p_dout_y = dout_y.data(); - } - - platform::ForRange for_range(dev_ctx, numel); - KronGradElemFunctor func(dout.data(), x.data(), y.data(), - p_dout_x, p_dout_y, p_stride_dout, p_stride_x, - p_stride_y, p_shape_y, numel_x, numel_y, ndims); - for_range(func); - -// reduce_sum along aixs 1 -#if defined(__NVCC__) || defined(__HIPCC__) - auto stream = dev_ctx.stream(); // it is a cuda device_context - if (dx) { - TensorReduceImpl>( - dev_ctx, dout_x, dx, kps::IdentityFunctor(), {1}, stream); - } - if (dy) { - TensorReduceImpl>( - dev_ctx, dout_y, dy, kps::IdentityFunctor(), {1}, stream); - } -#else - auto* place = dev_ctx.eigen_device(); - Eigen::array reduce_dim = {1}; - if (dx) { - auto eigen_dout_x = 
framework::EigenMatrix::Reshape(dout_x, 1); - auto eigen_vec_dx = framework::EigenVector::Flatten(*dx); - eigen_vec_dx.device(*place) = eigen_dout_x.sum(reduce_dim); - } - if (dy) { - auto eigen_dout_y = framework::EigenMatrix::Reshape(dout_y, 1); - auto eigen_vec_dy = framework::EigenVector::Flatten(*dy); - eigen_vec_dy.device(*place) = eigen_dout_y.sum(reduce_dim); - } -#endif - } -}; - -inline framework::Tensor UnsqueezeTo(const framework::Tensor& src, int ndims) { - const framework::DDim& shape = src.dims(); - int rank = shape.size(); - framework::Tensor res; - res.ShareDataWith(src); - PADDLE_ENFORCE_LE( - rank, ndims, - platform::errors::InvalidArgument( - "The input Tensor's rank should be less than or equal to ndims" - "Received input Tensor's rank = %d, ndims = %d", - rank, ndims)); - if (rank < ndims) { - std::vector new_dim(ndims, 1); - for (int i = ndims - rank; i < ndims; i++) { - new_dim[i] = shape[i - ndims + rank]; - } - res.Resize(phi::make_ddim(new_dim)); - } - return res; -} - -template -class KronKernel : public framework::OpKernel { - public: - virtual void Compute(const framework::ExecutionContext& ctx) const { - auto& dev_ctx = ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - int ndims = out->dims().size(); - framework::Tensor xx = UnsqueezeTo(*x, ndims); - framework::Tensor yy = UnsqueezeTo(*y, ndims); - - KronOpFunctor func; - func(dev_ctx, xx, yy, out); - } -}; - -template -class KronGradKernel : public framework::OpKernel { - public: - virtual void Compute(const framework::ExecutionContext& ctx) const { - auto& dev_ctx = ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - if (dx) { - dx->mutable_data(ctx.GetPlace()); - } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - } - - int ndims = dout->dims().size(); - framework::Tensor xx = UnsqueezeTo(*x, ndims); - framework::Tensor yy = UnsqueezeTo(*y, ndims); - - framework::Tensor* pdxx = nullptr; - framework::Tensor* pdyy = nullptr; - framework::Tensor dxx; - framework::Tensor dyy; - if (dx) { - dxx = UnsqueezeTo(*dx, ndims); - pdxx = &dxx; - } - - if (dy) { - dyy = UnsqueezeTo(*dy, ndims); - pdyy = &dyy; - } - - KronGradOpFunctor func; - func(dev_ctx, *dout, xx, yy, pdxx, pdyy); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/kthvalue_op.cc b/paddle/fluid/operators/kthvalue_op.cc index 2a79cee27814e86b277d927082e9a772359217f1..4c679d30263863c70176bebb686556af056068d0 100644 --- a/paddle/fluid/operators/kthvalue_op.cc +++ b/paddle/fluid/operators/kthvalue_op.cc @@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/kthvalue_op.h" #include #include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -25,54 +26,6 @@ class KthvalueOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "kthvalue"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "kthvalue"); - OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "kthvalue"); - auto input_dims = ctx->GetInputDim("X"); - const int& dim_size = input_dims.size(); - int axis = static_cast(ctx->Attrs().Get("axis")); - PADDLE_ENFORCE_LT(axis, dim_size, - paddle::platform::errors::InvalidArgument( - "the axis must be [-%d, %d), but received %d .", - dim_size, dim_size, axis)); - PADDLE_ENFORCE_GE(axis, -dim_size, - paddle::platform::errors::InvalidArgument( - "the axis must be [-%d, %d), but received %d .", - dim_size, dim_size, axis)); - if (axis < 0) axis += dim_size; - int k = static_cast(ctx->Attrs().Get("k")); - PADDLE_ENFORCE_GE( - k, 1, paddle::platform::errors::InvalidArgument( - "the k in the kthvalue must >= 1, but received %d .", k)); - PADDLE_ENFORCE_GE(input_dims.size(), 1, - paddle::platform::errors::InvalidArgument( - "input of kthvalue must have >= 1d shape")); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_GE( - input_dims[axis], k, - paddle::platform::errors::InvalidArgument( - "input of kthvalue must have >= %d columns in axis of %d", k, - axis)); - } - bool keepdim = ctx->Attrs().Get("keepdim"); - std::vector dimvec; - for (int64_t i = 0; i < axis; i++) { - dimvec.emplace_back(input_dims[i]); - } - if (keepdim) { - dimvec.emplace_back(static_cast(1)); - } - for (int64_t i = axis + 1; i < dim_size; i++) { - dimvec.emplace_back(input_dims[i]); - } - framework::DDim dims = phi::make_ddim(dimvec); - ctx->SetOutputDim("Out", dims); - ctx->SetOutputDim("Indices", dims); - ctx->ShareLoD("X", "Out"); - ctx->ShareLoD("X", "Indices"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -155,20 +108,13 @@ class KthvalueGradOpMaker : public framework::SingleGradOpMaker { } // namespace operators } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(kthvalue, KthvalueInferShapeFunctor, + PD_INFER_META(phi::KthvalueInferMeta)); + namespace ops = paddle::operators; REGISTER_OPERATOR(kthvalue, ops::KthvalueOp, ops::KthvalueOpMaker, ops::KthvalueGradOpMaker, - ops::KthvalueGradOpMaker); -REGISTER_OP_CPU_KERNEL( - kthvalue, ops::KthvalueCPUKernel, - ops::KthvalueCPUKernel, - ops::KthvalueCPUKernel, - ops::KthvalueCPUKernel); + ops::KthvalueGradOpMaker, + KthvalueInferShapeFunctor); REGISTER_OPERATOR(kthvalue_grad, ops::KthvalueOpGrad); -REGISTER_OP_CPU_KERNEL( - kthvalue_grad, - ops::KthvalueGradCPUKernel, - ops::KthvalueGradCPUKernel, - ops::KthvalueGradCPUKernel, - ops::KthvalueGradCPUKernel); diff --git a/paddle/fluid/operators/kthvalue_op.cu b/paddle/fluid/operators/kthvalue_op.cu deleted file mode 100644 index f6f56f70f1a11971b31e679ef879f2d1d0a96085..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/kthvalue_op.cu +++ /dev/null @@ -1,278 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/kthvalue_op.h" -#include "paddle/fluid/operators/top_k_function_cuda.h" -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -#endif - -namespace paddle { -namespace operators { - -int getBlockSize(int col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; -} - -template -bool SortKthvalue(const platform::CUDADeviceContext& ctx, - const framework::Tensor* input_tensor, const int64_t num_cols, - const int64_t num_rows, const int k, - framework::Tensor* out_tensor, - framework::Tensor* indices_tensor) { - auto cu_stream = ctx.stream(); - framework::Tensor input_indices; - const std::vector dims = {num_rows, num_cols}; - auto dim = phi::make_ddim(dims); - input_indices.Resize(dim); - input_indices.mutable_data(ctx.GetPlace()); - size_t temp_storage_bytes = -1; - int block_size = getBlockSize(num_cols); - unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; - unsigned int grid_size = num_rows < maxGridDimX - ? 
static_cast(num_rows) - : maxGridDimX; - InitIndex<<>>( - input_indices.data(), num_rows, num_cols); - cub::CountingInputIterator counting_iter(0); - cub::TransformInputIterator> - segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols)); - T* sorted_values_ptr; - int64_t* sorted_indices_ptr; - framework::Tensor temp_values, temp_indices; - const T* input = input_tensor->data(); - T* values = out_tensor->data(); - int64_t* indices = indices_tensor->mutable_data(ctx.GetPlace()); - temp_values.Resize(dim); - temp_indices.Resize(dim); - sorted_values_ptr = temp_values.mutable_data(ctx.GetPlace()); - sorted_indices_ptr = temp_indices.mutable_data(ctx.GetPlace()); - auto err = cub::DeviceSegmentedRadixSort::SortPairs( - nullptr, temp_storage_bytes, input, sorted_values_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); -#ifdef __HIPCC__ - if (err != hipSuccess) { - LOG(ERROR) << "KthvalueOP failed as could not launch " - "hipcub::DeviceSegmentedRadixSort::SortPairs, status: " - << hipGetErrorString(err); - return false; - } -#else - if (err != cudaSuccess) { - LOG(ERROR) << "KthvalueOP failed as could not launch " - "cub::DeviceSegmentedRadixSort::SortPairs, status: " - << cudaGetErrorString(err); - return false; - } -#endif - framework::Tensor temp_storage; - temp_storage.mutable_data(ctx.GetPlace(), temp_storage_bytes); - - err = cub::DeviceSegmentedRadixSort::SortPairs( - temp_storage.data(), temp_storage_bytes, input, - sorted_values_ptr, input_indices.data(), sorted_indices_ptr, - num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1, - 0, sizeof(T) * 8, cu_stream); -#ifdef __HIPCC__ - if (err != hipSuccess) { - LOG(ERROR) << "KthvalueOP failed as could not launch " - "hipcub::DeviceSegmentedRadixSort::SortPairs, " - << temp_storage_bytes << ", status: " << hipGetErrorString(err); - return false; - } -#else - if (err != cudaSuccess) { - LOG(ERROR) << "KthvalueOP failed as could not launch " - "cub::DeviceSegmentedRadixSort::SortPairs, " - << temp_storage_bytes << ", status: " << cudaGetErrorString(err); - return false; - } -#endif - auto& dev = *ctx.eigen_device(); - const Eigen::DSizes slice_indices{0, k - 1}; - const Eigen::DSizes slice_sizes{num_rows, 1}; - auto e_indices = framework::EigenMatrix::From(*indices_tensor, dim); - auto e_tmp_indices = framework::EigenMatrix::From( - static_cast(temp_indices)); - std::vector odims = {static_cast(num_rows), static_cast(1)}; - dim = phi::make_ddim(odims); - auto e_values = framework::EigenMatrix::From(*out_tensor, dim); - auto e_tmp_values = framework::EigenMatrix::From( - static_cast(temp_values)); - - EigenSlice, int64_t, 2>::Eval( - dev, e_indices, e_tmp_indices, slice_indices, slice_sizes); - EigenSlice, T, 2>::Eval( - dev, e_values, e_tmp_values, slice_indices, slice_sizes); - return true; -} - -template -class KthvalueOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - int k = static_cast(ctx.Attr("k")); - int axis = static_cast(ctx.Attr("axis")); - bool keepdim = static_cast(ctx.Attr("keepdim")); - const auto& in_dims = input->dims(); - if (axis < 0) axis += 
in_dims.size(); - auto out_dims = output->dims(); - const T* input_data = input->data(); - T* output_data = output->mutable_data(ctx.GetPlace()); - int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); - - if (axis == in_dims.size() - 1) { - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - const auto& dev_ctx = ctx.cuda_device_context(); - PADDLE_ENFORCE_EQ(SortKthvalue(dev_ctx, input, input_width, - input_height, k, output, indices), - true, platform::errors::External( - "KthvalueOP: Error when use cub sorting")); - return; - } else { - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.emplace_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - framework::DDim tmp_out_dims = phi::make_ddim(tmp_out_shape); - output->Resize(tmp_out_dims); - indices->Resize(tmp_out_dims); - } - framework::DDim trans_dims(in_dims); - framework::DDim trans_out_dims(in_dims); - for (int i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - trans_out_dims[i] = in_dims[trans[i]]; - } - trans_out_dims[in_dims.size() - 1] = 1; - framework::Tensor trans_input; - trans_input.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - const auto& dev_ctx = ctx.cuda_device_context(); - TransCompute(ndims, dev_ctx, *input, - &trans_input, trans); - framework::Tensor trans_ind, trans_out; - trans_ind.mutable_data(trans_out_dims, ctx.GetPlace()); - trans_out.mutable_data(trans_out_dims, ctx.GetPlace()); - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - PADDLE_ENFORCE_EQ( - SortKthvalue(dev_ctx, &trans_input, input_width, input_height, k, - &trans_out, &trans_ind), - true, - platform::errors::External("KthvalueOP: Error when use cub sorting")); - TransCompute( - ndims, dev_ctx, trans_ind, indices, trans); - TransCompute(ndims, dev_ctx, trans_out, - output, trans); - if (!keepdim) { - output->Resize(out_dims); - indices->Resize(out_dims); - } - } - } -}; - -template -class KthvalueOpGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(context.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* x = context.Input("X"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = - context.Output(framework::GradVarName("X")); - int axis = context.Attr("axis"); - int k = static_cast(context.Attr("k")); - const auto& in_dims = x->dims(); - auto out_dims = indices->dims(); - if (axis < 0) axis += in_dims.size(); - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - const T* out_grad_data = out_grad->data(); - const int64_t* indices_data = indices->data(); - int pre, n, post; - GetDims(in_dims, axis, &pre, &n, &post); - auto& dev_ctx = context.cuda_device_context(); - int block_size = getBlockSize(post * k); - int 
max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); - int grid_size = std::min(max_blocks, pre); - AssignGradWithAxis<<>>( - out_grad_data, indices_data, x_grad_data, pre, post, n, 1); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - kthvalue, - ops::KthvalueOpCUDAKernel, - ops::KthvalueOpCUDAKernel, - ops::KthvalueOpCUDAKernel, - ops::KthvalueOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL( - kthvalue_grad, - ops::KthvalueOpGradCUDAKernel, - ops::KthvalueOpGradCUDAKernel, - ops::KthvalueOpGradCUDAKernel, - ops::KthvalueOpGradCUDAKernel); diff --git a/paddle/fluid/operators/kthvalue_op.h b/paddle/fluid/operators/kthvalue_op.h deleted file mode 100644 index 15df0a10c6992f07f9913b867319bff342180c3d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/kthvalue_op.h +++ /dev/null @@ -1,281 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/transpose_op.h" - -namespace paddle { -namespace operators { -template -static void getKthvalue(Type input_height, Type input_width, int input_dim, - const framework::Tensor* input, T* t_out, - Type* t_indices, const int& k) { - bool partial_sort_flag = (k * 64) < input_width; -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - std::vector> col_vec; - col_vec.reserve(input_width); - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(j), j)); - } - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(i, j), j)); - } - } - if (partial_sort_flag) { - std::partial_sort( - col_vec.begin(), col_vec.begin() + k, col_vec.end(), - [](const std::pair& l, const std::pair& r) { - return (!std::isnan(static_cast(l.first)) && - std::isnan(static_cast(r.first))) || - (l.first < r.first); - }); - } else { - std::nth_element( - col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(), - [](const std::pair& l, const std::pair& r) { - return (!std::isnan(static_cast(l.first)) && - std::isnan(static_cast(r.first))) || - (l.first < r.first); - }); - } - t_out[i] = col_vec[k - 1].first; - t_indices[i] = col_vec[k - 1].second; - } -} - -template -static void kthvalueAssign(const Type& input_height, const Type& input_width, - const int& input_dim, const framework::Tensor* input, - const framework::Tensor* indices, T* output_data) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - auto 
e_indices = framework::EigenVector::Flatten(*indices); - output_data[i * input_width + e_indices(0)] = e_input(0); - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - auto e_indices = - framework::EigenMatrix::Reshape(*indices, input_dim - 1); - output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0); - } - } -} - -template -class KthvalueCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - auto* indices = context.Output("Indices"); - const auto& in_dims = input->dims(); - int k = static_cast(context.Attr("k")); - bool keepdim = static_cast(context.Attr("keepdim")); - int axis = static_cast(context.Attr("axis")); - if (axis < 0) axis += in_dims.size(); - T* output_data = output->mutable_data(context.GetPlace()); - int64_t* indices_data = indices->mutable_data(context.GetPlace()); - auto out_dims = output->dims(); - if (axis == in_dims.size() - 1) { - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - getKthvalue(input_height, input_width, in_dims.size(), input, - output_data, indices_data, k); - } else { - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.emplace_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - framework::DDim tmp_out_dims = phi::make_ddim(tmp_out_shape); - output->Resize(tmp_out_dims); - indices->Resize(tmp_out_dims); - } - framework::DDim trans_dims(in_dims); - framework::DDim trans_out_dims(in_dims); - - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - trans_out_dims[i] = in_dims[trans[i]]; - } - trans_out_dims[in_dims.size() - 1] = 1; - framework::Tensor trans_inp; - trans_inp.mutable_data(trans_dims, context.GetPlace()); - int ndims = trans.size(); - auto& dev_context = - context.template device_context(); - TransCompute(ndims, dev_context, *input, - &trans_inp, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - framework::Tensor tmp_out, tmp_indices; - T* t_out = tmp_out.mutable_data(trans_out_dims, context.GetPlace()); - auto* t_ind = - tmp_indices.mutable_data(trans_out_dims, context.GetPlace()); - - getKthvalue(input_height, input_width, in_dims.size(), - &trans_inp, t_out, t_ind, k); - TransCompute( - ndims, dev_context, tmp_indices, indices, trans); - TransCompute(ndims, dev_context, tmp_out, - output, trans); - if (!keepdim) { - output->Resize(out_dims); - indices->Resize(out_dims); - } - } - } -}; - -template -class KthvalueGradCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = - context.Output(framework::GradVarName("X")); - int axis = static_cast(context.Attr("axis")); - bool keepdim = 
static_cast(context.Attr("keepdim")); - auto in_dims = x->dims(); - auto out_dims = indices->dims(); - axis = (axis < 0) ? (in_dims.size() + axis) : axis; - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(out_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(out_dims[i - 1]); - } - out_dims = phi::make_ddim(tmp_out_shape); - } - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - if (axis == in_dims.size() - 1) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - memset(x_grad_data, 0, x_grad->numel() * sizeof(T)); - if (keepdim) { - kthvalueAssign(input_height, input_width, in_dims.size(), out_grad, - indices, x_grad_data); - } else { - auto& dev_context = - context.template device_context(); - framework::Tensor out_grad_tmp, indices_tmp; - out_grad_tmp.mutable_data(out_grad->dims(), dev_context.GetPlace()); - indices_tmp.mutable_data(indices->dims(), - dev_context.GetPlace()); - framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context, - &out_grad_tmp); - framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context, - &indices_tmp); - out_grad_tmp.Resize(out_dims); - indices_tmp.Resize(out_dims); - kthvalueAssign(input_height, input_width, in_dims.size(), &out_grad_tmp, - &indices_tmp, x_grad_data); - } - } else { - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.emplace_back(out_dims.size() - 1); - for (int i = axis + 1; i < out_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - framework::DDim trans_dims(out_dims); - framework::DDim trans_in_dims(in_dims); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = out_dims[trans[i]]; - trans_in_dims[i] = in_dims[trans[i]]; - } - framework::Tensor trans_dO, trans_ind; - trans_dO.mutable_data(trans_dims, context.GetPlace()); - trans_ind.mutable_data(trans_dims, context.GetPlace()); - int ndims = trans.size(); - auto& dev_context = - context.template device_context(); - if (keepdim) { - TransCompute( - ndims, dev_context, *out_grad, &trans_dO, trans); - TransCompute( - ndims, dev_context, *indices, &trans_ind, trans); - } else { - framework::Tensor out_grad_tmp, indices_tmp; - out_grad_tmp.mutable_data(out_grad->dims(), dev_context.GetPlace()); - indices_tmp.mutable_data(indices->dims(), - dev_context.GetPlace()); - framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context, - &out_grad_tmp); - framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context, - &indices_tmp); - out_grad_tmp.Resize(out_dims); - indices_tmp.Resize(out_dims); - TransCompute( - ndims, dev_context, out_grad_tmp, &trans_dO, trans); - TransCompute( - ndims, dev_context, indices_tmp, &trans_ind, trans); - } - const int64_t input_height = phi::product( - phi::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1)); - const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1]; - framework::Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_in_dims, context.GetPlace()); - memset(t_out, 0, x_grad->numel() * sizeof(T)); - kthvalueAssign(input_height, input_width, in_dims.size(), - &trans_dO, &trans_ind, t_out); - TransCompute(ndims, dev_context, tmp_out, - x_grad, trans); - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h 
b/paddle/fluid/operators/layer_norm_kernel.cu.h index 412ae3c49b5f3cc9fc2422aa220af324e6d99b69..c0a4b88fc76fd0d648b289e0d2f13536523f02d8 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -758,12 +758,14 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel( */ template -void ln_bwd_1024_kernel_driver( - const platform::CUDADeviceContext &dev_ctx, const int rows, const int cols, - float epsilon, const T *x_ptr, const ScaleT *scale_ptr, const U *mean_ptr, - const U *var_ptr, const T *dout_ptr, T *dx_ptr, ScaleT *dscale_ptr, - ScaleT *dbias_ptr, const MaskType *mask_ptr = nullptr, - T factor = static_cast(0), T *d_dropout_src_ptr = nullptr) { +void ln_bwd_1024_kernel_driver(const phi::GPUContext &dev_ctx, const int rows, + const int cols, float epsilon, const T *x_ptr, + const ScaleT *scale_ptr, const U *mean_ptr, + const U *var_ptr, const T *dout_ptr, T *dx_ptr, + ScaleT *dscale_ptr, ScaleT *dbias_ptr, + const MaskType *mask_ptr = nullptr, + T factor = static_cast(0), + T *d_dropout_src_ptr = nullptr) { auto stream = dev_ctx.stream(); if (cols == 1024) { // step-1: compute dx and reduced part results of dscale and dbias. @@ -1334,8 +1336,7 @@ static void LayerNormBackward( const U *mean, const U *var, T *d_x, LayerNormScaleBiasT *d_scale, LayerNormScaleBiasT *d_bias, float epsilon, - int64_t batch_size, int64_t feature_size, - const platform::CUDADeviceContext &dev_ctx) { + int64_t batch_size, int64_t feature_size, const phi::GPUContext &dev_ctx) { auto stream = dev_ctx.stream(); #ifdef __HIPCC__ const int kMaxBlockDim = 256; diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc index e7d676479be0cc1176fa27c477bd35a5d6787cd3..224ab748dab6cdf8be246c4b400b4e55b6faf675 100644 --- a/paddle/fluid/operators/layer_norm_op.cc +++ b/paddle/fluid/operators/layer_norm_op.cc @@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/layer_norm_op.h" - #include #include +#include "paddle/fluid/framework/op_registry.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -278,10 +277,3 @@ REGISTER_OPERATOR(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker, ops::LayerNormGradOpMaker); REGISTER_OPERATOR(layer_norm_grad, ops::LayerNormGradOp, ops::LayerNormGradNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL( - layer_norm, ops::LayerNormKernel, - ops::LayerNormKernel); -REGISTER_OP_CPU_KERNEL( - layer_norm_grad, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel); diff --git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu deleted file mode 100644 index dfe73d3727132ae9b8f71e2a415ef5193f303493..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/layer_norm_op.cu +++ /dev/null @@ -1,289 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/operators/layer_norm_kernel.cu.h" -#include "paddle/fluid/operators/layer_norm_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -template -void LayerNormDirectCUDAFunctor::operator()(gpuStream_t stream, - const T *input, - std::vector input_shape, - const T *bias, const T *scale, - T *output, T *mean, T *variance, - int begin_norm_axis, float eps) { - const auto x_dims = phi::make_ddim(input_shape); - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int64_t batch_size = static_cast(matrix_dim[0]); - int64_t feature_size = static_cast(matrix_dim[1]); - switch (GetDesiredBlockDim(feature_size)) { - FIXED_BLOCK_DIM_CASE( - LayerNormForward<<>>( - input, scale, bias, output, mean, variance, eps, feature_size)); - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Product from begin_norm_axis to end in layer_norm must be larger " - "than 1")); - break; - } -} - -template -class LayerNormKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - using U = LayerNormParamType; - const float epsilon = ctx.Attr("epsilon"); - auto *scale = ctx.Input("Scale"); - auto *bias = ctx.Input("Bias"); - auto *x = ctx.Input("X"); - - auto *y = ctx.Output("Y"); - auto *mean = ctx.Output("Mean"); - auto *var = ctx.Output("Variance"); - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - - const auto x_dims = x->dims(); - auto *x_data = x->data(); - auto *y_data = y->mutable_data(ctx.GetPlace()); - auto *mean_data = mean->mutable_data(ctx.GetPlace()); - auto *var_data = var->mutable_data(ctx.GetPlace()); - - auto *void_scale_data = (scale == nullptr ? nullptr : scale->data()); - auto *void_bias_data = (bias == nullptr ? nullptr : bias->data()); - - framework::proto::VarType::Type x_dtype = - framework::TransToProtoVarType(x->dtype()); - framework::proto::VarType::Type scale_bias_dtype; - if (void_scale_data != nullptr) { - scale_bias_dtype = framework::TransToProtoVarType(scale->dtype()); - if (void_bias_data != nullptr) { - PADDLE_ENFORCE_EQ(scale_bias_dtype, - framework::TransToProtoVarType(bias->dtype()), - platform::errors::InvalidArgument( - "Thie Scale and Bias of layer_norm op " - "should have the same data type.")); - } - } else { - scale_bias_dtype = (void_bias_data != nullptr - ? 
framework::TransToProtoVarType(bias->dtype()) - : x_dtype); - } - - bool is_scale_bias_same_dtype_with_x = x_dtype == scale_bias_dtype; - if (!is_scale_bias_same_dtype_with_x) { - PADDLE_ENFORCE_EQ(scale_bias_dtype, - framework::DataTypeTrait::DataType(), - platform::errors::InvalidArgument( - "Unsupported data type of Scale and Bias: %s", - framework::DataTypeToString(scale_bias_dtype))); - } - - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int64_t batch_size = static_cast(matrix_dim[0]); - int64_t feature_size = static_cast(matrix_dim[1]); - - auto stream = ctx.cuda_device_context().stream(); - -#define PADDLE_LAUNCH_LAYERNORM_FWD(ScaleBiasT, IsScaleBiasSameDTypeWithX) \ - do { \ - switch (GetDesiredBlockDim(feature_size)) { \ - FIXED_BLOCK_DIM_CASE( \ - LayerNormForward<<< \ - batch_size, kBlockDim, 0, stream>>>( \ - x_data, static_cast(void_scale_data), \ - static_cast(void_bias_data), y_data, \ - mean_data, var_data, epsilon, feature_size)); \ - default: \ - PADDLE_THROW(platform::errors::InvalidArgument( \ - "Product from begin_norm_axis to end must be larger than 1")); \ - break; \ - } \ - } while (0) - -#ifdef PADDLE_WITH_CUDA - bool can_call_1024_kernel = false; - if (feature_size == 1024 && scale != nullptr && bias != nullptr) { - can_call_1024_kernel = true; - } - if (can_call_1024_kernel) { - const int WARPS_M = 4; - const int WARPS_N = 1; - const int THREADS_PER_WARP = 32; - const int BYTES_PER_LDG = 16; - const int VecSize = BYTES_PER_LDG / sizeof(T); - - const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M; - const int ROWS_PER_CTA = WARPS_M; - - const int grid = static_cast( - std::ceil(batch_size / static_cast(ROWS_PER_CTA))); - if (is_scale_bias_same_dtype_with_x) { - ln_fwd_1024_kernel<<>>( - batch_size, feature_size, epsilon, x_data, - static_cast(void_scale_data), - static_cast(void_bias_data), mean_data, var_data, - y_data); - } else { - ln_fwd_1024_kernel<<>>( - batch_size, feature_size, epsilon, x_data, - static_cast(void_scale_data), - static_cast(void_bias_data), mean_data, var_data, - y_data); - } - } else { -#endif - if (is_scale_bias_same_dtype_with_x) { - PADDLE_LAUNCH_LAYERNORM_FWD(T, true); - } else { - PADDLE_LAUNCH_LAYERNORM_FWD(U, false); - } -#ifdef PADDLE_WITH_CUDA - } -#endif - -#undef PADDLE_LAUNCH_LAYERNORM_FWD - } -}; - -template -class LayerNormGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - using U = LayerNormParamType; - const float epsilon = ctx.Attr("epsilon"); - // d_x, d_scale, d_bias may be nullptr - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - auto *x = ctx.Input("X"); - auto *mean = ctx.Input("Mean"); - auto *var = ctx.Input("Variance"); - auto *scale = ctx.Input("Scale"); - auto *bias = ctx.Input("Bias"); - auto *d_y = ctx.Input(framework::GradVarName("Y")); - - const auto &x_dims = x->dims(); - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int64_t batch_size = static_cast(matrix_dim[0]); - int64_t feature_size = static_cast(matrix_dim[1]); - - auto *x_data = x->data(); - auto *d_y_data = d_y->data(); - - auto *mean_data = mean->data(); - auto *var_data = var->data(); - - auto *d_x_data = - (d_x == nullptr ? 
nullptr : d_x->mutable_data(ctx.GetPlace())); - - framework::proto::VarType::Type x_dtype = - framework::TransToProtoVarType(x->dtype()); - framework::proto::VarType::Type scale_bias_dtype; - if (scale != nullptr) { - scale_bias_dtype = framework::TransToProtoVarType(scale->dtype()); - } else { - // FIXME(zengjinle): do not find a better way to get the right - // data type of the d_scale and d_bias if scale == nullptr. - auto *bias = ctx.Input("Bias"); - if (bias != nullptr) { - scale_bias_dtype = framework::TransToProtoVarType(bias->dtype()); - } else { - scale_bias_dtype = x_dtype; - } - } - -#define PADDLE_LAUNCH_LAYERNORM_BWD(ScaleBiasT, IsScaleBiasSameDTypeWithX) \ - do { \ - auto *scale_data = \ - (scale == nullptr ? nullptr : scale->data()); \ - auto *d_scale_data = \ - (d_scale == nullptr ? nullptr : d_scale->mutable_data( \ - ctx.GetPlace())); \ - auto *d_bias_data = \ - (d_bias == nullptr ? nullptr : d_bias->mutable_data( \ - ctx.GetPlace())); \ - auto *d_x_data = \ - (d_x == nullptr ? nullptr : d_x->mutable_data(ctx.GetPlace())); \ - LayerNormBackward( \ - x_data, d_y_data, scale_data, mean_data, var_data, d_x_data, \ - d_scale_data, d_bias_data, epsilon, batch_size, feature_size, \ - ctx.cuda_device_context()); \ - } while (0) - - if (scale_bias_dtype == x_dtype) { - PADDLE_LAUNCH_LAYERNORM_BWD(T, true); - } else { - PADDLE_LAUNCH_LAYERNORM_BWD(U, false); - } - -#undef PADDLE_LAUNCH_LAYERNORM_BWD - } -}; - -template class LayerNormDirectCUDAFunctor; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_CUDA_KERNEL( - layer_norm, - ops::LayerNormKernel, - ops::LayerNormKernel); -REGISTER_OP_CUDA_KERNEL( - layer_norm_grad, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel); -#elif CUDNN_VERSION_MIN(8, 1, 0) -REGISTER_OP_CUDA_KERNEL( - layer_norm, - ops::LayerNormKernel, - ops::LayerNormKernel, - ops::LayerNormKernel, - ops::LayerNormKernel); -REGISTER_OP_CUDA_KERNEL( - layer_norm_grad, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel); -#else -REGISTER_OP_CUDA_KERNEL( - layer_norm, - ops::LayerNormKernel, - ops::LayerNormKernel, - ops::LayerNormKernel); -REGISTER_OP_CUDA_KERNEL( - layer_norm_grad, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel); -#endif diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h deleted file mode 100644 index 9d70b7cf707437136bf358d31ea6fd4cc0f2a534..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/layer_norm_op.h +++ /dev/null @@ -1,374 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \ - !defined(__OSX__) -#include "paddle/fluid/operators/jit/kernels.h" -#endif -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace platform { -class CPUDeviceContext; -class CUDADeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { - -// Wrap RowwiseMean and ColwiseMean. -// Reuse the cpu codes and replace the gpu codes with cublas_gemv, which is -// significantly faster. Unlike the RowwiseMean and ColwiseMean, the -// implementation only considers 2D. -template -struct RowwiseMean2D { - RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx); - - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, framework::Tensor* vec); -}; - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -class RowwiseMean2D { - public: - RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx) - : left_(left), right_(right) { - framework::DDim ones_dim({right_}); - divisor_.mutable_data(ones_dim, dev_ctx.GetPlace()); - phi::funcs::set_constant(dev_ctx, &divisor_, 1.0 / right); - } - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, framework::Tensor* out) { - phi::funcs::GetBlas(context).GEMV( - false, left_, right_, 1., input.data(), divisor_.data(), 0., - out->data()); - } - - private: - int left_; - int right_; - framework::Tensor divisor_; -}; -#endif - -template -class RowwiseMean2D { - public: - RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx) {} - - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, framework::Tensor* out) { - row_mean_(context, input, out); - } - - private: - phi::funcs::RowwiseMean row_mean_; -}; - -template -struct ColwiseSum2D { - ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx); - - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, framework::Tensor* vec); -}; - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -class ColwiseSum2D { - public: - ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx) - : left_(left), right_(right) { - framework::DDim ones_dim({left_}); - divisor_.mutable_data(ones_dim, dev_ctx.GetPlace()); - phi::funcs::set_constant(dev_ctx, &divisor_, 1.0); - } - - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, framework::Tensor* out) { - phi::funcs::GetBlas(context).GEMV( - true, left_, right_, 1., input.data(), divisor_.data(), 0., - out->data()); - } - - private: - int left_; - int right_; - framework::Tensor divisor_; -}; -#endif - -template -class ColwiseSum2D { - public: - ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx) {} - - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, framework::Tensor* out) { - col_wise_(context, input, out); - } - - private: - phi::funcs::ColwiseSum col_wise_; -}; - -template -struct SubAndSquareFunctor { - inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); } -}; - -template -struct 
DivAndSqrtFunctor { - explicit DivAndSqrtFunctor(T epsilon) { epsilon_ = epsilon; } - inline HOSTDEVICE T operator()(T a, T b) const { - return a / (sqrt(b + epsilon_)); - } - - private: - T epsilon_; -}; - -template -struct MulInvVarFunctor { - inline HOSTDEVICE T operator()(T a, T b) const { - return a * std::sqrt(1.0 / b); - } -}; - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using DataLayout = framework::DataLayout; - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -class LayerNormDirectCUDAFunctor { - public: - void operator()(gpuStream_t stream, const T* input, - std::vector input_shape, const T* bias, const T* scale, - T* output, T* mean, T* variance, int begin_norm_axis, - float eps); -}; -#endif - -template -class LayerNormKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const float epsilon = ctx.Attr("epsilon"); - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto x = *ctx.Input("X"); - - auto* y = ctx.Output("Y"); - auto* mean = ctx.Output("Mean"); - auto* var = ctx.Output("Variance"); - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - - const auto x_dims = x.dims(); - - y->mutable_data(ctx.GetPlace()); - mean->mutable_data(ctx.GetPlace()); - var->mutable_data(ctx.GetPlace()); - - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int left = static_cast(matrix_dim[0]); - int right = static_cast(matrix_dim[1]); - framework::DDim matrix_shape({left, right}); - - x.Resize(matrix_shape); - Tensor out; - out.ShareDataWith(*y); - out.Resize(matrix_shape); - -#if defined(PADDLE_WITH_CUDA) || defined(_WIN32) || defined(__APPLE__) || \ - defined(__OSX__) - auto& dev_ctx = ctx.template device_context(); - RowwiseMean2D row_mean(left, right, ctx.device_context()); - - // get mean - row_mean(dev_ctx, x, mean); - - // get variance - ElementwiseComputeEx, DeviceContext, T>( - ctx, &x, mean, /*axis*/ 0, SubAndSquareFunctor(), &out); - row_mean(dev_ctx, out, var); - - // get x_norm - ElementwiseComputeEx, DeviceContext, T>( - ctx, &x, mean, /*axis*/ 0, SubFunctor(), &out); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &out, var, /*axis*/ 0, - DivAndSqrtFunctor(static_cast(epsilon)), &out); - - if (scale) { - ElementwiseComputeEx, DeviceContext, T>( - ctx, &out, scale, /*axis*/ 1, MulFunctor(), &out); - } - if (bias) { - ElementwiseComputeEx, DeviceContext, T>( - ctx, &out, bias, /*axis*/ 1, AddFunctor(), &out); - } -#else - PADDLE_ENFORCE_EQ(mean->numel(), left, - platform::errors::InvalidArgument( - "mean's length (%d) is not equal with expected (%d).", - mean->numel(), left)); - PADDLE_ENFORCE_EQ(var->numel(), left, - platform::errors::InvalidArgument( - "var's length (%d) is not equal with expected (%d).", - var->numel(), left)); - if (scale) { - PADDLE_ENFORCE_EQ( - scale->numel(), right, - platform::errors::InvalidArgument( - "scale's length (%d) is not equal with expected (%d).", - scale->numel(), right)); - } - if (bias) { - PADDLE_ENFORCE_EQ( - bias->numel(), right, - platform::errors::InvalidArgument( - "bias's length (%d) is not equal with expected (%d).", - bias->numel(), right)); - } - - auto ker = - jit::KernelFuncs, platform::CPUPlace>::Cache() - .At(right); - ker(x.data(), out.data(), mean->data(), var->data(), - scale ? scale->data() : nullptr, bias ? 
bias->data() : nullptr, - static_cast(left), static_cast(epsilon), right); -#endif - } -}; - -template -class LayerNormGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const float epsilon = ctx.Attr("epsilon"); - auto x = *ctx.Input("X"); - auto* mean = ctx.Input("Mean"); - auto* var = ctx.Input("Variance"); - auto* scale = ctx.Input("Scale"); - auto d_y = *ctx.Input(framework::GradVarName("Y")); - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - - // init output - auto* d_x = ctx.Output(framework::GradVarName("X")); - auto* d_scale = ctx.Output(framework::GradVarName("Scale")); - auto* d_bias = ctx.Output(framework::GradVarName("Bias")); - - const auto& x_dims = x.dims(); - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int left = static_cast(matrix_dim[0]); - int right = static_cast(matrix_dim[1]); - framework::DDim matrix_shape({left, right}); - - d_y.Resize(matrix_shape); - auto& dev_ctx = ctx.template device_context(); - ColwiseSum2D colwise_sum(left, right, - ctx.device_context()); - - Tensor temp; - Tensor temp_norm; - if (d_scale || d_x) { - x.Resize(matrix_shape); - temp.mutable_data(matrix_shape, ctx.GetPlace()); - - temp_norm.mutable_data(matrix_shape, ctx.GetPlace()); - // get x_norm - ElementwiseComputeEx, DeviceContext, T>( - ctx, &x, mean, /*axis*/ 0, SubFunctor(), &temp_norm); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &temp_norm, var, /*axis*/ 0, - DivAndSqrtFunctor(static_cast(epsilon)), &temp_norm); - } - - if (d_bias) { - d_bias->mutable_data(ctx.GetPlace()); - colwise_sum(dev_ctx, d_y, d_bias); - } - if (d_scale) { - d_scale->mutable_data(ctx.GetPlace()); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &temp_norm, &d_y, /*axis*/ 0, MulFunctor(), &temp); - colwise_sum(dev_ctx, temp, d_scale); - } - - if (d_x) { - framework::DDim vec_shape({left}); - d_x->mutable_data(ctx.GetPlace()); - auto dx_dim = d_x->dims(); - Tensor temp_vec; - temp_vec.mutable_data(vec_shape, ctx.GetPlace()); - - RowwiseMean2D row_mean(left, right, - ctx.device_context()); - - if (d_scale) { - // dy_dx - ElementwiseComputeEx, DeviceContext, T>( - ctx, &d_y, scale, /*axis*/ 1, MulFunctor(), &temp); - framework::TensorCopy(temp, ctx.GetPlace(), ctx.device_context(), d_x); - - // dy_dmean_dx - row_mean(dev_ctx, temp, &temp_vec); - ElementwiseComputeEx, DeviceContext, T>( - ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor(), d_x); - - // dy_var_dx - ElementwiseComputeEx, DeviceContext, T>( - ctx, &temp, &temp_norm, /*axis*/ 0, MulFunctor(), &temp); - } else { - // dy_dx - framework::TensorCopy(d_y, ctx.GetPlace(), ctx.device_context(), d_x); - - // dy_dmean_dx - row_mean(dev_ctx, d_y, &temp_vec); - ElementwiseComputeEx, DeviceContext, T>( - ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor(), d_x); - - // dy_var_dx - ElementwiseComputeEx, DeviceContext, T>( - ctx, &d_y, &temp_norm, /*axis*/ 0, MulFunctor(), &temp); - } - // dy_var_dx - row_mean(dev_ctx, temp, &temp_vec); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &temp_norm, &temp_vec, /*axis*/ 0, MulFunctor(), &temp); - ElementwiseComputeEx, DeviceContext, T>( - ctx, d_x, &temp, /*axis*/ 0, SubFunctor(), d_x); - - ElementwiseComputeEx, DeviceContext, T>( - ctx, d_x, var, /*axis*/ 0, - DivAndSqrtFunctor(static_cast(epsilon)), d_x); - d_x->Resize(dx_dim); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/layer_norm_op_npu.cc b/paddle/fluid/operators/layer_norm_op_npu.cc index 
c88880b43fff9fccd9764f145fba8ca4c61343c7..3c7e5bf9593e0ae2b3d8c04db1467c3b8fd1e174 100644 --- a/paddle/fluid/operators/layer_norm_op_npu.cc +++ b/paddle/fluid/operators/layer_norm_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/layer_norm_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/layer_norm_op_xpu.cc b/paddle/fluid/operators/layer_norm_op_xpu.cc index 0480a354c8bd8fdb81c95a576f57e9a12019ffc9..3b21a55f8df0dbb532729cf5cbca4c7362223b9c 100644 --- a/paddle/fluid/operators/layer_norm_op_xpu.cc +++ b/paddle/fluid/operators/layer_norm_op_xpu.cc @@ -14,7 +14,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/layer_norm_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/lgamma_op.cc b/paddle/fluid/operators/lgamma_op.cc index 148fb05afcfd9a4ef1fcbc587a2bd33947a41000..72c6b41efa98922b4ba23fa4b6e1a83f931c701e 100644 --- a/paddle/fluid/operators/lgamma_op.cc +++ b/paddle/fluid/operators/lgamma_op.cc @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/lgamma_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -35,16 +38,6 @@ $$out = log\Gamma(x)$$ class LgammaOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Lgamma"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Lgamma"); - - auto in_dims = ctx->GetInputDim("X"); - - ctx->SetOutputDim("Out", in_dims); - ctx->ShareLoD("X", "Out"); - } }; template @@ -83,17 +76,12 @@ class LgammaGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(lgamma, LgammaInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); + REGISTER_OPERATOR(lgamma, ops::LgammaOp, ops::LgammaOpMaker, ops::LgammaGradMaker, - ops::LgammaGradMaker); + ops::LgammaGradMaker, + LgammaInferShapeFunctor); REGISTER_OPERATOR(lgamma_grad, ops::LgammaGradOp); - -REGISTER_OP_CPU_KERNEL( - lgamma, ops::LgammaKernel, - ops::LgammaKernel) - -REGISTER_OP_CPU_KERNEL( - lgamma_grad, - ops::LgammaGradKernel, - ops::LgammaGradKernel); diff --git a/paddle/fluid/operators/lgamma_op.cu b/paddle/fluid/operators/lgamma_op.cu deleted file mode 100644 index b9f273727b00bb5ec4398bf82b0a19737ee2387a..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/lgamma_op.cu +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
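(Illustrative sketch, not part of the patch.) The lgamma_op.cc hunk above replaces LgammaOp::InferShape with a functor built from phi::UnchangedInferMeta. The behaviour being relied on, assumed here rather than copied from phi, is simply that the output metadata mirrors the input, which matches the SetOutputDim/ShareLoD pair that was deleted:

void UnchangedInferMetaSketch(const phi::MetaTensor& x, phi::MetaTensor* out) {
  // Mirrors the removed InferShape: Out takes X's dims/dtype and shares LoD.
  out->share_meta(x);
}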
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/operators/lgamma_op.h" - -namespace paddle { -namespace operators { - -template -struct CudaLgammaFunctor { - __device__ __forceinline__ T operator()(const T x) const { - return Eigen::numext::lgamma(x); - } -}; - -template -class LgammaKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); - Tensor* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - - auto& dev_ctx = context.device_context(); - std::vector ins = {x}; - std::vector outs = {out}; - auto functor = CudaLgammaFunctor(); - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - lgamma, ops::LgammaKernel, - ops::LgammaKernel); - -REGISTER_OP_CUDA_KERNEL( - lgamma_grad, - ops::LgammaGradKernel, - ops::LgammaGradKernel); diff --git a/paddle/fluid/operators/lgamma_op.h b/paddle/fluid/operators/lgamma_op.h deleted file mode 100644 index 674054e74573208ea9bbd537419d202e1a30d8c0..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/lgamma_op.h +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/for_range.h" - -namespace paddle { -namespace operators { - -template -struct LgammaFunctor { - LgammaFunctor(const T* input, T* output, int64_t numel) - : input_(input), output_(output), numel_(numel) {} - - HOSTDEVICE void operator()(int64_t idx) const { - output_[idx] = Eigen::numext::lgamma(input_[idx]); - } - - private: - const T* input_; - T* output_; - int64_t numel_; -}; - -template -struct LgammaGradFunctor { - LgammaGradFunctor(const T* dout, const T* x, T* output, int64_t numel) - : dout_(dout), x_(x), output_(output), numel_(numel) {} - - HOSTDEVICE void operator()(int64_t idx) const { - output_[idx] = dout_[idx] * Eigen::numext::digamma(x_[idx]); - } - - private: - const T* dout_; - const T* x_; - T* output_; - int64_t numel_; -}; - -using Tensor = framework::Tensor; - -template -class LgammaKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); - Tensor* out = context.Output("Out"); - - auto numel = x->numel(); - auto* x_data = x->data(); - auto* out_data = out->mutable_data(context.GetPlace(), - size_t(x->numel() * sizeof(T))); - - auto& dev_ctx = context.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - LgammaFunctor functor(x_data, out_data, numel); - for_range(functor); - } -}; - -template -class LgammaGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - const framework::Tensor* d_out = - ctx.Input(framework::GradVarName("Out")); - const framework::Tensor* x = ctx.Input("X"); - framework::Tensor* d_x = - ctx.Output(framework::GradVarName("X")); - - auto numel = d_out->numel(); - auto* dout_data = d_out->data(); - auto* x_data = x->data(); - auto* dx_data = d_x->mutable_data( - ctx.GetPlace(), static_cast(numel * sizeof(T))); - - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - LgammaGradFunctor functor(dout_data, x_data, dx_data, numel); - for_range(functor); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/log_softmax_op.cc b/paddle/fluid/operators/log_softmax_op.cc index 0e69b397e04c7eda7f515350caf870be5d7b57a5..da38f906b9bd34ba6c3251059ee12902e62eadaf 100644 --- a/paddle/fluid/operators/log_softmax_op.cc +++ b/paddle/fluid/operators/log_softmax_op.cc @@ -12,10 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
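The lgamma kernels removed above apply Eigen's lgamma elementwise in the forward pass and multiply the upstream gradient by digamma(x) in the backward pass. A minimal CPU sketch of the same math; DigammaApprox is an illustration-only central-difference stand-in for the Eigen::numext::digamma the real kernels used:

```cpp
#include <cmath>
#include <vector>

// Illustration-only digamma: central difference of std::lgamma.
static double DigammaApprox(double x) {
  const double h = 1e-6;
  return (std::lgamma(x + h) - std::lgamma(x - h)) / (2.0 * h);
}

// Forward: out[i] = lgamma(x[i]).
void LgammaForward(const std::vector<double>& x, std::vector<double>* out) {
  out->resize(x.size());
  for (size_t i = 0; i < x.size(); ++i) (*out)[i] = std::lgamma(x[i]);
}

// Backward: dx[i] = dout[i] * digamma(x[i]).
void LgammaBackward(const std::vector<double>& dout,
                    const std::vector<double>& x, std::vector<double>* dx) {
  dx->resize(x.size());
  for (size_t i = 0; i < x.size(); ++i)
    (*dx)[i] = dout[i] * DigammaApprox(x[i]);
}
```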
*/ -#include "paddle/fluid/operators/log_softmax_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -24,10 +27,6 @@ class LogSoftmaxOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - return UnaryOpUnchangedInferShapeCheckAxis(ctx); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -123,18 +122,11 @@ class LogSoftmaxGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; - +DECLARE_INFER_SHAPE_FUNCTOR(log_softmax, LogSoftmaxInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMetaCheckAxis)); REGISTER_OPERATOR(log_softmax, ops::LogSoftmaxOp, ops::LogSoftmaxOpMaker, ops::LogSoftmaxOpInferVarType, ops::LogSoftmaxGradOpMaker, - ops::LogSoftmaxGradOpMaker); + ops::LogSoftmaxGradOpMaker, + LogSoftmaxInferShapeFunctor); REGISTER_OPERATOR(log_softmax_grad, ops::LogSoftmaxGradOp); - -REGISTER_OP_CPU_KERNEL( - log_softmax, - ops::LogSoftmaxKernel, - ops::LogSoftmaxKernel); -REGISTER_OP_CPU_KERNEL( - log_softmax_grad, - ops::LogSoftmaxGradKernel, - ops::LogSoftmaxGradKernel); diff --git a/paddle/fluid/operators/log_softmax_op.cu b/paddle/fluid/operators/log_softmax_op.cu deleted file mode 100644 index 8770abdac838f63b0c9f3a95b1ac0283a80ecbf2..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/log_softmax_op.cu +++ /dev/null @@ -1,485 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include "paddle/fluid/operators/log_softmax_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/kernels/funcs/elementwise_functor.h" -#include "paddle/phi/kernels/funcs/functors.h" - -namespace paddle { -namespace operators { - -#define LAUNCH_WARP_FORWAR_COMPUTE(near_greater_power_of_two) \ - case near_greater_power_of_two: \ - ComputeLogSoftmaxForwardInWarp< \ - T, AccT, near_greater_power_of_two><<>>( \ - dst, src, outer_size, dim_size); \ - break; - -template -__device__ __forceinline__ T WarpReduceSum(T value) { -#pragma unroll - for (int offset = KernelWarpSize / 2; offset > 0; offset /= 2) { - T sum_val = platform::CudaShuffleXorSync(0xFFFFFFFF, value, offset); - value = value + sum_val; - } - return value; -} - -template -__device__ __forceinline__ T WarpReduceMax(T value) { -#pragma unroll - for (int offset = KernelWarpSize / 2; offset > 0; offset /= 2) { - T max_val = platform::CudaShuffleXorSync(0xFFFFFFFF, value, offset); - value = max(value, max_val); - } - return value; -} - -int GetNearGreaterPowerOfTwo(int value) { - int log2_value = 0; - while ((1 << log2_value) < value) { - ++log2_value; - } - return 1 << log2_value; -} - -template -__global__ void ComputeLogSoftmaxForwardInWarp(T *dst, const T *src, - int batch_size, - int element_count) { - constexpr int near_greater_power_of_two = NearGreaterPowerOfTwo; - constexpr int kernel_warp_size = - (near_greater_power_of_two < 32) ? near_greater_power_of_two : 32; - constexpr int warp_iter = near_greater_power_of_two / kernel_warp_size; - int batch_id = blockDim.y * blockIdx.x + threadIdx.y; - - int thread_in_warp_idx = threadIdx.x; - - // 1.read data from global memory to registers - AccT elements[warp_iter]; - // set effective_element_count as the num of elements when warps do effective - // work - // set effective_element_count as 0, when warps do ineffective work - int effective_element_count = (batch_id < batch_size) ? element_count : 0; - for (int it = 0; it < warp_iter; ++it) { - int element_index = thread_in_warp_idx + it * kernel_warp_size; - if (element_index < effective_element_count) { - elements[it] = - static_cast(src[batch_id * element_count + element_index]); - } else { - elements[it] = -std::numeric_limits::infinity(); - } - } - - // 2.compute max_value. For each thread, loop all registers to find max - AccT max_value = elements[0]; -#pragma unroll - for (int it = 1; it < warp_iter; ++it) { - max_value = (max_value > elements[it]) ? max_value : elements[it]; - } - max_value = WarpReduceMax(max_value); - - // 3.For each warp, accumulate all thread registers - AccT sum = 0.0f; -#pragma unroll - for (int it = 0; it < warp_iter; ++it) { - sum += std::exp(elements[it] - max_value); - } - sum = WarpReduceSum(sum); - - // 4.store result. - sum = std::log(sum); -#pragma unroll - for (int it = 0; it < warp_iter; ++it) { - int element_index = thread_in_warp_idx + it * kernel_warp_size; - if (element_index < effective_element_count) { - dst[batch_id * element_count + element_index] = - static_cast(elements[it] - max_value - sum); - } else { - break; - } - } -} - -template -void LaunchSoftmaxForwardForLastAxis(T *dst, const T *src, int dim_size, - int outer_size, gpuStream_t stream) { - int threads_per_block = 128; - int near_greater_power_of_two = GetNearGreaterPowerOfTwo(dim_size); - int kernel_warp_size = - (near_greater_power_of_two < 32) ? 
near_greater_power_of_two : 32; - int warps_per_block = (threads_per_block / kernel_warp_size); - int blocks = (outer_size + warps_per_block - 1) / warps_per_block; - dim3 threads(kernel_warp_size, warps_per_block, 1); - - switch (near_greater_power_of_two) { - LAUNCH_WARP_FORWAR_COMPUTE(1); - LAUNCH_WARP_FORWAR_COMPUTE(2); - LAUNCH_WARP_FORWAR_COMPUTE(4); // dim_size: 3~4 - LAUNCH_WARP_FORWAR_COMPUTE(8); // dim_size: 5~8 - LAUNCH_WARP_FORWAR_COMPUTE(16); // dim_size: 9~16 - LAUNCH_WARP_FORWAR_COMPUTE(32); // dim_size: 17~32 - LAUNCH_WARP_FORWAR_COMPUTE(64); // dim_size: 33~64 - LAUNCH_WARP_FORWAR_COMPUTE(128); // dim_size 65~128 - LAUNCH_WARP_FORWAR_COMPUTE(256); // dim_size 129~256 - LAUNCH_WARP_FORWAR_COMPUTE(512); // dim_size 257~512 - LAUNCH_WARP_FORWAR_COMPUTE(1024); // dim_size 513~1024 - - default: - break; - } -} - -// Returns the final item after reduce operation along block.x. -// Firstly, get shared memory(smem) offset, find the starting position for every -// y. -// Secondly, initialise every smem position with value 'val' of thread itself. -// Thirdly, apply standard reduction along x direction as below: -// -// -> x direction -// [o o o o o o o o] time 0 -// | |/ / -// | /| / -// | / | / -// |/ |/ -// [o o o o x x x x] time 1 -// | |/ / -// |/|/ -// [o o x x x x x x] time 2 -// |/ -// [o x x x x x x x] time 3 -// -// Finally, return the first item. -// Imaging multiple reductions executed in paralell along y axis, -// Note that when blockDim.x is not 1, it's a EVEN number in all cases, -// and the size of shared memory is even as well. -template class Functor> -__forceinline__ __device__ T BlockReduceAlongDimX(T *shared, T val) { - Functor func; - // This reduction is not Block-wise reduction, only reduce along block.x. - // therefore the shared mem has offsets for different block.y. - shared += threadIdx.y * blockDim.x; - shared[threadIdx.x] = val; - int offset = blockDim.x / 2; - - while (offset > 0) { - __syncthreads(); - if (threadIdx.x < offset) { - shared[threadIdx.x] = - func(shared[threadIdx.x], shared[threadIdx.x + offset]); - } - offset /= 2; - } - __syncthreads(); - return shared[0]; -} - -template -__global__ void LogSoftmaxForwardCUDAKernelNotLastAxis( - T *output, const T *input, int outer_size, int dim_size, int inner_size) { - extern __shared__ unsigned char smem[]; - auto sdata = reinterpret_cast(smem); - - const int outer_stride = inner_size * dim_size; - const int dim_stride = inner_size; - - for (int x_id = blockIdx.x; x_id < outer_size; x_id += gridDim.x) { - for (int y_id = blockIdx.y * blockDim.y + threadIdx.y; y_id < inner_size; - y_id += blockDim.y * gridDim.y) { - const int data_offset = x_id * outer_stride + y_id; - // When blockDim.x==1, no block.x-reduction opetaions are needed. - // And threadIdx.x is 0 all the time, so the for-loops below are literally - // loops (No parallel executions). Loop all elements along axis and - // calculate the Max, Sum and (input[id]-Max-log(Sum)) to get the final - // log_softmax values along that axis. - // 1. reduce max - AccT max_value = -std::numeric_limits::infinity(); - // For one thread, iterate all items it responsable for, and get - // max_value. - // If there are N threads, N max_value will be returned. 
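The forward kernels above (the warp variant for the last axis and the block variant for other axes) both follow the same three passes per row: reduce the maximum, accumulate the sum of exp(x - max), then write x - max - log(sum). A scalar CPU sketch of the same numerics, with no warp shuffles or shared memory, just to make the reduction structure explicit (LogSoftmaxRow is a hypothetical helper name):

```cpp
#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

// One row of log-softmax:
//   out[i] = x[i] - max(x) - log(sum_j exp(x[j] - max(x)))
std::vector<float> LogSoftmaxRow(const std::vector<float>& x) {
  float max_value = -std::numeric_limits<float>::infinity();
  for (float v : x) max_value = std::max(max_value, v);  // 1. reduce max

  float sum = 0.f;
  for (float v : x) sum += std::exp(v - max_value);      // 2. reduce sum of exp

  const float log_sum = std::log(sum);
  std::vector<float> out(x.size());
  for (size_t i = 0; i < x.size(); ++i)                  // 3. write result
    out[i] = x[i] - max_value - log_sum;
  return out;
}
```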
- for (int d = threadIdx.x; d < dim_size; d += blockDim.x) { - const AccT value = - static_cast(input[data_offset + d * dim_stride]); - max_value = phi::funcs::MaxFunctor()(max_value, value); - } - // If there are more than 1 threads along block x, reduce all max_values - // and get the global max_value, which is the max value along "axis". - // If there is only one thread along block x, no need to reduce, as the - // 'max_value' is the global max_value. - if (blockDim.x > 1) { - max_value = BlockReduceAlongDimX( - sdata, max_value); - } - - // 2. reduce sum - AccT sum = 0; - // Below is the same execution as '1. reduce max' - for (int d = threadIdx.x; d < dim_size; d += blockDim.x) { - sum += std::exp(static_cast(input[data_offset + d * dim_stride]) - - max_value); - } - if (blockDim.x > 1) { - sum = BlockReduceAlongDimX(sdata, sum); - } - - // 3. input-max-log_sum and write to output - for (int d = threadIdx.x; d < dim_size; d += blockDim.x) { - output[data_offset + d * dim_stride] = static_cast( - static_cast(input[data_offset + d * dim_stride]) - max_value - - std::log(sum)); - } - } - } -} - -// block.y covers inner_size. Threads along the x axis process dim_size -// elements, and make sure not to exceed the 1024 threads per block. -// Note that dim_threads namely blockDim.x is either 1 or a even number. -inline dim3 GetBlockSize(int dim_size, int inner_size) { - int inner_threads = inner_size; - inner_threads = std::min(inner_threads, 1024); - int dim_threads = 1; - - while (dim_threads * inner_threads <= 1024 && dim_threads <= dim_size) { - dim_threads *= 2; - } - dim_threads /= 2; - return dim3(dim_threads, inner_threads); -} - -// First cover the y axis as many blocks as possible. -// Then cover the x axis as many blocks as possible, -// and make sure not to exceed the max_active_blocks. -inline dim3 GetGridSize(dim3 block, int max_active_blocks, int outer_size, - int dim_size, int inner_size) { - int inner_blocks = (inner_size + block.y - 1) / block.y; - if (inner_blocks > max_active_blocks) inner_blocks = max_active_blocks; - - int outer_blocks = (max_active_blocks + inner_blocks - 1) / inner_blocks; - if (outer_blocks > outer_size) outer_blocks = outer_size; - return dim3(outer_blocks, inner_blocks); -} - -// When designing grid size and block size, priority is given to block size, -// and grid will be determined according to the maximum number of active blocks, -// which is set by as a experience value. -template -void ComputeLaunchConfigure(Kernel k, int outer_size, int dim_size, - int inner_size, dim3 &grid, dim3 &block, - int &shared_mem, int num_sm) { - block = GetBlockSize(dim_size, inner_size); - int block_threads = block.x * block.y; - shared_mem = block.x == 1 ? 
0 : block_threads * sizeof(T); - int max_active_blocks = num_sm * 2; - grid = - GetGridSize(block, max_active_blocks, outer_size, dim_size, inner_size); -} - -template -void LaunchLogSoftmaxForwardCUDAKernelNotLastAxis(T *output_data, - const T *input_data, - int outer_size, int dim_size, - int inner_size, int num_sm, - gpuStream_t stream) { - int shared_mem; - dim3 grid; - dim3 block; - - ComputeLaunchConfigure( - &LogSoftmaxForwardCUDAKernelNotLastAxis, outer_size, dim_size, - inner_size, grid, block, shared_mem, num_sm); - - LogSoftmaxForwardCUDAKernelNotLastAxis< - T, MPDType><<>>( - output_data, input_data, outer_size, dim_size, inner_size); -} - -template -class LogSoftmaxKernel - : public framework::OpKernel { - using MPDType = typename phi::dtype::MPTypeTrait::Type; - - public: - void Compute(const framework::ExecutionContext &context) const override { - const auto *x = context.Input("X"); - auto *out = context.Output("Out"); - const auto *input_data = x->data(); - auto *output_data = out->mutable_data(context.GetPlace()); - - const int rank = x->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); - - int dim_size = x->dims()[axis]; - int inner_size = 1; - for (int i = axis + 1; i < x->dims().size(); ++i) { - inner_size *= x->dims()[i]; - } - int outer_size = SizeToAxis(axis, x->dims()); - gpuStream_t stream = context.cuda_device_context().stream(); - int num_sm = context.cuda_device_context().GetSMCount(); - - if (inner_size == 1 && dim_size <= 1024 && dim_size * sizeof(T) <= 4096) { - LaunchSoftmaxForwardForLastAxis(output_data, input_data, - dim_size, outer_size, stream); - } else { - LaunchLogSoftmaxForwardCUDAKernelNotLastAxis( - output_data, input_data, outer_size, dim_size, inner_size, num_sm, - stream); - } - } -}; - -// Backward below -#define LAUNCH_WARP_BACKWARD_COMPUTE(near_greater_power_of_two) \ - case near_greater_power_of_two: \ - ComputeLogSoftmaxBackwardInWarp< \ - T, AccT, near_greater_power_of_two><<>>( \ - output, grad_output, grad_input, outer_size, dim_size); \ - break; - -template -__global__ void ComputeLogSoftmaxBackwardInWarp(const T *output, - const T *grad_output, - T *grad_input, int batch_size, - int element_count) { - constexpr int near_greater_power_of_two = NearGreaterPowerOfTwo; - constexpr int kernel_warp_size = - (near_greater_power_of_two < 32) ? near_greater_power_of_two : 32; - constexpr int warp_iter = near_greater_power_of_two / kernel_warp_size; - int batch_id = blockDim.y * blockIdx.x + threadIdx.y; - - int thread_in_warp_idx = threadIdx.x; - - // 1.read data from global memory to registers - AccT output_register[warp_iter]; - AccT grad_output_register[warp_iter]; - int effective_element_count = (batch_id < batch_size) ? element_count : 0; - for (int iter = 0; iter < warp_iter; ++iter) { - int element_index = thread_in_warp_idx + iter * kernel_warp_size; - if (element_index < effective_element_count) { - output_register[iter] = - static_cast(output[batch_id * element_count + element_index]); - grad_output_register[iter] = static_cast( - grad_output[batch_id * element_count + element_index]); - } else { - output_register[iter] = static_cast(0); - grad_output_register[iter] = static_cast(0); - } - } - - // 2. For each warp, accumulate all thread registers - AccT sum = grad_output_register[0]; -#pragma unroll - for (int iter = 1; iter < warp_iter; ++iter) { - sum += grad_output_register[iter]; - } - sum = WarpReduceSum(sum); - -// 3. 
write result in grad_input -#pragma unroll - for (int iter = 0; iter < warp_iter; ++iter) { - int element_index = thread_in_warp_idx + iter * kernel_warp_size; - if (element_index < effective_element_count) { - grad_input[batch_id * element_count + element_index] = static_cast( - (grad_output_register[iter] - std::exp(output_register[iter]) * sum)); - } - } -} - -template -void LaunchSoftmaxBackwardForLastAxis(T *grad_input, const T *grad_output, - const T *output, int dim_size, - int outer_size, gpuStream_t stream) { - int threads_per_block = 128; - int near_greater_power_of_two = GetNearGreaterPowerOfTwo(dim_size); - int kernel_warp_size = - (near_greater_power_of_two < 32) ? near_greater_power_of_two : 32; - int warps_per_block = (threads_per_block / kernel_warp_size); - int blocks = (outer_size + warps_per_block - 1) / warps_per_block; - dim3 threads(kernel_warp_size, warps_per_block, 1); - - switch (near_greater_power_of_two) { - LAUNCH_WARP_BACKWARD_COMPUTE(1); // dim_size: 1 - LAUNCH_WARP_BACKWARD_COMPUTE(2); // dim_size: 2 - LAUNCH_WARP_BACKWARD_COMPUTE(4); // dim_size: 3~4 - LAUNCH_WARP_BACKWARD_COMPUTE(8); // dim_size: 5~8 - LAUNCH_WARP_BACKWARD_COMPUTE(16); // dim_size: 9~16 - LAUNCH_WARP_BACKWARD_COMPUTE(32); // dim_size: 17~32 - LAUNCH_WARP_BACKWARD_COMPUTE(64); // dim_size: 33~64 - LAUNCH_WARP_BACKWARD_COMPUTE(128); // dim_size: 65~128 - LAUNCH_WARP_BACKWARD_COMPUTE(256); // dim_size: 129~256 - LAUNCH_WARP_BACKWARD_COMPUTE(512); // dim_size: 257~512 - LAUNCH_WARP_BACKWARD_COMPUTE(1024); // dim_size: 513~1024 - - default: - break; - } -} - -template -class LogSoftmaxGradKernel - : public framework::OpKernel { - using MPDType = typename phi::dtype::MPTypeTrait::Type; - - public: - void Compute(const framework::ExecutionContext &context) const override { - const auto *out = context.Input("Out"); - const auto *d_out = - context.Input(framework::GradVarName("Out")); - auto *d_x = context.Output(framework::GradVarName("X")); - - const auto *out_data = out->data(); - const auto *d_out_data = d_out->data(); - auto *d_x_data = d_x->mutable_data(context.GetPlace()); - - const int rank = out->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); - - int dim_size = out->dims()[axis]; - int inner_size = 1; - for (int i = axis + 1; i < out->dims().size(); ++i) { - inner_size *= out->dims()[i]; - } - int outer_size = SizeToAxis(axis, out->dims()); - gpuStream_t stream = context.cuda_device_context().stream(); - - if (inner_size == 1 && dim_size <= 1024 && dim_size * sizeof(T) <= 4096) { - LaunchSoftmaxBackwardForLastAxis( - d_x_data, d_out_data, out_data, dim_size, outer_size, stream); - } else { - LogSoftmaxGradFunctor()( - context.template device_context(), out, - d_out, d_x, axis); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - log_softmax, ops::LogSoftmaxKernel, - ops::LogSoftmaxKernel, - ops::LogSoftmaxKernel, - ops::LogSoftmaxKernel); -REGISTER_OP_CUDA_KERNEL( - log_softmax_grad, ops::LogSoftmaxGradKernel, - ops::LogSoftmaxGradKernel, - ops::LogSoftmaxGradKernel, - ops::LogSoftmaxGradKernel); diff --git a/paddle/fluid/operators/log_softmax_op.h b/paddle/fluid/operators/log_softmax_op.h deleted file mode 100644 index 162087a75662d711a63cbbe4beeaecf265367c6a..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/log_softmax_op.h +++ /dev/null @@ -1,197 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -using EigenMatrix = framework::EigenMatrix; - -static inline int CanonicalAxis(const int axis, const int rank) { - if (axis < 0) { - return axis + rank; - } - return axis; -} - -static inline size_t SizeToAxis(const int axis, const framework::DDim dims) { - size_t size = 1; - for (int i = 0; i < axis; i++) { - size *= dims[i]; - } - return size; -} - -static inline size_t SizeFromAxis(const int axis, const framework::DDim dims) { - size_t size = 1; - for (int i = axis; i < dims.size(); i++) { - size *= dims[i]; - } - return size; -} - -template -struct ValueClip { - HOSTDEVICE T operator()(const T& x) const { - const T kThreshold = static_cast(-64.); - return x < kThreshold ? kThreshold : x; - } -}; - -template -struct LogSoftmaxFunctor { - void operator()(const DeviceContext& context, const framework::Tensor* X, - framework::Tensor* Y, const int axis) { - constexpr int kBatchDim = 0; - constexpr int kClassDim = 1; - constexpr int kAxisDim = 1; - - int axis_dim = X->dims()[axis]; - const int n = SizeToAxis(axis, X->dims()); - const int d = SizeFromAxis(axis, X->dims()); - framework::DDim dim_2d{n, d}; - - auto logits = EigenMatrix::From(*X, dim_2d); - auto log_softmax = EigenMatrix::From(*Y, dim_2d); - - const int batch_size = logits.dimension(kBatchDim); - const int num_classes = logits.dimension(kClassDim); - const int num_remain = num_classes / axis_dim; - - Eigen::DSizes along_axis(kAxisDim); - Eigen::DSizes batch_classes(batch_size, num_classes); - Eigen::DSizes batch_by_one(batch_size, 1); - Eigen::DSizes one_by_class(1, num_classes); - Eigen::DSizes batch_one_remain(batch_size, 1, num_remain); - Eigen::DSizes one_axis_one(1, axis_dim, 1); - Eigen::DSizes one_axis(1, axis_dim); - Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); - - // For numerical stability, logits should be shifted by maximum number along - // axis, calculate shifted_logits into log_softmax tensor for memory reuse. 
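The axis helpers defined in the deleted header above (and now provided by phi::funcs, as the NPU hunk further down shows) reduce an N-D tensor to a 2-D [outer, inner] view around the softmax axis. A standalone sketch of that bookkeeping, with hypothetical free functions mirroring the removed ones:

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Wrap a negative axis into [0, rank).
static int CanonicalAxis(int axis, int rank) {
  return axis < 0 ? axis + rank : axis;
}

// Product of dims before `axis` (the outer size).
static size_t SizeToAxis(int axis, const std::vector<int64_t>& dims) {
  size_t size = 1;
  for (int i = 0; i < axis; ++i) size *= static_cast<size_t>(dims[i]);
  return size;
}

// Product of dims from `axis` to the end (the inner size).
static size_t SizeFromAxis(int axis, const std::vector<int64_t>& dims) {
  size_t size = 1;
  for (int i = axis; i < static_cast<int>(dims.size()); ++i)
    size *= static_cast<size_t>(dims[i]);
  return size;
}

// e.g. dims = {8, 16, 32}, axis = -1  ->  axis = 2, outer = 128, inner = 32.
```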
- if (num_remain == 1) { - // axis == -1, axis and class in same dimension, calculate along - // class dimension directly for higher performance - log_softmax.device(*context.eigen_device()) = - (logits - - logits.maximum(along_axis) - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class)) - .unaryExpr(ValueClip()); - } else { - // axis != -1, class dimension split into (axis, remain), max and sum - // should be calculated along axis dimension - log_softmax.device(*context.eigen_device()) = - (logits.reshape(batch_axis_remain) - - logits.reshape(batch_axis_remain) - .maximum(along_axis) - .eval() - .reshape(batch_one_remain) - .broadcast(one_axis_one) - .reshape(batch_classes)) - .unaryExpr(ValueClip()); - } - - log_softmax.device(*context.eigen_device()) = - log_softmax - - log_softmax.exp() - .eval() - .reshape(batch_axis_remain) - .sum(along_axis) - .log() - .broadcast(one_axis); - } -}; - -template -class LogSoftmaxKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* X = context.Input("X"); - auto* Out = context.Output("Out"); - const int rank = X->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); - - // allocate memory on device. - Out->mutable_data(context.GetPlace()); - - if (X->numel() != 0) { - LogSoftmaxFunctor()( - context.template device_context(), X, Out, axis); - } - } -}; - -template -struct LogSoftmaxGradFunctor { - void operator()(const DeviceContext& context, const framework::Tensor* Y, - const framework::Tensor* dY, framework::Tensor* dX, - const int axis) { - constexpr int kBatchDim = 0; - constexpr int kClassDim = 1; - - const int n = SizeToAxis(axis, Y->dims()); - const int d = SizeFromAxis(axis, Y->dims()); - framework::DDim dim_2d{n, d}; - - auto y = EigenMatrix::From(*Y, dim_2d); - auto dy = EigenMatrix::From(*dY, dim_2d); - auto dx = EigenMatrix::From(*dX, dim_2d); - - const int axis_dim = Y->dims()[axis]; - const int batch_size = y.dimension(kBatchDim); - const int num_classes = y.dimension(kClassDim); - const int num_remain = num_classes / axis_dim; - - Eigen::DSizes along_class(kClassDim); - Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); - Eigen::DSizes one_axis(1, axis_dim); - - dx.device(*context.eigen_device()) = - dy - - (y.exp()) * (dy.reshape(batch_axis_remain) - .sum(along_class) - .broadcast(one_axis)); - } -}; - -template -class LogSoftmaxGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* Out = context.Input("Out"); - auto* dOut = - context.Input(framework::GradVarName("Out")); - auto* dX = context.Output(framework::GradVarName("X")); - const int rank = Out->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); - - // allocate memory on device. - dX->mutable_data(context.GetPlace()); - - if (Out->numel() != 0) { - LogSoftmaxGradFunctor()( - context.template device_context(), Out, dOut, dX, - axis); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/log_softmax_op_npu.cc b/paddle/fluid/operators/log_softmax_op_npu.cc index 5795f1dffac785b82662cebb84e8224cec78ecf6..6ce21aec9215a007ac6ca49ee1bffc1a40d40c81 100644 --- a/paddle/fluid/operators/log_softmax_op_npu.cc +++ b/paddle/fluid/operators/log_softmax_op_npu.cc @@ -12,8 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
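The CPU gradient functor above and the CUDA warp backward kernel earlier compute the same formula: dx = dy - exp(y) * sum(dy) along the softmax axis, where y is the forward log-softmax output. A per-row CPU sketch of that formula (LogSoftmaxGradRow is a hypothetical helper name):

```cpp
#include <cmath>
#include <vector>

// One row of the log_softmax gradient:
//   dx[i] = dy[i] - exp(y[i]) * sum_j dy[j]
// where y is the forward log_softmax output for the same row.
std::vector<float> LogSoftmaxGradRow(const std::vector<float>& y,
                                     const std::vector<float>& dy) {
  float dy_sum = 0.f;
  for (float v : dy) dy_sum += v;

  std::vector<float> dx(y.size());
  for (size_t i = 0; i < y.size(); ++i)
    dx[i] = dy[i] - std::exp(y[i]) * dy_sum;
  return dx;
}
```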
-#include "paddle/fluid/operators/log_softmax_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" namespace paddle { namespace operators { @@ -27,7 +28,7 @@ class LogSoftmaxNPUKernel : public framework::OpKernel { auto* X = ctx.Input("X"); auto* Out = ctx.Output("Out"); const int rank = X->dims().size(); - const int axis = CanonicalAxis(ctx.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); Out->mutable_data(ctx.GetPlace()); if (X->numel() != 0) { @@ -47,7 +48,7 @@ class LogSoftmaxGradNPUKernel : public framework::OpKernel { auto* dOut = ctx.Input(framework::GradVarName("Out")); auto* dX = ctx.Output(framework::GradVarName("X")); const int rank = dOut->dims().size(); - const int axis = CanonicalAxis(ctx.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); // allocate memory on device. dX->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index 65297abe3e49b8f0fe85e4102d8f449c2c02788f..88d70d9bb7dae50f9ca0d82ce53896632b8b00ed 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -221,7 +221,7 @@ class LRNOp : public framework::OperatorWithKernel { auto ar = paddle::framework::AttrReader(attrs); const std::string data_format = ar.Get("data_format"); auto dl = framework::StringToDataLayout(data_format); - // Some models may have intentionally set "AnyLayout" for pool + // Some models may have intentionally set "AnyLayout" for lrn // op. Treat this as NCHW (default data_format value) if (dl != framework::DataLayout::kAnyLayout) { return framework::OpKernelType(expected_kernel_type.data_type_, diff --git a/paddle/fluid/operators/lstsq_op.cu b/paddle/fluid/operators/lstsq_op.cu index 92c9857f0b942f00c348a6199ea4b9789b398328..10e2867bf2953f5c6fbc3d50bd8156fa3b0266e9 100644 --- a/paddle/fluid/operators/lstsq_op.cu +++ b/paddle/fluid/operators/lstsq_op.cu @@ -17,9 +17,11 @@ #include #include +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/operators/lstsq_op.h" #include "paddle/fluid/operators/qr_op.h" #include "paddle/fluid/platform/dynload/cusolver.h" +#include "paddle/phi/kernels/triangular_solve_kernel.h" namespace paddle { namespace operators { @@ -70,6 +72,10 @@ class LstsqCUDAKernel : public framework::OpKernel { Tensor tau = dito.Fill(tau_dims_vec, 0); auto tau_data = tau.mutable_data(context.GetPlace()); + using Context = + typename framework::ConvertToPhiContext::TYPE; + auto& phi_dev_ctx = static_cast(dev_ctx); + if (m >= n) { Tensor tmp_x = dito.Transpose(new_x); Tensor tmp_y = dito.Transpose(new_y); @@ -93,8 +99,9 @@ class LstsqCUDAKernel : public framework::OpKernel { Tensor slice_y = dito.Slice(trans_y, {-2}, {0}, {min_mn}); // Step 3, solve R X = Y - triangular_solve(dev_ctx, res_r, slice_y, solution, - true, false, false); + phi::TriangularSolveKernel(phi_dev_ctx, res_r, slice_y, true, + false, false, solution); + } else { auto x_data = new_x.mutable_data(context.GetPlace()); auto y_data = new_y.mutable_data(context.GetPlace()); @@ -105,8 +112,8 @@ class LstsqCUDAKernel : public framework::OpKernel { // Step 2, solve R^H Z = Y Tensor trans_r = dito.Transpose(new_x); - triangular_solve(dev_ctx, trans_r, new_y, solution, - true, true, false); + phi::TriangularSolveKernel(phi_dev_ctx, trans_r, new_y, true, + true, false, solution); // Step 3, X <- Q Z BatchedOrgqr(dev_ctx, 
batch_count, n, n, min_mn, x_data, diff --git a/paddle/fluid/operators/lstsq_op.h b/paddle/fluid/operators/lstsq_op.h index 3cbbc62e7bec92f329535e788f19d439c9341a0e..520722dafcbea3ce8c545389317516cc22f7689f 100644 --- a/paddle/fluid/operators/lstsq_op.h +++ b/paddle/fluid/operators/lstsq_op.h @@ -22,7 +22,6 @@ #include "paddle/fluid/operators/math/matrix_solve.h" #include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/operators/triangular_solve_op.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" diff --git a/paddle/fluid/operators/lu_op.h b/paddle/fluid/operators/lu_op.h index 0c8414edc0f3ea7cb6b214cbbd8976a8457abefc..939558c710a3a2e26523e39f5cf3f2ac9b444cb5 100644 --- a/paddle/fluid/operators/lu_op.h +++ b/paddle/fluid/operators/lu_op.h @@ -15,12 +15,13 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/operators/set_value_op.h" #include "paddle/fluid/operators/svd_helper.h" -#include "paddle/fluid/operators/triangular_solve_op.h" -#include "paddle/fluid/operators/tril_triu_op.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/funcs/tril_triu_compute.h" +#include "paddle/phi/kernels/triangular_solve_kernel.h" namespace paddle { namespace operators { @@ -406,11 +407,12 @@ void LU_Unpack(const DeviceContext& dev_ctx, const framework::Tensor* LU, const auto W = udims[udims.size() - 1]; auto L_dataptr = L->mutable_data(dev_ctx.GetPlace()); platform::ForRange x_for_range(dev_ctx, LU->numel()); - TrilTriuCompute tril_computer(LU->data(), -1, true, H, W, L_dataptr); + phi::funcs::TrilTriuCompute tril_computer(LU->data(), -1, true, H, W, + L_dataptr); x_for_range(tril_computer); - TrilTriuCompute triu_computer(LU->data(), 0, false, H, W, - U->mutable_data(dev_ctx.GetPlace())); + phi::funcs::TrilTriuCompute triu_computer( + LU->data(), 0, false, H, W, U->mutable_data(dev_ctx.GetPlace())); x_for_range(triu_computer); // set L's diagonal 1 @@ -534,15 +536,15 @@ class LUGradKernel : public framework::OpKernel { auto phil_rank = LmHdims.size(); auto phiu_rank = UmHdims.size(); platform::ForRange l_for_range(dev_ctx, phi_L.numel()); - TrilTriuCompute tril_computer(phi_L.data(), -1, true, - LmHdims[phil_rank - 2], - LmHdims[phil_rank - 1], phi_L.data()); + phi::funcs::TrilTriuCompute tril_computer( + phi_L.data(), -1, true, LmHdims[phil_rank - 2], + LmHdims[phil_rank - 1], phi_L.data()); l_for_range(tril_computer); platform::ForRange u_for_range(dev_ctx, phi_U.numel()); - TrilTriuCompute triu_computer(phi_U.data(), 0, false, - UmHdims[phiu_rank - 2], - UmHdims[phiu_rank - 1], phi_U.data()); + phi::funcs::TrilTriuCompute triu_computer( + phi_U.data(), 0, false, UmHdims[phiu_rank - 2], + UmHdims[phiu_rank - 1], phi_U.data()); u_for_range(triu_computer); Tensor_Add(dev_ctx, phi_L, phi_U, &phi); @@ -558,6 +560,11 @@ class LUGradKernel : public framework::OpKernel { framework::Tensor Pmat; Unpack_Pivot(dev_ctx, *P, &Pmat, m, k); + + using Context = + typename framework::ConvertToPhiContext::TYPE; + auto& phi_dev_ctx = static_cast(dev_ctx); + if (m <= n) { if (k < n) { framework::Tensor U_complement, U_grad_complement, phi_complement, @@ -588,8 +595,9 @@ class LUGradKernel : public 
framework::OpKernel { const auto W = phidims[phidims.size() - 1]; platform::ForRange x_for_range(dev_ctx, phi_complement.numel()); - TrilTriuCompute tril_computer(phi_complement.data(), -1, true, H, - W, phi_complement_l.data()); + phi::funcs::TrilTriuCompute tril_computer( + phi_complement.data(), -1, true, H, W, + phi_complement_l.data()); x_for_range(tril_computer); Tensor_Sub(dev_ctx, phi, phi_complement_l, &phi); @@ -608,8 +616,9 @@ class LUGradKernel : public framework::OpKernel { framework::Tensor psi_principal, phi_mH, psi_tmp; Tensor_Conj(dev_ctx, phi, &phi_mH); phi_mH = helper.Transpose(phi_mH); - triangular_solve(dev_ctx, U_narrow, phi_mH, - &psi_principal, true, false, false); + + phi::TriangularSolveKernel( + phi_dev_ctx, U_narrow, phi_mH, true, false, false, &psi_principal); Tensor_Conj(dev_ctx, psi_principal, &psi_principal); psi_principal = helper.Transpose(psi_principal); @@ -623,8 +632,9 @@ class LUGradKernel : public framework::OpKernel { SetValueCompute_dispatch(ctx, &psi, &psi_principal, &psi, axes, &slice_starts, &slice_ends, valuedims, xrank); - triangular_solve(dev_ctx, L_narrow_mH, psi, &psi_tmp, - true, false, true); + + phi::TriangularSolveKernel(phi_dev_ctx, L_narrow_mH, psi, + true, false, true, &psi_tmp); auto mat_dim_p = phi::funcs::CreateMatrixDescriptor(Pmat.dims(), 0, false); @@ -659,8 +669,8 @@ class LUGradKernel : public framework::OpKernel { const auto W = phidims[phidims.size() - 1]; platform::ForRange x_for_range(dev_ctx, phi_complement.numel()); - TrilTriuCompute triu_computer(phi_complement.data(), 0, false, H, W, - phi_complement_u.data()); + phi::funcs::TrilTriuCompute triu_computer( + phi_complement.data(), 0, false, H, W, phi_complement_u.data()); x_for_range(triu_computer); Tensor_Sub(dev_ctx, phi, phi_complement_u, &phi); @@ -675,8 +685,10 @@ class LUGradKernel : public framework::OpKernel { &psi, axes, &slice_starts, &slice_ends, valuedims, xrank); framework::Tensor psi_principal, phi_mH, psi_tmp, U_narrow_mH; - triangular_solve(dev_ctx, L_narrow_mH, phi, - &psi_principal, true, false, true); + + phi::TriangularSolveKernel(phi_dev_ctx, L_narrow_mH, phi, + true, false, true, &psi_principal); + slice_starts[0] = 0; slice_starts[1] = 0; slice_ends[0] = k; @@ -698,8 +710,8 @@ class LUGradKernel : public framework::OpKernel { psi_tmp = helper.Transpose(psi_tmp); Tensor_Conj(dev_ctx, U_narrow, &U_narrow_mH); - triangular_solve(dev_ctx, U_narrow_mH, psi_tmp, &psi, - true, false, false); + phi::TriangularSolveKernel(phi_dev_ctx, U_narrow_mH, psi_tmp, + true, false, false, &psi); *dx = helper.Transpose(psi); } } diff --git a/paddle/fluid/operators/lu_unpack_op.h b/paddle/fluid/operators/lu_unpack_op.h index d2303f2c08da8e98053e314f9756e4e375e27775..e4100867dc685ef68cd01b22ab7972aa8b436a06 100644 --- a/paddle/fluid/operators/lu_unpack_op.h +++ b/paddle/fluid/operators/lu_unpack_op.h @@ -16,7 +16,8 @@ limitations under the License. 
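The LU hunks above (and the lu_unpack hunk that follows) keep the same tril/triu element-wise split, only sourcing the functor from phi::funcs. The operation itself just masks elements above or below a diagonal offset of an H x W matrix. A minimal sketch of that masking, assuming row-major storage and the index convention used by the LU_Unpack calls (TrilTriuSketch is a hypothetical helper name; the real functor is driven through platform::ForRange):

```cpp
#include <cstdint>
#include <vector>

// Keep the lower triangle when `lower` is true (elements with col - row <= diagonal),
// otherwise keep the upper triangle (col - row >= diagonal); zero everything else.
void TrilTriuSketch(const std::vector<double>& in, int diagonal, bool lower,
                    int64_t H, int64_t W, std::vector<double>* out) {
  out->assign(in.size(), 0.0);
  for (int64_t idx = 0; idx < H * W; ++idx) {
    const int64_t row = idx / W, col = idx % W;
    const bool keep = lower ? (col - row <= diagonal) : (col - row >= diagonal);
    if (keep) (*out)[idx] = in[idx];
  }
}

// LU_Unpack above uses diagonal = -1, lower = true for L (strictly lower part)
// and diagonal = 0, lower = false for U (upper part including the diagonal).
```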
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/lu_op.h" -#include "paddle/fluid/operators/tril_triu_op.h" +#include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/tril_triu_compute.h" namespace paddle { namespace operators { @@ -87,7 +88,8 @@ class LU_UnpackGradKernel : public framework::OpKernel { auto W = ldims[ldims.size() - 1]; auto L_dataptr = dl_tril.mutable_data(dev_ctx.GetPlace()); platform::ForRange l_for_range(dev_ctx, dl->numel()); - TrilTriuCompute tril_computer(dl->data(), -1, true, H, W, L_dataptr); + phi::funcs::TrilTriuCompute tril_computer(dl->data(), -1, true, H, W, + L_dataptr); l_for_range(tril_computer); const auto udims = du->dims(); @@ -96,7 +98,8 @@ class LU_UnpackGradKernel : public framework::OpKernel { W = udims[udims.size() - 1]; auto U_dataptr = du_triu.mutable_data(dev_ctx.GetPlace()); platform::ForRange u_for_range(dev_ctx, du->numel()); - TrilTriuCompute triu_computer(du->data(), 0, false, H, W, U_dataptr); + phi::funcs::TrilTriuCompute triu_computer(du->data(), 0, false, H, W, + U_dataptr); u_for_range(triu_computer); auto xdims = dx->dims(); diff --git a/paddle/fluid/operators/masked_select_op.cc b/paddle/fluid/operators/masked_select_op.cc index a6eb535c693b8422a7b066618cbfddeddd751387..1887bbcfb7efdcf43e0cc020d773268312523505 100644 --- a/paddle/fluid/operators/masked_select_op.cc +++ b/paddle/fluid/operators/masked_select_op.cc @@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -21,16 +23,6 @@ class MaskedSelectOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "Input", "MaskedSelect"); - OP_INOUT_CHECK(ctx->HasInput("Mask"), "Input", "Mask", "MaskedSelect"); - OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Out", "MaskedSelect"); - - // output will only be a 1-D Tensor - ctx->SetOutputDim("Y", phi::make_ddim({-1})); - ctx->ShareLoD("X", /*->*/ "Y"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -100,8 +92,13 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(MaskedSelectedGradNoNeedBufferVarsInferer, } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(masked_select, MaksedSelectInferShapeFunctor, + PD_INFER_META(phi::MaskedSelectInferMeta)); + REGISTER_OPERATOR(masked_select, ops::MaskedSelectOp, ops::MaskedSelectOpMaker, ops::MaskedSelectGradOpMaker, - ops::MaskedSelectGradOpMaker); + ops::MaskedSelectGradOpMaker, + MaksedSelectInferShapeFunctor); REGISTER_OPERATOR(masked_select_grad, ops::MaskedSelectOpGrad, ops::MaskedSelectedGradNoNeedBufferVarsInferer); diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 31a98d9f630e1c01f3b886cbe91dd3882b384d05..af1069cb867993160d7346779d7de8161e37438c 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -20,7 +20,6 @@ math_library(sampler DEPS generator) # math_library(math_function DEPS blas dense_tensor tensor) math_library(maxouting) 
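The masked_select hunk above drops a hand-written InferShape whose only job was to pin Y to a 1-D tensor of dynamic size, delegating to phi::MaskedSelectInferMeta instead. The runtime semantics stay the same: the output is a flat vector of the elements where the mask is true, so its length is only known at run time, which is why the static shape was just {-1}. A small sketch of those semantics (MaskedSelect here is a hypothetical standalone function, not the operator itself):

```cpp
#include <vector>

// Output is always 1-D; its length equals the number of true mask entries.
std::vector<float> MaskedSelect(const std::vector<float>& x,
                                const std::vector<bool>& mask) {
  std::vector<float> y;
  for (size_t i = 0; i < x.size() && i < mask.size(); ++i)
    if (mask[i]) y.push_back(x[i]);
  return y;
}
```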
-math_library(pooling) if(WITH_MKLDNN) math_library(selected_rows_functor DEPS selected_rows_utils math_function blas mkldnn_axpy_handler) diff --git a/paddle/fluid/operators/math/inclusive_scan.h b/paddle/fluid/operators/math/inclusive_scan.h index 9994ccc10cb13b2f692b18f16182f6bcdad7efa5..b77e23450360c836ae3efe0a6dc2c77216e660f0 100644 --- a/paddle/fluid/operators/math/inclusive_scan.h +++ b/paddle/fluid/operators/math/inclusive_scan.h @@ -34,10 +34,10 @@ namespace paddle { namespace operators { namespace math { -template +template static void CubInclusiveScan(InputIterator x_iter, OutputIterator y_iter, - size_t n, BinaryOp op, - const platform::CUDADeviceContext &dev_ctx) { + size_t n, BinaryOp op, const Context &dev_ctx) { memory::AllocationPtr allocation; void *temp_storage = nullptr; size_t temp_storage_bytes = 0; @@ -185,11 +185,10 @@ static __global__ void InclusiveScanInnerDimCUDAKernel(const T *x, T *y, } } -template +template static void InclusiveScanInnerDim(const T *x, T *y, size_t outer_dim, size_t inner_dim, T init, BinaryOp op, - bool reverse, - const platform::CUDADeviceContext &dev_ctx) { + bool reverse, const Context &dev_ctx) { constexpr size_t kThreadNumX = 16; constexpr size_t kThreadNumY = 32; @@ -209,10 +208,10 @@ static void InclusiveScanInnerDim(const T *x, T *y, size_t outer_dim, } } -template +template void InclusiveScan(const T *x, T *y, size_t outer_dim, size_t mid_dim, size_t inner_dim, T init, BinaryOp op, bool reverse, - const platform::CUDADeviceContext &dev_ctx) { + const Context &dev_ctx) { if (outer_dim == 0 || mid_dim == 0 || inner_dim == 0) return; if (outer_dim == 1 && inner_dim == 1) { @@ -224,8 +223,7 @@ void InclusiveScan(const T *x, T *y, size_t outer_dim, size_t mid_dim, CubInclusiveScan(x, y, mid_dim, op, dev_ctx); } } else if (inner_dim != 1) { - platform::ForRange for_range( - dev_ctx, outer_dim * inner_dim); + platform::ForRange for_range(dev_ctx, outer_dim * inner_dim); if (reverse) { for_range( InclusiveScanOuterOrMidDimFunctor( diff --git a/paddle/fluid/operators/math/matrix_solve.cc b/paddle/fluid/operators/math/matrix_solve.cc index 883ee9b148654f8621b26942739730426ba7fc7d..7b239b8166644697581d0051f12b6abacc6832fa 100644 --- a/paddle/fluid/operators/math/matrix_solve.cc +++ b/paddle/fluid/operators/math/matrix_solve.cc @@ -34,45 +34,6 @@ class MatrixSolveFunctor { template class MatrixSolveFunctor; template class MatrixSolveFunctor; -template -class TriangularSolveFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor* a, framework::Tensor* b, bool left, - bool upper, bool transpose, bool unitriangular) { - CBLAS_SIDE side = left ? CblasLeft : CblasRight; - CBLAS_UPLO uplo = upper ? CblasUpper : CblasLower; - CBLAS_TRANSPOSE transA = transpose ? CblasTrans : CblasNoTrans; - CBLAS_DIAG diag = unitriangular ? CblasUnit : CblasNonUnit; - - const T* a_data = a->data(); - T* b_data = b->mutable_data(context.GetPlace()); - - int a_dim_size = a->dims().size(); - int b_dim_size = b->dims().size(); - - int M = static_cast(b->dims()[b_dim_size - 2]); - int N = static_cast(b->dims()[b_dim_size - 1]); - auto lda = left ? 
std::max(1, M) : std::max(1, N); - auto ldb = std::max(1, N); - - int batch_size = 1; - auto& a_dim = a->dims(); - for (int i = 0; i < a_dim_size - 2; i++) { - batch_size *= a_dim[i]; - } - - auto blas = phi::funcs::GetBlas(context); - for (int i = 0; i < batch_size; i++) { - blas.TRSM(side, uplo, transA, diag, M, N, T(1), a_data + i * M * M, lda, - b_data + i * N * M, ldb); - } - } -}; - -template class TriangularSolveFunctor; -template class TriangularSolveFunctor; - } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/matrix_solve.cu.cc b/paddle/fluid/operators/math/matrix_solve.cu.cc index d3490ead212731f3fc6a75d61a31c11c72c9129d..737196dde1dfc26269fe083fe17037c829ef8109 100644 --- a/paddle/fluid/operators/math/matrix_solve.cu.cc +++ b/paddle/fluid/operators/math/matrix_solve.cu.cc @@ -161,67 +161,6 @@ class MatrixSolveFunctor { template class MatrixSolveFunctor; template class MatrixSolveFunctor; -template -class TriangularSolveFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, const Tensor* a, - Tensor* b, bool left, bool upper, bool transpose, - bool unitriangular) { - CBLAS_SIDE side = left ? CblasLeft : CblasRight; - CBLAS_UPLO uplo = upper ? CblasUpper : CblasLower; - CBLAS_TRANSPOSE transA = transpose ? CblasTrans : CblasNoTrans; - CBLAS_DIAG diag = unitriangular ? CblasUnit : CblasNonUnit; - - const T* a_data = a->data(); - T* b_data = b->mutable_data(context.GetPlace()); - - int a_dim_size = a->dims().size(); - int b_dim_size = b->dims().size(); - - int M = static_cast(b->dims()[b_dim_size - 2]); - int N = static_cast(b->dims()[b_dim_size - 1]); - auto lda = left ? std::max(1, M) : std::max(1, N); - auto ldb = std::max(1, N); - - int batch_size = 1; - auto& a_dim = a->dims(); - for (int i = 0; i < a_dim_size - 2; i++) { - batch_size *= a_dim[i]; - } - - auto blas = phi::funcs::GetBlas(context); - if (batch_size <= 8 && M >= 64) { - for (auto i = 0; i < batch_size; i++) { - blas.TRSM(side, uplo, transA, diag, M, N, static_cast(1.0), - a_data + i * M * M, lda, b_data + i * N * M, ldb); - } - } else { - std::vector cpu_ptrs(batch_size * 2); - for (int i = 0; i < batch_size; ++i) { - cpu_ptrs[i] = a_data + i * M * M; - cpu_ptrs[i + batch_size] = b_data + i * M * N; - } - - // Copy the addresses of A and tmp_b from host to device. 
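Both deleted TriangularSolveFunctor specializations (CPU and CUDA) ultimately solve triangular systems batch by batch through TRSM, and their call sites are now routed through phi::TriangularSolveKernel as the earlier hunks show. For reference, the core operation behind a single lower-triangular system with one right-hand side is plain forward substitution; a minimal sketch under simplifying assumptions (row-major storage, no batching, no transpose, non-unit diagonal), not the library routine itself:

```cpp
#include <vector>

// Solve L * x = b for x, where L is an n x n lower-triangular matrix
// stored row-major. This is the scalar version of what one TRSM call does.
std::vector<double> ForwardSubstitution(const std::vector<double>& L,
                                        const std::vector<double>& b, int n) {
  std::vector<double> x(n, 0.0);
  for (int i = 0; i < n; ++i) {
    double s = b[i];
    for (int j = 0; j < i; ++j) s -= L[i * n + j] * x[j];
    x[i] = s / L[i * n + i];  // assumes a non-zero (non-unit) diagonal
  }
  return x;
}
```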
- memory::allocation::AllocationPtr tmp_gpu_ptrs_data = - memory::Alloc(context, cpu_ptrs.size() * sizeof(T*)); - memory::Copy(context.GetPlace(), tmp_gpu_ptrs_data->ptr(), - platform::CPUPlace(), static_cast(cpu_ptrs.data()), - cpu_ptrs.size() * sizeof(T*), context.stream()); - - const T** gpu_a_ptrs = - reinterpret_cast(tmp_gpu_ptrs_data->ptr()); - T** gpu_b_ptrs = - reinterpret_cast(tmp_gpu_ptrs_data->ptr()) + batch_size; - blas.BatchedTRSM(side, uplo, transA, diag, M, N, static_cast(1.0), - gpu_a_ptrs, lda, gpu_b_ptrs, ldb, batch_size); - } - } -}; - -template class TriangularSolveFunctor; -template class TriangularSolveFunctor; - } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/matrix_solve.h b/paddle/fluid/operators/math/matrix_solve.h index 1dc43205592f69cc105b43fe49b2f7872f8251c3..415d0c6dd8e0cf51958783c32aa49c66cce9e15c 100644 --- a/paddle/fluid/operators/math/matrix_solve.h +++ b/paddle/fluid/operators/math/matrix_solve.h @@ -117,14 +117,6 @@ class MatrixSolveFunctor { const framework::Tensor& b, framework::Tensor* out); }; -template -class TriangularSolveFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor* a, - framework::Tensor* b, bool left, bool upper, bool transpose, - bool unitriangular); -}; - } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h deleted file mode 100644 index dfd3dad38644b65ef0b5e62e1b54ce210e9c489a..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/math/pooling.h +++ /dev/null @@ -1,315 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/macros.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { -namespace math { - -/* - * \brief Extracting simple operations from pooling. - * Both MaxPool and AvgPool need "initial", "compute" and "finalize" - * operation. - * MaxPool initializes temp variable to the negative maximum to find the - * maximum value in the pooling field. - * AvgPool initializes temp variable to the zero to accumulate all values - * in pool pooling, and finally takes the average. - * MaxPoolGrad and AvgPoolGrad are gradient operations respectively. - */ -template -class MaxPool { - public: - DEVICE inline T initial() { return static_cast(-FLT_MAX); } - DEVICE inline void compute(const T& x, T* y) { *y = *y > x ? 
*y : x; } - DEVICE inline void finalize(const T& pool_field, T* y) {} -}; - -template -class AvgPool { - using MT = typename details::MPTypeTrait::Type; - MT intermediate_res; - - public: - DEVICE inline T initial() { - intermediate_res = static_cast(0.0f); - return static_cast(0); - } - - DEVICE inline void compute(const T& x, T* y) { - intermediate_res += static_cast(x); - } - - DEVICE inline void finalize(const T& pool_field, T* y) { - *y = static_cast(intermediate_res / (static_cast(pool_field))); - } -}; - -template -class MaxPoolGrad { - public: - static constexpr bool use_x = true; - HOSTDEVICE inline void compute(const T& x, const T& y, const T& dy, T scale, - T* dx) { - *dx += dy * static_cast(x == y); - } -}; - -template -class AvgPoolGrad { - public: - static constexpr bool use_x = false; - HOSTDEVICE inline void compute(const T& x, const T& y, const T& dy, T scale, - T* dx) { - *dx += (scale * dy); - } -}; - -/* used for adaptive pool to calculate start and end index of each divided grid - */ -HOSTDEVICE inline int AdaptStartIndex(int ph, int input_size, int output_size) { - return static_cast( - floor(static_cast(ph * input_size) / output_size)); -} - -HOSTDEVICE inline int AdaptEndIndex(int ph, int input_size, int output_size) { - return static_cast( - ceil(static_cast((ph + 1) * input_size) / output_size)); -} - -/* - * \brief Getting pooling results, and calculating gradient. - * - * In pool2d, all Tensors are in NCHW or NHWC format. Where N is batch size, C - * is the number of channels, H and W is the height and width of feature. - * In pool3d, all Tensors are in NCDHW or NDHWC format. Where N is batch size, C - * is the number of channels, D, H and W is the depth, height and width of - * feature. - * - * In max pooling, it is possible that the pooling region has multiple maximum - * elements. In this case, we should compute the gradient of the first maximum - * element. - * This is different from average pooling. So we rewrite the max_pool_grad: - * MaxPool2dGradFunctor, MaxPool3dGradFunctor. 
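The pooling header being deleted here factors every pooling type into three hooks: initial() seeds the accumulator, compute() folds in one element, and finalize() divides by the window size for average pooling; the adaptive-pooling helpers map an output position to its input window. A host-side sketch of both ideas on 1-D data for brevity (AdaptiveAvgPool1D is a hypothetical helper name; the index formulas follow the removed AdaptStartIndex/AdaptEndIndex):

```cpp
#include <cmath>
#include <vector>

// Adaptive pooling: output position ph covers input range [start, end).
static int AdaptStartIndex(int ph, int input_size, int output_size) {
  return static_cast<int>(
      std::floor(static_cast<double>(ph) * input_size / output_size));
}
static int AdaptEndIndex(int ph, int input_size, int output_size) {
  return static_cast<int>(
      std::ceil(static_cast<double>(ph + 1) * input_size / output_size));
}

// 1-D adaptive average pooling using the initial/compute/finalize pattern.
std::vector<float> AdaptiveAvgPool1D(const std::vector<float>& in,
                                     int out_size) {
  std::vector<float> out(out_size);
  const int in_size = static_cast<int>(in.size());
  for (int ph = 0; ph < out_size; ++ph) {
    const int start = AdaptStartIndex(ph, in_size, out_size);
    const int end = AdaptEndIndex(ph, in_size, out_size);
    float acc = 0.f;                                  // initial()
    for (int i = start; i < end; ++i) acc += in[i];   // compute()
    out[ph] = acc / static_cast<float>(end - start);  // finalize()
  }
  return out;
}
```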
- */ -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -class Pool2dDirectCUDAFunctor { - public: - void operator()(const T* input, const std::vector& input_shape, - const std::vector& output_shape, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, T* output, gpuStream_t stream, - PoolProcess pool_compute); -}; -#endif - -template -class Pool2dFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* output, - PoolProcess pool_compute); - - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* output, PoolProcess pool_compute); -}; - -template -class Pool2dGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* input_grad, - PoolProcess pool_compute); - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* input_grad, PoolProcess pool_compute); -}; - -template -class MaxPool2dGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - framework::Tensor* input_grad); - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, framework::Tensor* input_grad); -}; - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -class Pool3dDirectCUDAFunctor { - public: - void operator()(const T* input, const std::vector& input_shape, - const std::vector& output_shape, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, T* output, gpuStream_t stream, - PoolProcess pool_compute); -}; -#endif - -template -class Pool3dFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* output, - PoolProcess pool_compute); - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, 
bool exclusive, bool adaptive,
-                  framework::Tensor* output, PoolProcess pool_compute);
-};
-
-template <typename DeviceContext, typename PoolProcess, typename T>
-class Pool3dGradFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool exclusive,
-                  bool adaptive, framework::Tensor* input_grad,
-                  PoolProcess pool_compute);
-  // overload operator() to support argument data_format
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  const std::string data_format, bool exclusive, bool adaptive,
-                  framework::Tensor* input_grad, PoolProcess pool_compute);
-};
-
-template <typename DeviceContext, typename T>
-class MaxPool3dGradFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  framework::Tensor* input_grad);
-  // overload operator() to support argument data_format
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  const std::string data_format, framework::Tensor* input_grad);
-};
-
-/*
- * \brief Getting max pooling results and corresponding max index, and
- * calculating gradient.
- * In up-sampling-pooling, it is necessary to know max element index.
- * In pool2d, all tensors are in NCHW format. In pool3d, all tensors are in
- * NCDHW format.
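As the comment above notes, up-sampling-style pooling needs the index (mask) of each window's maximum from the forward pass. A tiny 1-D illustration of that idea (hypothetical standalone code, independent of the MaxPool*WithIndex functors declared here): the forward pass records the argmax per window, and the backward pass scatters the incoming gradient back to exactly that position.

#include <cstdio>
#include <vector>

int main() {
  const std::vector<float> x = {1.f, 5.f, 3.f, 2.f, 8.f, 4.f};
  const int ksize = 2, stride = 2;
  const int out_n = static_cast<int>(x.size()) / stride;

  std::vector<float> y(out_n);
  std::vector<int> mask(out_n);  // position in x of each window's maximum
  for (int i = 0; i < out_n; ++i) {
    int best = i * stride;
    for (int k = 1; k < ksize; ++k) {
      const int idx = i * stride + k;
      if (x[idx] > x[best]) best = idx;
    }
    y[i] = x[best];
    mask[i] = best;
  }

  // Backward: each dy[i] flows only to the recorded input position.
  const std::vector<float> dy = {1.f, 1.f, 1.f};
  std::vector<float> dx(x.size(), 0.f);
  for (int i = 0; i < out_n; ++i) dx[mask[i]] += dy[i];

  for (int i = 0; i < out_n; ++i)
    std::printf("y[%d] = %g taken from x[%d]\n", i, y[i], mask[i]);
  return 0;
}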
- */ -template -class MaxPool2dWithIndexFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* output, framework::Tensor* mask); -}; - -template -class MaxPool2dWithIndexGradFunctor { - public: - void operator()(const DeviceContext& context, - const framework::Tensor& output_grad, - const framework::Tensor& mask, const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* input_grad); -}; - -template -class MaxPool3dWithIndexFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* output, framework::Tensor* mask); -}; - -template -class MaxPool3dWithIndexGradFunctor { - public: - void operator()(const DeviceContext& context, - const framework::Tensor& output_grad, - const framework::Tensor& mask, const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* input_grad); -}; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/matrix_power_op.cc b/paddle/fluid/operators/matrix_power_op.cc index cdf204628b638f877c92e35a8941487aa39b5427..56f65340ea999f48702294f912c4354d83990881 100644 --- a/paddle/fluid/operators/matrix_power_op.cc +++ b/paddle/fluid/operators/matrix_power_op.cc @@ -14,8 +14,11 @@ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -23,26 +26,6 @@ namespace operators { class MatrixPowerOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "matrix_power"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "matrix_power"); - auto dims = ctx->GetInputDim("X"); - auto n_dim = dims.size(); - PADDLE_ENFORCE_GE(n_dim, 2, - platform::errors::InvalidArgument( - "The Input(X) should have at least 2 dimensions. 
But " - "received a %d dimension tensor.", - n_dim)); - PADDLE_ENFORCE_EQ(dims[n_dim - 2], dims[n_dim - 1], - platform::errors::InvalidArgument( - "The inner-most 2 dimensions of Input(X) all should " - "be square matrices " - "But received X's shape[-2] = %d and shape[-1] = %d.", - dims[n_dim - 2], dims[n_dim - 1])); - ctx->SetOutputDim("Out", dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class MatrixPowerOpMaker : public framework::OpProtoAndCheckerMaker { @@ -116,9 +99,14 @@ class MatrixPowerGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(matrix_power, MatrixPowerInferShapeFunctor, + PD_INFER_META(phi::MatrixPowerInferMeta)); + REGISTER_OPERATOR(matrix_power, ops::MatrixPowerOp, ops::MatrixPowerOpMaker, ops::MatrixPowerOpInferVarType, ops::MatrixPowerGradOpMaker, - ops::MatrixPowerGradOpMaker); + ops::MatrixPowerGradOpMaker, + MatrixPowerInferShapeFunctor); REGISTER_OPERATOR(matrix_power_grad, ops::MatrixPowerGradOp); diff --git a/paddle/fluid/operators/matrix_rank_op.cc b/paddle/fluid/operators/matrix_rank_op.cc index 1f04875c2203b2af80aa3cb81aaf95fbb0a6fe6c..e7d08b6597360bb0431add6ae63eb99f401c8ce0 100644 --- a/paddle/fluid/operators/matrix_rank_op.cc +++ b/paddle/fluid/operators/matrix_rank_op.cc @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/matrix_rank_op.h" #include #include #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" @@ -70,9 +69,9 @@ class MatrixRankeOp : public framework::OperatorWithKernel { std::vector x_batch_dims_array(max_dim); std::vector tol_dims_array(max_dim); std::vector out_dims_array(max_dim); - GetBroadcastDimsArrays(dim_x_batch, dim_tol, x_batch_dims_array.data(), - tol_dims_array.data(), out_dims_array.data(), - max_dim, axis); + phi::funcs::GetBroadcastDimsArrays( + dim_x_batch, dim_tol, x_batch_dims_array.data(), + tol_dims_array.data(), out_dims_array.data(), max_dim, axis); ctx->SetOutputDim("Out", phi::make_ddim(out_dims_array)); } } else { @@ -115,141 +114,9 @@ class MatrixRankeOpMaker : public framework::OpProtoAndCheckerMaker { } }; -template -void BatchEigenvalues(const T* x_data, T* eigenvalues_data, int batches, - int rows, int cols, int k) { - // Eigen::Matrix API need non-const pointer. - T* input = const_cast(x_data); - int stride = rows * cols; - for (int i = 0; i < batches; i++) { - auto m = Eigen::Map< - Eigen::Matrix>( - input + i * stride, rows, rows); - Eigen::SelfAdjointEigenSolver< - Eigen::Matrix> - eigen_solver(m); - auto eigenvalues = eigen_solver.eigenvalues().cwiseAbs(); - for (int j = 0; j < k; j++) { - *(eigenvalues_data + i * k + j) = eigenvalues[j]; - } - } -} - -template -void BatchSVD(const T* x_data, T* eigenvalues_data, int batches, int rows, - int cols, int k) { - // Eigen::Matrix API need non-const pointer. 
- T* input = const_cast(x_data); - int stride = rows * cols; - Eigen::BDCSVD< - Eigen::Matrix> - svd; - for (int i = 0; i < batches; i++) { - auto m = Eigen::Map< - Eigen::Matrix>( - input + i * stride, rows, cols); - svd.compute(m); - auto res_s = svd.singularValues(); - for (int j = 0; j < k; j++) { - eigenvalues_data[i * k + j] = res_s[j]; - } - } -} - -template -class MatrixRankCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); - auto* x_data = x->data(); - auto* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - bool hermitian = context.Attr("hermitian"); - - auto dim_x = x->dims(); - auto dim_out = out->dims(); - int rows = dim_x[dim_x.size() - 2]; - int cols = dim_x[dim_x.size() - 1]; - int k = std::min(rows, cols); - auto numel = x->numel(); - int batches = numel / (rows * cols); - - bool use_default_tol = context.Attr("use_default_tol"); - const Tensor* atol_tensor = nullptr; - Tensor temp_tensor; - T rtol_T = 0; - if (use_default_tol) { - framework::TensorFromVector(std::vector{0}, - context.device_context(), &temp_tensor); - atol_tensor = &temp_tensor; - rtol_T = std::numeric_limits::epsilon() * std::max(rows, cols); - } else if (context.HasInput("TolTensor")) { - atol_tensor = context.Input("TolTensor"); - } else { - framework::TensorFromVector(std::vector{context.Attr("tol")}, - context.device_context(), &temp_tensor); - atol_tensor = &temp_tensor; - } - - Tensor eigenvalue_tensor; - auto* eigenvalue_data = eigenvalue_tensor.mutable_data( - detail::GetEigenvalueDim(dim_x, k), context.GetPlace()); - if (hermitian) { - BatchEigenvalues(x_data, eigenvalue_data, batches, rows, cols, k); - } else { - BatchSVD(x_data, eigenvalue_data, batches, rows, cols, k); - } - - auto dito_T = - math::DeviceIndependenceTensorOperations( - context); - std::vector max_eigenvalue_shape = - phi::vectorize(detail::RemoveLastDim(eigenvalue_tensor.dims())); - Tensor max_eigenvalue_tensor = - dito_T.ReduceMax(eigenvalue_tensor, max_eigenvalue_shape); - - Tensor temp_rtol_tensor; - framework::TensorFromVector(std::vector{rtol_T}, &temp_rtol_tensor); - Tensor rtol_tensor = dito_T.Mul(temp_rtol_tensor, max_eigenvalue_tensor); - Tensor tol_tensor; - tol_tensor.mutable_data(dim_out, context.GetPlace()); - ElementwiseComputeEx, platform::CPUDeviceContext, - T, T>(context, atol_tensor, &rtol_tensor, -1, - GreaterElementFunctor(), &tol_tensor); - - tol_tensor.Resize(detail::NewAxisDim(tol_tensor.dims(), 1)); - - Tensor compare_result; - compare_result.mutable_data(detail::NewAxisDim(dim_out, k), - context.GetPlace()); - - int axis = -1; - if (eigenvalue_tensor.dims().size() >= tol_tensor.dims().size()) { - ElementwiseComputeEx, - platform::CPUDeviceContext, T, int>( - context, &eigenvalue_tensor, &tol_tensor, axis, - phi::funcs::GreaterThanFunctor(), &compare_result); - } else { - ElementwiseComputeEx, - platform::CPUDeviceContext, T, int>( - context, &eigenvalue_tensor, &tol_tensor, axis, - phi::funcs::LessThanFunctor(), &compare_result); - } - auto dito_int = - math::DeviceIndependenceTensorOperations(context); - std::vector result_shape = phi::vectorize(dim_out); - Tensor result = dito_int.ReduceSum(compare_result, result_shape); - out->ShareDataWith(result); - } -}; - } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(matrix_rank, ops::MatrixRankeOp, ops::MatrixRankeOpMaker); - -REGISTER_OP_CPU_KERNEL(matrix_rank, 
ops::MatrixRankCPUKernel, - ops::MatrixRankCPUKernel); diff --git a/paddle/fluid/operators/matrix_rank_op.cu b/paddle/fluid/operators/matrix_rank_op.cu deleted file mode 100644 index dccd716022d2ab74d3f6aa35aa70780ac4feba16..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/matrix_rank_op.cu +++ /dev/null @@ -1,316 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef PADDLE_WITH_HIP -// HIP not support cusolver -#include -#include -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/operators/matrix_rank_op.h" -#include "paddle/fluid/operators/svd_helper.h" -#include "paddle/fluid/platform/dynload/cusolver.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/compare_functors.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { -namespace detail { -DDim GetUDDim(const DDim& x_dim, int k) { - auto x_vec = phi::vectorize(x_dim); - x_vec[x_vec.size() - 1] = k; - return phi::make_ddim(x_vec); -} - -DDim GetVHDDim(const DDim& x_dim, int k) { - auto x_vec = phi::vectorize(x_dim); - x_vec[x_vec.size() - 2] = k; - return phi::make_ddim(x_vec); -} -} // namespace detail - -template -class MatrixRankGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = - context.template device_context(); - - const Tensor* x = context.Input("X"); - auto* x_data = x->data(); - auto* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - bool hermitian = context.Attr("hermitian"); - - auto dim_x = x->dims(); - auto dim_out = out->dims(); - int rows = dim_x[dim_x.size() - 2]; - int cols = dim_x[dim_x.size() - 1]; - int k = std::min(rows, cols); - auto numel = x->numel(); - int batches = numel / (rows * cols); - - bool use_default_tol = context.Attr("use_default_tol"); - const Tensor* atol_tensor = nullptr; - Tensor temp_tensor; - T rtol_T = 0; - if (use_default_tol) { - framework::TensorFromVector(std::vector{0}, - context.device_context(), &temp_tensor); - atol_tensor = &temp_tensor; - rtol_T = std::numeric_limits::epsilon() * std::max(rows, cols); - } else if (context.HasInput("TolTensor")) { - atol_tensor = context.Input("TolTensor"); - } else { - framework::TensorFromVector(std::vector{context.Attr("tol")}, - context.device_context(), &temp_tensor); - atol_tensor = &temp_tensor; - } - - // Must Copy X once, because the gesvdj will destory the content when exit. 
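For reference, the rank rule implemented by the deleted CPU/GPU kernels boils down to: compute singular values (or absolute eigenvalues when hermitian), form a tolerance by broadcasting atol against rtol * sigma_max (rtol defaults to roughly machine epsilon times max(rows, cols)), and count the values above it. A hedged numeric sketch of just that counting step (illustrative names and numbers, not the Paddle API):

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // Singular values of a nearly rank-2 3x3 matrix (made-up numbers).
  const std::vector<double> sigma = {4.2, 1.7, 1e-12};
  const double atol = 0.0;
  const double rtol = 1e-9;  // stands in for eps * max(rows, cols)
  const double sigma_max = *std::max_element(sigma.begin(), sigma.end());
  const double tol = std::max(atol, rtol * sigma_max);

  int rank = 0;
  for (const double s : sigma)
    if (s > tol) ++rank;
  std::printf("numerical rank = %d (tol = %g)\n", rank, tol);  // rank = 2
  return 0;
}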
- Tensor x_tmp; - paddle::framework::TensorCopy(*x, context.GetPlace(), &x_tmp); - auto info = memory::Alloc(dev_ctx, sizeof(int) * batches); - int* info_ptr = reinterpret_cast(info->ptr()); - - Tensor eigenvalue_tensor; - auto* eigenvalue_data = eigenvalue_tensor.mutable_data( - detail::GetEigenvalueDim(dim_x, k), context.GetPlace()); - if (hermitian) { - SyevjBatched(dev_ctx, batches, rows, x_tmp.data(), eigenvalue_data, - info_ptr); - platform::ForRange for_range( - dev_ctx, eigenvalue_tensor.numel()); - phi::funcs::AbsFunctor functor(eigenvalue_data, eigenvalue_data, - eigenvalue_tensor.numel()); - for_range(functor); - } else { - Tensor U, VH; - auto* u_data = - U.mutable_data(detail::GetUDDim(dim_x, k), context.GetPlace()); - auto* vh_data = - VH.mutable_data(detail::GetVHDDim(dim_x, k), context.GetPlace()); - GesvdjBatched(dev_ctx, batches, cols, rows, k, x_tmp.data(), vh_data, - u_data, eigenvalue_data, info_ptr, 1); - } - - auto dito_T = - math::DeviceIndependenceTensorOperations(context); - std::vector max_eigenvalue_shape = - phi::vectorize(detail::RemoveLastDim(eigenvalue_tensor.dims())); - Tensor max_eigenvalue_tensor = - dito_T.ReduceMax(eigenvalue_tensor, max_eigenvalue_shape); - Tensor temp_rtol_tensor; - framework::TensorFromVector(std::vector{rtol_T}, - context.device_context(), &temp_rtol_tensor); - Tensor rtol_tensor = dito_T.Mul(temp_rtol_tensor, max_eigenvalue_tensor); - Tensor tol_tensor; - tol_tensor.mutable_data(dim_out, context.GetPlace()); - ElementwiseComputeEx, platform::CUDADeviceContext, - T, T>(context, atol_tensor, &rtol_tensor, -1, - GreaterElementFunctor(), &tol_tensor); - - tol_tensor.Resize(detail::NewAxisDim(tol_tensor.dims(), 1)); - - Tensor compare_result; - compare_result.mutable_data(detail::NewAxisDim(dim_out, k), - context.GetPlace()); - int axis = -1; - ElementwiseComputeEx, - platform::CUDADeviceContext, T, int64_t>( - context, &eigenvalue_tensor, &tol_tensor, axis, - phi::funcs::GreaterThanFunctor(), &compare_result); - auto dito_int = - math::DeviceIndependenceTensorOperations(context); - std::vector result_shape = phi::vectorize(dim_out); - Tensor result = dito_int.ReduceSum(compare_result, result_shape); - out->ShareDataWith(result); - } - - void GesvdjBatched(const platform::CUDADeviceContext& dev_ctx, int batchSize, - int m, int n, int k, T* A, T* U, T* V, T* S, int* info, - int thin_UV = 1) const; - - void SyevjBatched(const platform::CUDADeviceContext& dev_ctx, int batchSize, - int n, T* A, T* W, int* info) const; -}; - -template <> -void MatrixRankGPUKernel::GesvdjBatched( - const platform::CUDADeviceContext& dev_ctx, int batchSize, int m, int n, - int k, float* A, float* U, float* V, float* S, int* info, - int thin_UV) const { - // do not compute singular vectors - const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; - gesvdjInfo_t gesvdj_params = NULL; - int lda = m; - int ldu = m; - int ldt = n; - int lwork = 0; - auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgesvdj_bufferSize( - handle, jobz, thin_UV, m, n, A, lda, S, U, ldu, V, ldt, &lwork, - gesvdj_params)); - auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); - float* workspace_ptr = reinterpret_cast(workspace->ptr()); - int stride_A = lda * n; - int stride_U = ldu * (thin_UV ? k : m); - int stride_V = ldt * (thin_UV ? 
k : n); - for (int i = 0; i < batchSize; i++) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgesvdj( - handle, jobz, thin_UV, m, n, A + stride_A * i, lda, S + k * i, - U + stride_U * i, ldu, V + stride_V * i, ldt, workspace_ptr, lwork, - info, gesvdj_params)); - int error_info; - memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, - sizeof(int), dev_ctx.stream()); - PADDLE_ENFORCE_EQ( - error_info, 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info)); - } - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params)); -} - -template <> -void MatrixRankGPUKernel::GesvdjBatched( - const platform::CUDADeviceContext& dev_ctx, int batchSize, int m, int n, - int k, double* A, double* U, double* V, double* S, int* info, - int thin_UV) const { - // do not compute singular vectors - const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; - gesvdjInfo_t gesvdj_params = NULL; - int lda = m; - int ldu = m; - int ldt = n; - int lwork = 0; - auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgesvdj_bufferSize( - handle, jobz, thin_UV, m, n, A, lda, S, U, ldu, V, ldt, &lwork, - gesvdj_params)); - auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); - double* workspace_ptr = reinterpret_cast(workspace->ptr()); - int stride_A = lda * n; - int stride_U = ldu * (thin_UV ? k : m); - int stride_V = ldt * (thin_UV ? k : n); - for (int i = 0; i < batchSize; ++i) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgesvdj( - handle, jobz, thin_UV, m, n, A + stride_A * i, lda, S + k * i, - U + stride_U * i, ldu, V + stride_V * i, ldt, workspace_ptr, lwork, - info, gesvdj_params)); - // check the error info - int error_info; - memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, - sizeof(int), dev_ctx.stream()); - PADDLE_ENFORCE_EQ( - error_info, 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info)); - } - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params)); -} - -template <> -void MatrixRankGPUKernel::SyevjBatched( - const platform::CUDADeviceContext& dev_ctx, int batchSize, int n, float* A, - float* W, int* info) const { - auto handle = dev_ctx.cusolver_dn_handle(); - // Compute eigenvalues only - const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; - // matrix is saved as column-major in cusolver. 
- // numpy and torch use lower triangle to compute eigenvalues, so here use - // upper triangle - cublasFillMode_t uplo = CUBLAS_FILL_MODE_UPPER; - int lda = n; - int stride_A = lda * n; - int lwork = 0; - syevjInfo_t params = NULL; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnCreateSyevjInfo(¶ms)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSsyevj_bufferSize( - handle, jobz, uplo, n, A, lda, W, &lwork, params)); - auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); - float* workspace_ptr = reinterpret_cast(workspace->ptr()); - for (int i = 0; i < batchSize; i++) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSsyevj( - handle, jobz, uplo, n, A + stride_A * i, lda, W + n * i, workspace_ptr, - lwork, info, params)); - - int error_info; - memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, - sizeof(int), dev_ctx.stream()); - PADDLE_ENFORCE_EQ( - error_info, 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: CUSolver eigenvalues is not zero. [%d]", i, - error_info)); - } - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnDestroySyevjInfo(params)); -} - -template <> -void MatrixRankGPUKernel::SyevjBatched( - const platform::CUDADeviceContext& dev_ctx, int batchSize, int n, double* A, - double* W, int* info) const { - auto handle = dev_ctx.cusolver_dn_handle(); - // Compute eigenvalues only - const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; - // upper triangle of A is stored - cublasFillMode_t uplo = CUBLAS_FILL_MODE_UPPER; - int lda = n; - int stride_A = lda * n; - int lwork = 0; - syevjInfo_t params = NULL; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnCreateSyevjInfo(¶ms)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDsyevj_bufferSize( - handle, jobz, uplo, n, A, lda, W, &lwork, params)); - auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); - double* workspace_ptr = reinterpret_cast(workspace->ptr()); - - for (int i = 0; i < batchSize; i++) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDsyevj( - handle, jobz, uplo, n, A + stride_A * i, lda, W + n * i, workspace_ptr, - lwork, info, params)); - int error_info; - memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, - sizeof(int), dev_ctx.stream()); - PADDLE_ENFORCE_EQ( - error_info, 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: CUSolver eigenvalues is not zero. [%d]", i, - error_info)); - } - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnDestroySyevjInfo(params)); -} - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(matrix_rank, ops::MatrixRankGPUKernel, - ops::MatrixRankGPUKernel); -#endif // not PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc index 90e6a36220ab04087cd02abd76f6c3598425573c..2e82b47e8da1c6eb6f4a05fc4f7f356110f9fff1 100644 --- a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc @@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/layer_norm_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +#include "paddle/phi/common/data_type.h" namespace paddle { namespace operators { @@ -139,7 +140,7 @@ class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel { layer_norm_p->execute(astream, args); astream.wait(); - y->set_layout(DataLayout::kMKLDNN); + y->set_layout(phi::DataLayout::kMKLDNN); y->set_format(platform::GetMKLDNNFormat(*dst_memory)); } }; @@ -150,4 +151,5 @@ class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel { // TODO(jczaja): Enable FP32 when performance is good namespace ops = paddle::operators; REGISTER_OP_KERNEL(layer_norm, MKLDNN, ::paddle::platform::CPUPlace, + ops::LayerNormMKLDNNOpKernel, ops::LayerNormMKLDNNOpKernel); diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc index ab02d4cfed9d54f9d168f6088df3e41d3e3e7c54..1078b451c55bae09c1274fe6ce3f45d21574d5e1 100644 --- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc @@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/pool_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace operators { using framework::DataLayout; +using framework::Tensor; using dnnl::memory; using dnnl::pooling_backward; using dnnl::pooling_forward; @@ -83,11 +85,11 @@ class PoolingMKLDNNHandler phi::slice_ddim(input_dims, 2, input_dims.size()); if (global_pooling) { - operators::UpdateKsize(&ksize, data_dims); + phi::funcs::UpdateKernelSize(&ksize, data_dims); } - operators::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm, + data_dims, strides, ksize); const auto src_tz = phi::vectorize(input->dims()); const auto dst_tz = phi::vectorize(output->dims()); @@ -173,11 +175,11 @@ class PoolingMKLDNNHandler framework::DDim data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); if (global_pooling) { - operators::UpdateKsize(&ksize, data_dims); + phi::funcs::UpdateKernelSize(&ksize, data_dims); } - operators::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm, + data_dims, strides, ksize); auto src_tz = phi::vectorize(in_x->dims()); auto diff_src_tz = phi::vectorize(in_x_grad->dims()); diff --git a/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc index bdb4fe1198a8e550cb22cf4d727f6b7da34c28fe..86ecb01c89af7e12cc2a3dbcf91740d65d0ac247 100644 --- a/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc @@ -50,13 +50,8 @@ class PReluMKLDNNHandler if (weights->dims().size() != x->dims().size()) { auto new_weights_dims = std::vector(x->dims().size(), 1); if (mode == "channel") { - if (data_format == "NHWC") { - new_weights_dims[x->dims().size() - 1] = - *std::max_element(weights_dims.begin(), weights_dims.end()); - } else { - new_weights_dims[1] = - *std::max_element(weights_dims.begin(), 
weights_dims.end()); - } + new_weights_dims[1] = + *std::max_element(weights_dims.begin(), weights_dims.end()); } weights_dims = std::move(new_weights_dims); } diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc index e9dadd5ec937cd11c84777a582cc1f7ac9fc3c33..4090d5ffca801512e423b02bfda3dd1a1bc49f03 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc @@ -24,6 +24,7 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); @@ -32,6 +33,8 @@ USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); +PD_DECLARE_KERNEL(softmax, CPU, ALL_LAYOUT); + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc index 9d0062e31388413fd4a441687631faebe8846c6e..0e988557df6262d4f924323084daf062aee75e0c 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc @@ -26,13 +26,14 @@ #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/kernel_registry.h" -USE_OP(pool2d); +USE_OP_ITSELF(pool2d); USE_OP_DEVICE_KERNEL(pool2d, MKLDNN); USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(transpose); USE_OP_DEVICE_KERNEL(transpose, MKLDNN); +PD_DECLARE_KERNEL(pool2d, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(relu, CPU, ALL_LAYOUT); namespace paddle { @@ -97,7 +98,7 @@ TEST(test_pool2d_transpose_nhwc, cpu_place) { TEST(test_pool2d_relu_relu_nhwc, cpu_place) { framework::DDim dims({1, 4, 8, 512}); // NHWC shape - framework::DDim expected_dims({1, 512, 3, 7}); // NHWC expected shape + framework::DDim expected_dims({1, 512, 3, 7}); // NCHW expected shape platform::CPUPlace p; framework::Scope scope; diff --git a/paddle/fluid/operators/mode_op.cc b/paddle/fluid/operators/mode_op.cc index c7fb92cd5107cee12e0995948e320ef3ed616f4d..9c16ccb138f7da56568ce6224dc30deb5bbccb7f 100644 --- a/paddle/fluid/operators/mode_op.cc +++ b/paddle/fluid/operators/mode_op.cc @@ -12,10 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/mode_op.h" #include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { @@ -23,43 +27,6 @@ class ModeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "mode"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "mode"); - OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "mode"); - - auto input_dims = ctx->GetInputDim("X"); - const int& dim_size = input_dims.size(); - int axis = static_cast(ctx->Attrs().Get("axis")); - PADDLE_ENFORCE_EQ( - (axis < dim_size) && (axis >= (-1 * dim_size)), true, - paddle::platform::errors::InvalidArgument( - "the axis of ModeOp must be [-%d, %d), but you set axis is %d", - dim_size, dim_size, axis)); - PADDLE_ENFORCE_GE(input_dims.size(), 1, - paddle::platform::errors::InvalidArgument( - "input of ModeOp must have >= 1d shape")); - if (axis < 0) axis += dim_size; - bool keepdim = ctx->Attrs().Get("keepdim"); - std::vector dimvec; - for (int64_t i = 0; i < axis; i++) { - dimvec.emplace_back(input_dims[i]); - } - if (keepdim) { - dimvec.emplace_back(static_cast(1)); - } - for (int64_t i = axis + 1; i < dim_size; i++) { - dimvec.emplace_back(input_dims[i]); - } - framework::DDim dims = phi::make_ddim(dimvec); - PADDLE_ENFORCE_GE(input_dims.size(), 1, platform::errors::InvalidArgument( - "input shape should >= 1d")); - ctx->SetOutputDim("Out", dims); - ctx->SetOutputDim("Indices", dims); - ctx->ShareLoD("X", "Out"); - ctx->ShareLoD("X", "Indices"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -138,18 +105,11 @@ class ModeGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(mode, ModeInferShapeFunctor, + PD_INFER_META(phi::ModeInferMeta)); REGISTER_OPERATOR(mode, ops::ModeOp, ops::ModeOpMaker, ops::ModeGradOpMaker, - ops::ModeGradOpMaker); -REGISTER_OP_CPU_KERNEL(mode, - ops::ModeCPUKernel, - ops::ModeCPUKernel, - ops::ModeCPUKernel, - ops::ModeCPUKernel); - + ops::ModeGradOpMaker, + ModeInferShapeFunctor); REGISTER_OPERATOR(mode_grad, ops::ModeOpGrad); -REGISTER_OP_CPU_KERNEL( - mode_grad, ops::ModeGradCPUKernel, - ops::ModeGradCPUKernel, - ops::ModeGradCPUKernel, - ops::ModeGradCPUKernel); diff --git a/paddle/fluid/operators/mode_op.cu b/paddle/fluid/operators/mode_op.cu deleted file mode 100644 index 2bacda8afb0eb340c4c8d4068f3013e2adbc7f91..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/mode_op.cu +++ /dev/null @@ -1,232 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/mode_op.h" -#include "paddle/fluid/operators/top_k_function_cuda.h" - -namespace paddle { -namespace operators { - -int ComputeBlockSize(int col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; -} - -template -void getModebySort(const platform::CUDADeviceContext& ctx, - const framework::Tensor* input_tensor, - const int64_t num_cols, const int64_t num_rows, - T* out_tensor, int64_t* indices_tensor) { - framework::Tensor input_tmp; - framework::TensorCopy(*input_tensor, ctx.GetPlace(), &input_tmp); - T* input_tmp_data = input_tmp.mutable_data(ctx.GetPlace()); - input_tmp.Resize(phi::make_ddim({num_rows, num_cols})); - thrust::device_ptr out_tensor_ptr(out_tensor); - thrust::device_ptr indices_tensor_ptr(indices_tensor); - - for (int64_t i = 0; i < num_rows; ++i) { - T* begin = input_tmp_data + num_cols * i; - T* end = input_tmp_data + num_cols * (i + 1); - thrust::device_vector indices_data(num_cols); - thrust::sequence(thrust::device, indices_data.begin(), - indices_data.begin() + num_cols); - thrust::sort_by_key(thrust::device, begin, end, indices_data.begin()); - int unique = 1 + thrust::inner_product(thrust::device, begin, end - 1, - begin + 1, 0, thrust::plus(), - thrust::not_equal_to()); - thrust::device_vector keys_data(unique); - thrust::device_vector cnts_data(unique); - thrust::reduce_by_key(thrust::device, begin, end, - thrust::constant_iterator(1), keys_data.begin(), - cnts_data.begin()); - auto it = thrust::max_element(thrust::device, cnts_data.begin(), - cnts_data.begin() + unique); - T mode = keys_data[it - cnts_data.begin()]; - int64_t counts = cnts_data[it - cnts_data.begin()]; - auto pos = thrust::find(thrust::device, begin, end, mode); - int64_t index = indices_data[pos - begin + counts - 1]; - out_tensor_ptr[i] = static_cast(mode); - indices_tensor_ptr[i] = static_cast(index); - } -} - -template -class ModeOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - int axis = static_cast(ctx.Attr("axis")); - bool keepdim = static_cast(ctx.Attr("keepdim")); - - // get the input dims - const auto& in_dims = input->dims(); - // calcluate the real axis - if (axis < 0) axis += in_dims.size(); - - auto out_dims = output->dims(); - - const T* input_data = input->data(); - T* output_data = output->mutable_data(ctx.GetPlace()); - int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); - - if (axis == in_dims.size() - 1) { - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - const auto& dev_ctx = ctx.cuda_device_context(); - getModebySort(dev_ctx, input, input_width, input_height, output_data, - indices_data); - } else { - std::vector trans_axis; - for (int i = 0; i < axis; i++) 
{ - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(axis); - - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - framework::DDim tmp_out_dim = phi::make_ddim(tmp_out_shape); - output->Resize(tmp_out_dim); - indices->Resize(tmp_out_dim); - } - - framework::DDim trans_shape(in_dims); - framework::DDim trans_out_shape(in_dims); - for (int i = 0; i < trans_axis.size(); i++) { - trans_shape[i] = in_dims[trans_axis[i]]; - trans_out_shape[i] = in_dims[trans_axis[i]]; - } - trans_out_shape[in_dims.size() - 1] = 1; - - // second step, tranpose the input - framework::Tensor trans_input; - trans_input.mutable_data(trans_shape, ctx.GetPlace()); - int ndims = trans_axis.size(); - const auto& dev_ctx = ctx.cuda_device_context(); - TransCompute(ndims, dev_ctx, *input, - &trans_input, trans_axis); - framework::Tensor trans_ind; - int64_t* trans_ind_data = - trans_ind.mutable_data(trans_out_shape, ctx.GetPlace()); - framework::Tensor trans_out; - T* trans_out_data = - trans_out.mutable_data(trans_out_shape, ctx.GetPlace()); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1)); - const int64_t input_width = trans_shape[trans_shape.size() - 1]; - getModebySort(dev_ctx, &trans_input, input_width, input_height, - trans_out_data, trans_ind_data); - // last step, tranpose back the indices and output - TransCompute( - ndims, dev_ctx, trans_ind, indices, trans_axis); - TransCompute(ndims, dev_ctx, trans_out, - output, trans_axis); - if (!keepdim) { - output->Resize(out_dims); - indices->Resize(out_dims); - } - } - } -}; - -template -class ModeOpGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(context.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* x = context.Input("X"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = - context.Output(framework::GradVarName("X")); - int axis = context.Attr("axis"); - - const auto& in_dims = x->dims(); - auto out_dims = indices->dims(); - - if (axis < 0) axis += in_dims.size(); - // allocate the cuda memory for the x_grad - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - const T* out_grad_data = out_grad->data(); - const int64_t* indices_data = indices->data(); - - int pre, n, post; - GetDims(in_dims, axis, &pre, &n, &post); - - // calcluate the block and grid num - auto& dev_ctx = context.cuda_device_context(); - int block_size = ComputeBlockSize(post); - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); - int grid_size = std::min(max_blocks, pre); - AssignGradWithAxis<<>>( - out_grad_data, indices_data, x_grad_data, pre, post, n, 1); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - mode, ops::ModeOpCUDAKernel, - ops::ModeOpCUDAKernel, - ops::ModeOpCUDAKernel, - ops::ModeOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL( - mode_grad, - 
ops::ModeOpGradCUDAKernel, - ops::ModeOpGradCUDAKernel, - ops::ModeOpGradCUDAKernel, - ops::ModeOpGradCUDAKernel); diff --git a/paddle/fluid/operators/mode_op.h b/paddle/fluid/operators/mode_op.h deleted file mode 100644 index 76d356ed16eb3f81b10d541230f49b73fd836543..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/mode_op.h +++ /dev/null @@ -1,317 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/transpose_op.h" - -namespace paddle { -namespace operators { - -template -static void getMode(Type input_height, Type input_width, int input_dim, - const framework::Tensor* input, T* t_out, Type* t_indices) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - std::vector> col_vec; - col_vec.reserve(input_width); - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(j), j)); - } - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(i, j), j)); - } - } - std::sort(col_vec.begin(), col_vec.end(), - [](const std::pair& l, const std::pair& r) { - return (!std::isnan(static_cast(l.first)) && - std::isnan(static_cast(r.first))) || - (l.first < r.first); - }); - T mode = 0; - int64_t indice = 0; - int64_t cur_freq = 0; - int64_t max_freq = 0; - for (int64_t i = 0; i < input_width; ++i) { - ++cur_freq; - if (i == input_width - 1 || (col_vec[i + 1].first != col_vec[i].first)) { - if (cur_freq > max_freq) { - max_freq = cur_freq; - mode = col_vec[i].first; - indice = col_vec[i].second; - } - cur_freq = 0; - } - } - t_out[i] = mode; - t_indices[i] = indice; - } -} - -template -static void ModeAssign(const Type& input_height, const Type& input_width, - const int& input_dim, const framework::Tensor* input, - const framework::Tensor* indices, T* output_data) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - auto e_indices = framework::EigenVector::Flatten(*indices); - output_data[i * input_width + e_indices(0)] = e_input(0); - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - auto e_indices = - framework::EigenMatrix::Reshape(*indices, input_dim - 1); - output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0); - } - } -} - -template -class ModeCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - auto* indices = context.Output("Indices"); - const auto& in_dims = 
input->dims(); - bool keepdim = static_cast(context.Attr("keepdim")); - - // axis < 0, cacluate the real axis - int axis = static_cast(context.Attr("axis")); - if (axis < 0) axis += in_dims.size(); - - T* output_data = output->mutable_data(context.GetPlace()); - int64_t* indices_data = indices->mutable_data(context.GetPlace()); - auto out_dims = output->dims(); - // if axis is not the last dim, transpose it to the last dim, do the - // calculation, - // then tranpose it back to orginal axis. - if (axis == in_dims.size() - 1) { - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - getMode(input_height, input_width, in_dims.size(), input, - output_data, indices_data); - } else { - std::vector trans_axis; - for (int i = 0; i < axis; i++) { - trans_axis.emplace_back(i); - } - trans_axis.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(axis); - - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - framework::DDim tmp_out_dim = phi::make_ddim(tmp_out_shape); - output->Resize(tmp_out_dim); - indices->Resize(tmp_out_dim); - } - - // get the trans input_dims, out_dims - framework::DDim trans_shape(in_dims); - framework::DDim trans_out_shape(in_dims); - - for (size_t i = 0; i < trans_axis.size(); i++) { - trans_shape[i] = in_dims[trans_axis[i]]; - trans_out_shape[i] = in_dims[trans_axis[i]]; - } - trans_out_shape[in_dims.size() - 1] = 1; - - framework::Tensor trans_input; - trans_input.mutable_data(trans_shape, context.GetPlace()); - int ndims = trans_axis.size(); - auto& dev_context = - context.template device_context(); - - // transpose the input value - TransCompute(ndims, dev_context, *input, - &trans_input, trans_axis); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1)); - const int64_t input_width = trans_shape[trans_shape.size() - 1]; - framework::Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_out_shape, context.GetPlace()); - framework::Tensor tmp_indices; - auto* t_ind = tmp_indices.mutable_data(trans_out_shape, - context.GetPlace()); - - getMode(input_height, input_width, in_dims.size(), - &trans_input, t_out, t_ind); - // transpose back - TransCompute( - ndims, dev_context, tmp_indices, indices, trans_axis); - TransCompute(ndims, dev_context, tmp_out, - output, trans_axis); - if (!keepdim) { - output->Resize(out_dims); - indices->Resize(out_dims); - } - } - } -}; - -template -class ModeGradCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = - context.Output(framework::GradVarName("X")); - int axis = static_cast(context.Attr("axis")); - bool keepdim = static_cast(context.Attr("keepdim")); - - auto in_dims = x->dims(); - auto out_dims = indices->dims(); - - // axis < 0, get the real axis - axis = (axis < 0) ? 
(in_dims.size() + axis) : axis; - - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(out_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(out_dims[i - 1]); - } - out_dims = phi::make_ddim(tmp_out_shape); - } - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - if (axis == in_dims.size() - 1) { - // allocate the memory for the input_grad - // assign the out_grad to input_grad directly - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - - // init the output grad with 0, because some input elements has no grad - memset(x_grad_data, 0, x_grad->numel() * sizeof(T)); - // Assign the output_grad to input_grad - if (keepdim) { - ModeAssign(input_height, input_width, in_dims.size(), out_grad, indices, - x_grad_data); - } else { - auto& dev_context = - context.template device_context(); - framework::Tensor out_grad_tmp; - framework::Tensor indices_tmp; - out_grad_tmp.mutable_data(out_grad->dims(), dev_context.GetPlace()); - indices_tmp.mutable_data(indices->dims(), - dev_context.GetPlace()); - framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context, - &out_grad_tmp); - framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context, - &indices_tmp); - out_grad_tmp.Resize(out_dims); - indices_tmp.Resize(out_dims); - ModeAssign(input_height, input_width, in_dims.size(), &out_grad_tmp, - &indices_tmp, x_grad_data); - } - } else { - // can not assign grad to input_grad, must do the transpose - std::vector trans_axis; - for (int i = 0; i < axis; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(out_dims.size() - 1); - for (int i = axis + 1; i < out_dims.size() - 1; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(axis); - framework::DDim trans_shape(out_dims); - framework::DDim trans_in_shape(in_dims); - for (size_t i = 0; i < trans_axis.size(); i++) { - trans_shape[i] = out_dims[trans_axis[i]]; - trans_in_shape[i] = in_dims[trans_axis[i]]; - } - // transpose the out_grad, indices - framework::Tensor trans_dO; - trans_dO.mutable_data(trans_shape, context.GetPlace()); - framework::Tensor trans_ind; - trans_ind.mutable_data(trans_shape, context.GetPlace()); - int ndims = trans_axis.size(); - auto& dev_context = - context.template device_context(); - - if (keepdim) { - // Do transpose - TransCompute( - ndims, dev_context, *out_grad, &trans_dO, trans_axis); - TransCompute( - ndims, dev_context, *indices, &trans_ind, trans_axis); - } else { - framework::Tensor out_grad_tmp; - framework::Tensor indices_tmp; - out_grad_tmp.mutable_data(out_grad->dims(), dev_context.GetPlace()); - indices_tmp.mutable_data(indices->dims(), - dev_context.GetPlace()); - framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context, - &out_grad_tmp); - framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context, - &indices_tmp); - out_grad_tmp.Resize(out_dims); - indices_tmp.Resize(out_dims); - // Do transpose - TransCompute( - ndims, dev_context, out_grad_tmp, &trans_dO, trans_axis); - TransCompute( - ndims, dev_context, indices_tmp, &trans_ind, trans_axis); - } - const int64_t input_height = phi::product( - phi::slice_ddim(trans_in_shape, 0, trans_in_shape.size() - 1)); - const int64_t input_width = trans_in_shape[trans_in_shape.size() - 1]; - - // Assign the out_grad to tranpose input_grad - framework::Tensor 
tmp_out; - T* t_out = tmp_out.mutable_data(trans_in_shape, context.GetPlace()); - memset(t_out, 0, x_grad->numel() * sizeof(T)); - - ModeAssign(input_height, input_width, in_dims.size(), - &trans_dO, &trans_ind, t_out); - - // Transpose back - TransCompute(ndims, dev_context, tmp_out, - x_grad, trans_axis); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/multi_dot_op.cc b/paddle/fluid/operators/multi_dot_op.cc index b309e1b87ef9033bd4302cdad4ea60a64cbf02eb..5b107ce643df33af79230c30d784d1ad84c26666 100644 --- a/paddle/fluid/operators/multi_dot_op.cc +++ b/paddle/fluid/operators/multi_dot_op.cc @@ -16,77 +16,19 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/operators/utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" #include "paddle/phi/kernels/funcs/blas/blas.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; -/** - * @brief compute the output shape and check the input shape valid or not - */ -inline framework::DDim ComputeAndCheckShape( - const bool is_runtime, const std::vector& inputs_dims) { - const size_t n = inputs_dims.size(); - auto first_dim = inputs_dims[0]; - - bool is_vector = false; - framework::DDim out_dim; - - PADDLE_ENFORCE_LT( - first_dim.size(), static_cast(3), - platform::errors::InvalidArgument( - "multi_dot: the first input tensor must be 1D or 2D but got[%d]!", - static_cast(first_dim.size()))); - - // If the first tensor is 1D of size n view it as a row vector (1, n) - if (first_dim.size() == 1) { - first_dim = phi::make_ddim({1, static_cast(first_dim[0])}); - is_vector = true; - } - - auto last_dim = inputs_dims[n - 1]; - PADDLE_ENFORCE_LT( - last_dim.size(), static_cast(3), - platform::errors::InvalidArgument( - "the last input tensor of multi_dot must be 1D or 2D but got[%d]!", - static_cast(first_dim.size()))); - - // If the last tensor is 1D of size n view it as a column vector (n, 1) - if (last_dim.size() == 1) { - last_dim = phi::make_ddim({static_cast(last_dim[0]), 1}); - out_dim = is_vector ? phi::make_ddim({1}) : phi::make_ddim({first_dim[0]}); - } else { - out_dim = is_vector ? phi::make_ddim({last_dim[1]}) - : phi::make_ddim({first_dim[0], last_dim[1]}); - } - - auto width = first_dim[1]; - for (size_t i = 1; i < n - 1; i++) { - PADDLE_ENFORCE_EQ(inputs_dims[i].size(), static_cast(2), - platform::errors::InvalidArgument( - "the input tensor of multi_dot op must be 2D.")); - - const auto& tmp_dim = inputs_dims[i]; - PADDLE_ENFORCE_EQ( - tmp_dim[0], width, - platform::errors::InvalidArgument( - "the input matrix does not meet the multiplication requirements.")); - width = tmp_dim[1]; - } - - PADDLE_ENFORCE_EQ( - last_dim[0], width, - platform::errors::InvalidArgument( - "the input matrix does not meet the multiplication requirements.")); - - return out_dim; -} - class MultiDotOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -105,22 +47,6 @@ If the first argument is 1-D it is treated as a row vector. 
If the last argument class MultiDotOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "multi_dot"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "multi_dot"); - - auto inputs_dims = ctx->GetInputsDim("X"); - - const size_t inputs_num = inputs_dims.size(); - PADDLE_ENFORCE_GT( - inputs_num, static_cast(1), - platform::errors::InvalidArgument( - "The number of input tensors in multi_dot op should > 1.")); - auto out_dims = ComputeAndCheckShape(ctx->IsRuntime(), inputs_dims); - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", "Out"); - } }; class MultiDotOpGrad : public framework::OperatorWithKernel { @@ -171,9 +97,15 @@ class MultiDotOpDoubleGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(multi_dot, MultiDotInferShapeFunctor, + PD_INFER_META(phi::MultiDotInferMeta)); + REGISTER_OPERATOR(multi_dot, ops::MultiDotOp, ops::MultiDotOpMaker, ops::MultiDotOpGradMaker, - ops::MultiDotOpGradMaker); + ops::MultiDotOpGradMaker, + MultiDotInferShapeFunctor); + REGISTER_OPERATOR(multi_dot_grad, ops::MultiDotOpGrad, ops::MultiDotOpDoubleGradMaker, ops::MultiDotOpDoubleGradMaker); diff --git a/paddle/fluid/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc index 313a479ea301bb2c7dac0d0a27ca6064de99536a..8771a6573cba044d182aced752d3a65c446ad32e 100644 --- a/paddle/fluid/operators/multiplex_op.cc +++ b/paddle/fluid/operators/multiplex_op.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/multiplex_op.h" #include #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -169,15 +169,3 @@ REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, ops::MultiplexGradMaker, ops::MultiplexGradMaker); REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp); -REGISTER_OP_CPU_KERNEL( - multiplex, - ops::MultiplexCPUKernel, - ops::MultiplexCPUKernel, - ops::MultiplexCPUKernel, - ops::MultiplexCPUKernel); -REGISTER_OP_CPU_KERNEL( - multiplex_grad, - ops::MultiplexGradCPUKernel, - ops::MultiplexGradCPUKernel, - ops::MultiplexGradCPUKernel, - ops::MultiplexGradCPUKernel); diff --git a/paddle/fluid/operators/multiplex_op.cu b/paddle/fluid/operators/multiplex_op.cu deleted file mode 100644 index 0a32ee96fb6938157364dc717724ce9193286f27..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/multiplex_op.cu +++ /dev/null @@ -1,117 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/multiplex_op.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class MultiplexGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto ins = ctx.MultiInput("X"); - auto* ids = ctx.Input("Ids"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - for (size_t i = 0; i < ins.size(); ++i) { - PADDLE_ENFORCE_GT( - ins[i]->numel(), 0, - platform::errors::OutOfRange( - "indexing will be out of bounds with size 0 for the %d-th input.", - i)); - } - - auto rows = ins[0]->dims()[0]; - auto cols = ins[0]->numel() / rows; - // copy index to cpu - Tensor index_t_cpu; - paddle::framework::TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu); - auto* index = index_t_cpu.data(); - auto stream = ctx.cuda_device_context().stream(); - platform::CUDAPlace place = ctx.GetPlace(); - for (auto i = 0; i < rows; i++) { - int32_t k = index[i]; - PADDLE_ENFORCE_GE(k, 0, platform::errors::PreconditionNotMet( - "index must be nonnegative.")); - PADDLE_ENFORCE_LT(static_cast(k), ins.size(), - platform::errors::PreconditionNotMet( - "index exceeds the number of candidate tensors.")); - memory::Copy(place, out->data() + i * cols, place, - ins[k]->data() + i * cols, cols * sizeof(T), stream); - } - } -}; - -template -class MultiplexGradGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto* ids = ctx.Input("Ids"); - auto d_ins = ctx.MultiOutput(framework::GradVarName("X")); - - size_t idx = -1UL; - for (size_t i = 0; i < d_ins.size(); i++) { - if (d_ins[i]) { - d_ins[i]->mutable_data(ctx.GetPlace()); - auto t = framework::EigenVector::Flatten(*d_ins[i]); - t.device(*ctx.template device_context().eigen_device()) = - t.constant(static_cast(0)); - - idx = i; - } - } - - if (idx == -1UL) return; - - auto rows = d_ins[idx]->dims()[0]; - auto cols = d_ins[idx]->numel() / rows; - // copy index to cpu - Tensor index_t_cpu; - paddle::framework::TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu); - auto* index = index_t_cpu.data(); - - auto stream = ctx.cuda_device_context().stream(); - platform::CUDAPlace place = ctx.GetPlace(); - for (auto i = 0; i < rows; i++) { - size_t k = static_cast(index[i]); - if (d_ins[k]) { - memory::Copy(place, d_ins[k]->data() + i * cols, place, - d_out->data() + i * cols, cols * sizeof(T), stream); - } - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - multiplex, - ops::MultiplexGPUKernel, - ops::MultiplexGPUKernel, - ops::MultiplexGPUKernel, - ops::MultiplexGPUKernel); -REGISTER_OP_CUDA_KERNEL( - multiplex_grad, - ops::MultiplexGradGPUKernel, - ops::MultiplexGradGPUKernel, - ops::MultiplexGradGPUKernel, - ops::MultiplexGradGPUKernel); diff --git a/paddle/fluid/operators/multiplex_op.h b/paddle/fluid/operators/multiplex_op.h deleted file mode 100644 index 1d0a009edeedcad746853bb286af52cce474df87..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/multiplex_op.h +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/memory/memcpy.h" - -namespace paddle { -namespace operators { - -template -class MultiplexCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto ins = ctx.MultiInput("X"); - auto ids = ctx.Input("Ids"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - - for (size_t i = 0; i < ins.size(); ++i) { - PADDLE_ENFORCE_GT( - ins[i]->numel(), 0, - platform::errors::OutOfRange( - "indexing will be out of bounds with size 0 for the %d-th input.", - i)); - } - - auto rows = ins[0]->dims()[0]; - auto cols = ins[0]->numel() / rows; - auto index = ids->data(); - platform::CPUPlace place = ctx.GetPlace(); - for (auto i = 0; i < rows; i++) { - int32_t k = index[i]; - PADDLE_ENFORCE_GE(k, 0, platform::errors::PreconditionNotMet( - "index must be nonnegative.")); - PADDLE_ENFORCE_LT(static_cast(k), ins.size(), - platform::errors::PreconditionNotMet( - "index exceeds the number of candidate tensors.")); - memory::Copy(place, out->data() + i * cols, place, - ins[k]->data() + i * cols, cols * sizeof(T)); - } - } -}; - -template -class MultiplexGradCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto* ids = ctx.Input("Ids"); - auto d_ins = - ctx.MultiOutput(framework::GradVarName("X")); - - size_t idx = -1UL; - for (size_t i = 0; i < d_ins.size(); i++) { - if (d_ins[i]) { - d_ins[i]->mutable_data(ctx.GetPlace()); - auto t = framework::EigenVector::Flatten(*d_ins[i]); - t.device(*ctx.template device_context().eigen_device()) = - t.constant(static_cast(0)); - - idx = i; - } - } - - if (idx == -1UL) return; - - auto rows = d_ins[idx]->dims()[0]; - auto cols = d_ins[idx]->numel() / rows; - auto* index = ids->data(); - platform::CPUPlace place = ctx.GetPlace(); - for (auto i = 0; i < rows; i++) { - size_t k = static_cast(index[i]); - if (d_ins[k]) { - memory::Copy(place, d_ins[k]->data() + i * cols, place, - d_out->data() + i * cols, cols * sizeof(T)); - } - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc index 5d394424d54f5291df2855041d5d7f943dbd43d0..51daccce0e8822a1eec25ac428e5a56c632805e2 100644 --- a/paddle/fluid/operators/norm_op.cc +++ b/paddle/fluid/operators/norm_op.cc @@ -15,7 +15,9 @@ limitations under the License. */ #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -57,21 +59,7 @@ where, $\sum {x^2}$ is calculated along the `axis` dimension. 
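The multiplex CPU and GPU kernels deleted above implement the same row-selection rule (presumably re-homed in phi kernels elsewhere in this change). A minimal host-side sketch of that rule on plain arrays, for illustration only:

#include <cstdint>
#include <cstring>
#include <vector>

// Multiplex forward: row i of the output is copied from row i of the candidate
// tensor selected by ids[i]. All candidates share the same shape.
void MultiplexRows(const std::vector<const float*>& candidates,
                   const int32_t* ids, int64_t rows, int64_t cols, float* out) {
  for (int64_t i = 0; i < rows; ++i) {
    const int32_t k = ids[i];  // must satisfy 0 <= k < candidates.size()
    std::memcpy(out + i * cols, candidates[k] + i * cols, cols * sizeof(float));
  }
}

// The gradient scatters rows of d_out back to the selected candidate's grad,
// after those grads have been zero-initialized (as the deleted kernels did).
void MultiplexRowsGrad(const float* d_out, const int32_t* ids, int64_t rows,
                       int64_t cols, std::vector<float*>* d_candidates) {
  for (int64_t i = 0; i < rows; ++i) {
    float* dst = (*d_candidates)[ids[i]];
    if (dst != nullptr)
      std::memcpy(dst + i * cols, d_out + i * cols, cols * sizeof(float));
  }
}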
}; class NormOp : public framework::OperatorWithKernel { - public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "NormOp"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "NormOp"); - auto xdim = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", xdim); - - if (ctx->Attrs().Get("is_test") == false) { - int axis = ctx->Attrs().Get("axis"); - if (axis < 0) axis = xdim.size() + axis; - xdim[axis] = 1; - ctx->SetOutputDim("Norm", xdim); - } - } }; class NormOpGrad : public framework::OperatorWithKernel { @@ -111,7 +99,11 @@ class NormOpGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; +DECLARE_INFER_SHAPE_FUNCTOR(norm, NormInferShapeFunctor, + PD_INFER_META(phi::NormInferMeta)); + REGISTER_OPERATOR(norm, ops::NormOp, ops::NormOpMaker, ops::NormOpGradOpMaker, - ops::NormOpGradOpMaker); + ops::NormOpGradOpMaker, + NormInferShapeFunctor); REGISTER_OPERATOR(norm_grad, ops::NormOpGrad); diff --git a/paddle/fluid/operators/number_count_op.cc b/paddle/fluid/operators/number_count_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8f7a3b82acf19fa79cbf5c632977e6ae533ae12b --- /dev/null +++ b/paddle/fluid/operators/number_count_op.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
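The shape rule that used to live in NormOp::InferShape is now expected from phi::NormInferMeta. A short sketch of the expected shapes, illustration only: Out keeps X's shape, and when is_test is false, Norm keeps X's shape with the normalized axis collapsed to 1.

#include <cstdint>
#include <vector>

// Norm output shapes: Out has the same shape as X; Norm reduces the given axis.
std::vector<int64_t> NormNormShape(std::vector<int64_t> x_dims, int axis) {
  if (axis < 0) axis += static_cast<int>(x_dims.size());
  x_dims[axis] = 1;  // e.g. X = [2, 3, 4], axis = -2  ->  Norm = [2, 1, 4]
  return x_dims;
}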
+ +#include "paddle/fluid/operators/number_count_op.h" + +namespace paddle { +namespace operators { + +class NumberCountOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("gate_idx"), "Input", "gate_idx", + "NumberCount"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "number_count", + "NumberCount"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + // the dtype of the gate_idx should be same as int64 + auto gate_idx_dtype = + OperatorWithKernel::IndicateVarDataType(ctx, "gate_idx"); + + PADDLE_ENFORCE_EQ(gate_idx_dtype, framework::proto::VarType::INT64, + platform::errors::InvalidArgument( + "The dtype of the gate_idx_dtype should be int64")); + return framework::OpKernelType(gate_idx_dtype, ctx.GetPlace()); + } +}; + +class NumberCountOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("gate_idx", "(Tensor) The input gate index tensor."); + AddOutput("Out", "(Tensor) The output expert count tensor."); + AddAttr("upper_range", "(int), The number of experts."); + + AddComment(R"DOC(number_count Operator.count gate indices.)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CPU_KERNEL(number_count, ops::NumberCountOpCPUKernel, + ops::NumberCountOpCPUKernel); + +REGISTER_OP_WITHOUT_GRADIENT(number_count, ops::NumberCountOp, + ops::NumberCountOpMaker); diff --git a/paddle/fluid/operators/number_count_op.cu b/paddle/fluid/operators/number_count_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..97e4b4f2845ae132c28d3bb71dcc8e73f02e193a --- /dev/null +++ b/paddle/fluid/operators/number_count_op.cu @@ -0,0 +1,108 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/number_count_op.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +#define CEIL(_x_, _y_) (((_x_)-1) / (_y_) + 1) +#define PERTHREAD_EXPERTS 256 +#define WARP_SIZE 32 + +const int CUDA_NUM_THREADS = 512; +static inline int GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +template +__global__ void initialize_zero_kernel(T* data, const int length) { + CUDA_KERNEL_LOOP(idx, length) { data[idx] = static_cast(0); } +} + +template +__global__ void NumberCount(const T* gate_idx, T* number_count, + int64_t batch_size, int upper_range) { + int res_tmp[PERTHREAD_EXPERTS] = {0}; + int expert_min = blockIdx.x * PERTHREAD_EXPERTS; + int expert_max = expert_min + PERTHREAD_EXPERTS; + if (expert_max > upper_range) { + expert_max = upper_range; + } + for (int i = threadIdx.x; i < batch_size; i += blockDim.x) { + T idx = gate_idx[i]; + if (idx == -1) { + continue; + } + if (idx < expert_min || idx >= expert_max) { + continue; + } + res_tmp[idx - expert_min] += 1; + } + for (int i = expert_min; i < expert_max; ++i) { + int x = res_tmp[i - expert_min]; +#pragma unroll + for (int j = 1; j < WARP_SIZE; j <<= 1) { +#ifdef __HIPCC__ + x = x + __shfl_down(x, j); +#else + x = x + __shfl_down_sync(-1u, x, j); +#endif + } + if (threadIdx.x % WARP_SIZE == 0) { + platform::CudaAtomicAdd(number_count + i, x); + } + } +} + +template +class NumberCountOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto gate_idx = context.Input("gate_idx"); + auto upper_range = context.Attr("upper_range"); + auto number_count = context.Output("Out"); + + int64_t batch_size = gate_idx->numel(); + auto place = context.GetPlace(); + const auto& dev_ctx = + context.template device_context(); + + framework::DDim out_dims = phi::make_ddim({upper_range}); + auto out_data = number_count->mutable_data(out_dims, place); + const T* gate_data = gate_idx->data(); + + initialize_zero_kernel< + T><<>>( + out_data, upper_range); + + NumberCount< + T><<>>( + gate_data, out_data, batch_size, upper_range); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(number_count, ops::NumberCountOpCUDAKernel); diff --git a/paddle/fluid/operators/number_count_op.h b/paddle/fluid/operators/number_count_op.h new file mode 100644 index 0000000000000000000000000000000000000000..95e64946fb8a2156fdb4cbae880ccf2c143447ed --- /dev/null +++ b/paddle/fluid/operators/number_count_op.h @@ -0,0 +1,37 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +#if defined(PADDLE_WITH_GLOO) +#include "paddle/fluid/framework/fleet/gloo_wrapper.h" +#endif + +namespace paddle { +namespace operators { + +template +class NumberCountOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support expert count op for cpu kernel now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/one_hot_v2_op.cc b/paddle/fluid/operators/one_hot_v2_op.cc index e212f4e7e2b7d1ad7964cc9351f1c4e241d5a79e..122b6a8a80aac95ab98ad95ed3e6339684978d12 100644 --- a/paddle/fluid/operators/one_hot_v2_op.cc +++ b/paddle/fluid/operators/one_hot_v2_op.cc @@ -12,9 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/one_hot_v2_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -22,26 +26,6 @@ namespace operators { class OneHotV2Op : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "one_hot_v2"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "one_hot_v2"); - - auto x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_GE(x_dims.size(), 1, - platform::errors::InvalidArgument( - "Rank of Input(X) should be at least 1.")); - - int depth = ctx->Attrs().Get("depth"); - if (ctx->HasInput("depth_tensor")) { - depth = -1; - } - - auto out_dims_vec = phi::vectorize(x_dims); - out_dims_vec.push_back(depth); - auto out_dims = phi::make_ddim(out_dims_vec); - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", /* --> */ "Out"); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -52,7 +36,7 @@ class OneHotV2Op : public framework::OperatorWithKernel { } framework::OpKernelType GetKernelTypeForVar( - const std::string& var_name, const Tensor& tensor, + const std::string& var_name, const framework::Tensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { if (var_name == "depth_tensor") { return expected_kernel_type; @@ -114,10 +98,12 @@ Out is a LoDTensor: } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(one_hot_v2, OneHotInferShapeFunctor, + PD_INFER_META(phi::OneHotRawInferMeta)); + REGISTER_OPERATOR( one_hot_v2, ops::OneHotV2Op, ops::OneHotV2OpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - one_hot_v2, ops::OneHotV2Kernel, - ops::OneHotV2Kernel); + paddle::framework::EmptyGradOpMaker, + OneHotInferShapeFunctor); diff --git a/paddle/fluid/operators/one_hot_v2_op.cu b/paddle/fluid/operators/one_hot_v2_op.cu deleted file mode 100644 index 77e2a931e50de5b7775463fc7bbf6262e2ad4a53..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/one_hot_v2_op.cu +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/one_hot_v2_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; - -template -__global__ void FillOutputKernel(const InT* p_in_data, OutT* p_out_data, - const int64_t numel, const int depth) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < numel && p_in_data[idx] >= 0 && p_in_data[idx] < depth) { - *(p_out_data + (idx * depth) + p_in_data[idx]) = 1.0; - } -} - -template -struct OneHotV2OpCUDAFunctor { - const framework::LoDTensor* in_; - framework::LoDTensor* out_; - const DeviceContext& ctx_; - int depth_; - - OneHotV2OpCUDAFunctor(const framework::LoDTensor* in, - framework::LoDTensor* out, int depth, - const DeviceContext& ctx) - : in_(in), out_(out), depth_(depth), ctx_(ctx) {} - - template - void apply() const { - auto* p_in_data = in_->data(); - auto numel = in_->numel(); - auto* p_out_data = out_->mutable_data(ctx_.GetPlace()); - auto stream = ctx_.stream(); - phi::funcs::set_constant(ctx_, out_, 0.0); - - FillOutputKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - p_in_data, p_out_data, numel, depth_); - } -}; - -using LoDTensor = framework::LoDTensor; -template -class OneHotV2CUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - - int depth = -1; - if (context.HasInput("depth_tensor")) { - auto* depth_tensor = context.Input("depth_tensor"); - if (platform::is_gpu_place(depth_tensor->place())) { - framework::Tensor temp; - paddle::framework::TensorCopySync(*depth_tensor, platform::CPUPlace(), - &temp); - depth = *temp.data(); - } else { - depth = *depth_tensor->data(); - } - - auto out_dims = out->dims(); - out_dims[out_dims.size() - 1] = depth; - out->Resize(out_dims); - } else { - depth = context.Attr("depth"); - } - framework::VisitDataType( - static_cast( - context.Attr("dtype")), - OneHotV2OpCUDAFunctor( - in, out, depth, context.template device_context())); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - one_hot_v2, - ops::OneHotV2CUDAKernel, - ops::OneHotV2CUDAKernel); diff --git a/paddle/fluid/operators/one_hot_v2_op_npu.cc b/paddle/fluid/operators/one_hot_v2_op_npu.cc index acf6baf50b418ae0fd68d64f52f80f47df1c60c3..e5702a37bb2b4a4180e209bb5e306be64830bd99 100644 --- a/paddle/fluid/operators/one_hot_v2_op_npu.cc +++ b/paddle/fluid/operators/one_hot_v2_op_npu.cc @@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
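The deleted one_hot_v2 CUDA kernel reduces to a simple fill rule once the output is zeroed; restated on the host for reference, flattening the output to [numel(X), depth] (the real op appends depth as a trailing dimension of X's shape):

#include <cstdint>
#include <vector>

// One-hot fill rule: each row gets a single 1 at column X[i]; negative or
// out-of-range indices leave the row all-zero, as in the removed kernel.
std::vector<float> OneHotReference(const std::vector<int64_t>& x, int depth) {
  std::vector<float> out(x.size() * depth, 0.f);
  for (size_t i = 0; i < x.size(); ++i) {
    if (x[i] >= 0 && x[i] < depth) out[i * depth + x[i]] = 1.f;
  }
  return out;
}
// e.g. X = {1, 1, 3, 0}, depth = 4 -> rows (0,1,0,0), (0,1,0,0), (0,0,0,1), (1,0,0,0)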
*/ -#include "paddle/fluid/operators/one_hot_v2_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; template class OneHotV2NPUKernel : public framework::OpKernel { diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc b/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc new file mode 100644 index 0000000000000000000000000000000000000000..e5399ee36ba7ff4a983448d607c108db8870138c --- /dev/null +++ b/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/optimizers/merged_momentum_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +template +class MLUMergedMomentumOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto params = ctx.MultiInput("Param"); + auto params_out = ctx.MultiOutput("ParamOut"); + size_t n = params.size(); + PADDLE_ENFORCE_EQ(n, params_out.size(), + platform::errors::InvalidArgument( + "The size of Output(ParamOut) must be equal to " + "Input(Param), but got the size of Output(ParamOut) " + "is %d, the size of Input(Param) is %d.", + params_out.size(), n)); + for (size_t i = 0; i < n; ++i) { + PADDLE_ENFORCE_EQ(params[i], params_out[i], + platform::errors::InvalidArgument( + "The size of Input(Param) and Output(ParamOut) " + "must be the same Tensors.")); + } + + auto grads = ctx.MultiInput("Grad"); + PADDLE_ENFORCE_EQ( + n, grads.size(), + platform::errors::InvalidArgument( + "The size of Input(Grad) must be equal to Input(Param), but got " + "the size of Input(Grad) is %d, the size of Input(Param) is %d.", + grads.size(), n)); + + auto velocitys = ctx.MultiInput("Velocity"); + PADDLE_ENFORCE_EQ(n, velocitys.size(), + platform::errors::InvalidArgument( + "The size of Input(Velocity) must be equal to " + "Input(Param), but got the size of Input(Velocity) " + "is %d, the size of Input(Param) is %d.", + velocitys.size(), n)); + + auto velocitys_out = ctx.MultiOutput("VelocityOut"); + PADDLE_ENFORCE_EQ( + n, velocitys_out.size(), + platform::errors::InvalidArgument( + "The size of Output(VelocityOut) must be " + "equal to Input(Param), but got the size of Output(VelocityOut) is " + "%d, the size of Input(Param) is %d.", + velocitys_out.size(), n)); + for (size_t i = 0; i < n; ++i) { + PADDLE_ENFORCE_EQ(velocitys[i], velocitys_out[i], + platform::errors::InvalidArgument( + "Input(Velocity) and Output(VelocityOut) must be " + "the same Tensors.")); + } + + auto mu = ctx.Attr("mu"); + auto lrs = ctx.MultiInput("LearningRate"); + if (lrs.size() != 1) { + PADDLE_ENFORCE_EQ( + n, lrs.size(), + platform::errors::InvalidArgument( + "If the size of Input(LearningRate) is not 1, 
the size of " + "Input(LearningRate) must be " + "equal to Input(Param), but got the size of Input(LearningRate) " + "is %d, the size of Input(Param) is %d.", + lrs.size(), n)); + } + auto use_nesterov = ctx.Attr("use_nesterov"); + auto regularization_methods = + ctx.Attr>("regularization_method"); + auto regularization_coeffs = + ctx.Attr>("regularization_coeff"); + if (regularization_methods.size() != 0) { + PADDLE_ENFORCE_EQ( + n, regularization_methods.size(), + platform::errors::InvalidArgument( + "The size of Attr(regularization_method) must be equal " + "to Input(Param), but got the size of " + "Attr(regularization_method) is %d, the size of Input(Param) is " + "%d.", + regularization_methods.size(), n)); + PADDLE_ENFORCE_EQ( + n, regularization_coeffs.size(), + platform::errors::InvalidArgument( + "The size of Attr(regularization_coeff) must be equal " + "to Input(Param), but got the size of Attr(regularization_coeff) " + "is %d, the size of Input(Param) is %d.", + regularization_coeffs.size(), n)); + } + + VLOG(5) << "use_nesterov: " << use_nesterov + << ", regularization_methods.size(): " + << regularization_methods.size() + << ", regularization_coeffs.size(): " + << regularization_coeffs.size(); + + auto& dev_ctx = ctx.template device_context(); + + Tensor mu_tensor = ctx.AllocateTmpTensor({1}, dev_ctx); + MLUCnnlTensorDesc mu_tensor_desc(mu_tensor); + MLUCnnl::Fill(ctx, mu, mu_tensor_desc.get(), GetBasePtr(&mu_tensor)); + + for (size_t idx = 0; idx < n; ++idx) { + RegularizationType regularization_flag = + regularization_methods.size() > 0 && + regularization_methods[idx] == "l2_decay" + ? RegularizationType::kL2DECAY + : RegularizationType::kNONE; + T regularization_coeff = static_cast(0.0); + if (regularization_coeffs.size() != 0) { + regularization_coeff = static_cast(regularization_coeffs[idx]); + } + + auto learning_rate = lrs.size() > 1 ? 
lrs[idx] : lrs[0]; + auto param_out = params_out[idx]; + auto velocity_out = velocitys_out[idx]; + + auto grad = grads[idx]; + Tensor regularized_grad; + MLUCnnlTensorDesc param_desc(*param_out); + if (regularization_flag == RegularizationType::kL2DECAY) { + regularized_grad = ctx.AllocateTmpTensor( + param_out->dims(), dev_ctx); + MLUCnnlOpTensorDesc op_tensor_desc( + CNNL_OP_TENSOR_ADD, ToCnnlDataType(), CNNL_NOT_PROPAGATE_NAN); + MLUCnnl::OpTensor(ctx, op_tensor_desc.get(), param_desc.get(), + GetBasePtr(param_out), param_desc.get(), + GetBasePtr(grad), param_desc.get(), + GetBasePtr(®ularized_grad), ToCnnlDataType(), + regularization_coeff); + } else { + regularized_grad = *grad; + } + MLUCnnl::ApplyMomentum(ctx, param_desc.get(), + GetBasePtr(®ularized_grad), use_nesterov, + GetBasePtr(learning_rate), GetBasePtr(&mu_tensor), + GetBasePtr(param_out), GetBasePtr(velocity_out)); + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_MLU_KERNEL(merged_momentum, ops::MLUMergedMomentumOpKernel, + ops::MLUMergedMomentumOpKernel); diff --git a/paddle/fluid/operators/pad_op_npu.cc b/paddle/fluid/operators/pad_op_npu.cc index d0cb674b4049f988773accd4b0652d62a1be2287..adc4a2ffaf8c54d32c10fa47e27d86aef2f9c508 100644 --- a/paddle/fluid/operators/pad_op_npu.cc +++ b/paddle/fluid/operators/pad_op_npu.cc @@ -90,5 +90,5 @@ namespace plat = paddle::platform; REGISTER_OP_NPU_KERNEL(pad, ops::PadNPUKernel, ops::PadNPUKernel, ops::PadNPUKernel); -REGISTER_OP_NPU_KERNEL(pad_grad, ops::PadNPUKernel, +REGISTER_OP_NPU_KERNEL(pad_grad, ops::PadGradNPUKernel, ops::PadGradNPUKernel); diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc deleted file mode 100644 index 6335004e69a37109664940e4d3445e3694be9cc9..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/pool_cudnn_op.cu.cc +++ /dev/null @@ -1,567 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
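For context on the merged_momentum MLU kernel above: the chained OpTensor / ApplyMomentum calls are expected to realize Paddle's usual momentum update. A scalar sketch of that update, assuming the standard semantics (l2_decay folds the parameter into the gradient; Nesterov takes a look-ahead step) rather than documenting the exact cnnl call behavior:

// Per-element update applied to each (param, grad, velocity) triple.
void MomentumStep(float* param, float grad, float* velocity, float lr, float mu,
                  bool use_nesterov, bool l2_decay, float coeff) {
  if (l2_decay) grad += coeff * (*param);      // regularized gradient
  *velocity = mu * (*velocity) + grad;         // velocity update
  if (use_nesterov) {
    *param -= lr * (grad + mu * (*velocity));  // look-ahead step
  } else {
    *param -= lr * (*velocity);
  }
}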
*/ - -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/pool_op.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/operator.h" -#endif -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; -using ScopedPoolingDescriptor = platform::ScopedPoolingDescriptor; -using DataLayout = platform::DataLayout; -using PoolingMode = platform::PoolingMode; -template -using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; - -DataLayout getLayoutFromStr(std::string data_format) { - if (data_format == "NHWC") { - return DataLayout::kNHWC; - } else if (data_format == "NCHW") { - return DataLayout::kNCHW; - } else if (data_format == "NCDHW") { - return DataLayout::kNCDHW; - } else { - return DataLayout::kNCDHW; - } -} - -template -class PoolCUDNNOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument("Pool operator CUDA kernel must use " - "CUDAPlace rather than CPUPlace.")); - - const Tensor *input = ctx.Input("X"); - Tensor *output = ctx.Output("Out"); - output->mutable_data(ctx.GetPlace()); - std::string pooling_type = ctx.Attr("pooling_type"); - bool exclusive = ctx.Attr("exclusive"); - bool adaptive = ctx.Attr("adaptive"); - std::vector ksize = ctx.Attr>("ksize"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::string data_format = ctx.Attr("data_format"); - bool global_pooling = ctx.Attr("global_pooling"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // update paddings - auto in_x_dims = input->dims(); - framework::DDim data_dims; - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); - if (data_dims.size() * 2 == static_cast(paddings.size())) { - for (int i = 0; i < data_dims.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - - const std::string str_NCHW = "NCHW", str_NHWC = "NHWC"; - const std::string str_NCDHW = "NCDHW", str_NDHWC = "NDHWC"; - - // -----------------transformed tensor ------------------------ - - Tensor transformed_input(input->type()); - Tensor transformed_output(output->type()); - DataLayout layout; - - if (data_format == str_NDHWC) { - layout = DataLayout::kNCDHW; - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 4, 1, 2, 3}; - - // input - transformed_input.Resize(input->dims()); - - auto in_dims_vec = phi::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[4]; - in_dims_vec[2] = input->dims()[1]; - in_dims_vec[3] = input->dims()[2]; - in_dims_vec[4] = input->dims()[3]; - transformed_input.Resize(phi::make_ddim(in_dims_vec)); - transformed_input.mutable_data(ctx.GetPlace(), input->type()); - - phi::funcs::Transpose trans5; - trans5(dev_ctx, *input, &transformed_input, axis); - - // output - 
transformed_output.Resize(output->dims()); - - auto out_dims_vec = phi::vectorize(output->dims()); - out_dims_vec[1] = output->dims()[4]; - out_dims_vec[2] = output->dims()[1]; - out_dims_vec[3] = output->dims()[2]; - out_dims_vec[4] = output->dims()[3]; - transformed_output.Resize(phi::make_ddim(out_dims_vec)); -#ifdef PADDLE_WITH_HIP - // MIOPEN not support NHWC data layout - } else if (data_format == str_NHWC) { - layout = DataLayout::kNCHW; - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 3, 1, 2}; - - transformed_input.Resize(input->dims()); - auto in_dims_vec = phi::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[3]; - in_dims_vec[2] = input->dims()[1]; - in_dims_vec[3] = input->dims()[2]; - transformed_input.Resize(phi::make_ddim(in_dims_vec)); - transformed_input.mutable_data(ctx.GetPlace(), input->type()); - - phi::funcs::Transpose trans; - trans(dev_ctx, *input, &transformed_input, axis); - - transformed_output.Resize(output->dims()); - auto out_dims_vec = phi::vectorize(output->dims()); - out_dims_vec[1] = output->dims()[3]; - out_dims_vec[2] = output->dims()[1]; - out_dims_vec[3] = output->dims()[2]; - transformed_output.Resize(phi::make_ddim(out_dims_vec)); -#endif - } else { - layout = getLayoutFromStr(data_format); - transformed_input = *input; - transformed_output = *output; - } - - const T *tranformed_input_data = transformed_input.data(); - T *tranformed_output_data = transformed_output.mutable_data( - transformed_output.dims(), ctx.GetPlace()); - - // ------------------- cudnn descriptors --------------------- - ScopedTensorDescriptor input_desc; - ScopedTensorDescriptor output_desc; - ScopedPoolingDescriptor pool_desc; - -#ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, phi::vectorize(transformed_input.dims())); - miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, phi::vectorize(transformed_output.dims())); -#else - cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, phi::vectorize(transformed_input.dims())); - cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, phi::vectorize(transformed_output.dims())); -#endif - PoolingMode pooling_mode; - if (pooling_type == "max") { - pooling_mode = PoolingMode::kMaximum; - } else { - pooling_mode = exclusive ? 
PoolingMode::kAverageExclusive - : PoolingMode::kAverageInclusive; - } - -#ifdef PADDLE_WITH_HIP - miopenPoolingDescriptor_t cudnn_pool_desc = - pool_desc.descriptor(pooling_mode, ksize, paddings, strides); -#else - cudnnPoolingDescriptor_t cudnn_pool_desc = - pool_desc.descriptor(pooling_mode, ksize, paddings, strides); -#endif - - // ------------------- cudnn pool algorithm --------------------- - auto handle = ctx.cuda_device_context().cudnn_handle(); - ScalingParamType alpha = 1.0f, beta = 0.0f; - -#ifdef PADDLE_WITH_HIP - char *pool_workspace; - size_t pool_worksize = 0; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenPoolingGetWorkSpaceSizeV2( - cudnn_pool_desc, cudnn_output_desc, &pool_worksize)); - PADDLE_ENFORCE_GPU_SUCCESS(hipMalloc(&pool_workspace, pool_worksize)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenPoolingForward( - handle, cudnn_pool_desc, &alpha, cudnn_input_desc, - tranformed_input_data, &beta, cudnn_output_desc, tranformed_output_data, - false, pool_workspace, pool_worksize)); - PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnPoolingForward( - handle, cudnn_pool_desc, &alpha, cudnn_input_desc, - tranformed_input_data, &beta, cudnn_output_desc, - tranformed_output_data)); -#endif - // add - if (data_format == str_NDHWC) { - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 2, 3, 4, 1}; - phi::funcs::Transpose - trans5_v2; - trans5_v2(dev_ctx, transformed_output, output, axis); - } -#ifdef PADDLE_WITH_HIP - // MIOPEN not support NHWC data layout - if (data_format == str_NHWC) { - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 2, 3, 1}; - phi::funcs::Transpose trans; - trans(dev_ctx, transformed_output, output, axis); - } -#endif - } -}; - -template -class PoolCUDNNGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument("Pool operator CUDA kernel must use " - "CUDAPlace rather than CPUPlace.")); - - const Tensor *input = ctx.Input("X"); - const Tensor *output = ctx.Input("Out"); - const Tensor *output_grad = - ctx.Input(framework::GradVarName("Out")); - Tensor *input_grad = ctx.Output(framework::GradVarName("X")); - - std::string pooling_type = ctx.Attr("pooling_type"); - bool exclusive = ctx.Attr("exclusive"); - bool adaptive = ctx.Attr("adaptive"); - std::vector ksize = ctx.Attr>("ksize"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::string data_format = ctx.Attr("data_format"); - bool global_pooling = ctx.Attr("global_pooling"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - -#ifdef PADDLE_WITH_HIP - if (pooling_type == "max") { - using OpKernelMap = paddle::framework::OperatorWithKernel::OpKernelMap; - using OpKernelFunc = paddle::framework::OperatorWithKernel::OpKernelFunc; - auto &all_op_kernels = - paddle::framework::OperatorWithKernel::AllOpKernels(); - std::string op_type = "pool2d_grad"; - auto kernels_iter = all_op_kernels.find(op_type); - PADDLE_ENFORCE_NE( - kernels_iter, all_op_kernels.end(), - platform::errors::Unavailable( - "There are no kernels which are registered in the %s operator.", - op_type)); - OpKernelMap &kernels = kernels_iter->second; - paddle::framework::OpKernelType expected_kernel_key( 
- paddle::framework::ToDataType(typeid(T)), ctx.GetPlace()); - auto kernel_iter = kernels.find(expected_kernel_key); - PADDLE_ENFORCE_NE(kernel_iter, kernels.end(), - platform::errors::NotFound( - "Operator (%s) does not have kernel for %s.", - op_type, KernelTypeToString(expected_kernel_key))); - std::unique_ptr kernel_func_( - new OpKernelFunc(kernel_iter->second)); - (*kernel_func_)(ctx); - return; - } -#endif - - // update paddings - auto in_x_dims = input->dims(); - framework::DDim data_dims; - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); - if (data_dims.size() * 2 == static_cast(paddings.size())) { - for (int i = 0; i < data_dims.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - - // ------- tensor grad -------------- - Tensor transformed_input(input->type()); - Tensor transformed_output(output->type()); - Tensor transformed_output_grad(output_grad->type()); - - input_grad->mutable_data(ctx.GetPlace()); - Tensor transformed_input_grad(input_grad->type()); - DataLayout layout; - const std::string str_NCHW = "NCHW", str_NHWC = "NHWC"; - const std::string str_NCDHW = "NCDHW", str_NDHWC = "NDHWC"; - if (data_format == str_NDHWC) { - layout = DataLayout::kNCDHW; - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 4, 1, 2, 3}; - - // input - transformed_input.Resize(input->dims()); - auto in_dims_vec = phi::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[4]; - in_dims_vec[2] = input->dims()[1]; - in_dims_vec[3] = input->dims()[2]; - in_dims_vec[4] = input->dims()[3]; - transformed_input.Resize(phi::make_ddim(in_dims_vec)); - transformed_input.mutable_data(ctx.GetPlace(), input->type()); - - phi::funcs::Transpose trans5; - trans5(dev_ctx, *input, &transformed_input, axis); - - // output - transformed_output.Resize(output->dims()); - auto out_dims_vec = phi::vectorize(output->dims()); - out_dims_vec[1] = output->dims()[4]; - out_dims_vec[2] = output->dims()[1]; - out_dims_vec[3] = output->dims()[2]; - out_dims_vec[4] = output->dims()[3]; - transformed_output.Resize(phi::make_ddim(out_dims_vec)); - - transformed_output.mutable_data(ctx.GetPlace(), output->type()); - - phi::funcs::Transpose - trans5_v2; - trans5_v2(dev_ctx, *output, &transformed_output, axis); - - // output grad - transformed_output_grad.Resize(phi::make_ddim(out_dims_vec)); - transformed_output_grad.mutable_data(ctx.GetPlace(), output_grad->type()); - - phi::funcs::Transpose - trans5_v3; - trans5_v3(dev_ctx, *output_grad, &transformed_output_grad, axis); - - // input grad - transformed_input_grad.Resize(phi::make_ddim(in_dims_vec)); - -#ifdef PADDLE_WITH_HIP - // MIOPEN not support NHWC data layout - } else if (data_format == str_NHWC) { - layout = DataLayout::kNCHW; - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 3, 1, 2}; - - // input - transformed_input.Resize(input->dims()); - auto in_dims_vec = phi::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[3]; - in_dims_vec[2] = input->dims()[1]; - in_dims_vec[3] = input->dims()[2]; - transformed_input.Resize(phi::make_ddim(in_dims_vec)); - transformed_input.mutable_data(ctx.GetPlace(), input->type()); - - phi::funcs::Transpose trans4; - trans4(dev_ctx, *input, &transformed_input, axis); - - // output - 
transformed_output.Resize(output->dims()); - auto out_dims_vec = phi::vectorize(output->dims()); - out_dims_vec[1] = output->dims()[3]; - out_dims_vec[2] = output->dims()[1]; - out_dims_vec[3] = output->dims()[2]; - transformed_output.Resize(phi::make_ddim(out_dims_vec)); - - transformed_output.mutable_data(ctx.GetPlace(), output->type()); - - phi::funcs::Transpose - trans4_v2; - trans4_v2(dev_ctx, *output, &transformed_output, axis); - - // output grad - transformed_output_grad.Resize(phi::make_ddim(out_dims_vec)); - transformed_output_grad.mutable_data(ctx.GetPlace(), output_grad->type()); - - phi::funcs::Transpose - trans4_v3; - trans4_v3(dev_ctx, *output_grad, &transformed_output_grad, axis); - - // input grad - transformed_input_grad.Resize(phi::make_ddim(in_dims_vec)); -#endif - } else { - layout = getLayoutFromStr(data_format); - transformed_input = *input; - transformed_output = *output; - transformed_output_grad = *output_grad; - transformed_input_grad = *input_grad; - } - - const T *input_data = transformed_input.data(); - const T *output_data = transformed_output.data(); - const T *output_grad_data = transformed_output_grad.data(); - - // ------------------- cudnn descriptors --------------------- - ScopedTensorDescriptor input_desc; - ScopedTensorDescriptor output_desc; - ScopedPoolingDescriptor pool_desc; - -#ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, phi::vectorize(transformed_input.dims())); - miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, phi::vectorize(transformed_output.dims())); -#else - cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, phi::vectorize(transformed_input.dims())); - cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, phi::vectorize(transformed_output.dims())); -#endif - PoolingMode pooling_mode; - if (pooling_type == "max") { - if (FLAGS_cudnn_deterministic) { - pooling_mode = PoolingMode::kMaximumDeterministic; - } else { - pooling_mode = PoolingMode::kMaximum; - } - } else { - pooling_mode = exclusive ? PoolingMode::kAverageExclusive - : PoolingMode::kAverageInclusive; - } - -#ifdef PADDLE_WITH_HIP - miopenPoolingDescriptor_t cudnn_pool_desc = - pool_desc.descriptor(pooling_mode, ksize, paddings, strides); -#else - cudnnPoolingDescriptor_t cudnn_pool_desc = - pool_desc.descriptor(pooling_mode, ksize, paddings, strides); -#endif - - // ------------------- cudnn pool algorithm --------------------- - auto handle = ctx.cuda_device_context().cudnn_handle(); - ScalingParamType alpha = 1.0f, beta = 0.0f; - if (input_grad) { - T *input_grad_data = transformed_input_grad.mutable_data( - transformed_input_grad.dims(), ctx.GetPlace()); -// Because beta is zero, it is unnecessary to reset input_grad. 
-#ifdef PADDLE_WITH_HIP - char *pool_workspace; - size_t pool_worksize = 0; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenPoolingGetWorkSpaceSizeV2( - cudnn_pool_desc, cudnn_output_desc, &pool_worksize)); - PADDLE_ENFORCE_GPU_SUCCESS(hipMalloc(&pool_workspace, pool_worksize)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenPoolingBackward( - handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data, - cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data, - &beta, cudnn_input_desc, input_grad_data, pool_workspace)); - PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnPoolingBackward( - handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data, - cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data, - &beta, cudnn_input_desc, input_grad_data)); -#endif - - if (data_format == str_NDHWC) { - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 2, 3, 4, 1}; - phi::funcs::Transpose - trans5_v4; - trans5_v4(dev_ctx, transformed_input_grad, input_grad, axis); - } -#ifdef PADDLE_WITH_HIP - // MIOPEN not support NHWC data layout - if (data_format == str_NHWC) { - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 2, 3, 1}; - phi::funcs::Transpose - trans4_v4; - trans4_v4(dev_ctx, transformed_input_grad, input_grad, axis); - } -#endif - } - } -}; - -template -class PoolCUDNNGradGradOpKernel : public PoolCUDNNOpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - std::string pooling_type = ctx.Attr("pooling_type"); - if (pooling_type == "max") { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op grad grad only supports avgpool.")); - } else { - PoolCUDNNOpKernel::Compute(ctx); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel); -REGISTER_OP_KERNEL(pool2d_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradOpKernel, - ops::PoolCUDNNGradOpKernel); - -REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel); -REGISTER_OP_KERNEL(pool3d_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradOpKernel); -#else -REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel); -REGISTER_OP_KERNEL(pool2d_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradOpKernel, - ops::PoolCUDNNGradOpKernel, - ops::PoolCUDNNGradOpKernel); -REGISTER_OP_KERNEL(pool2d_grad_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradGradOpKernel, - ops::PoolCUDNNGradGradOpKernel, - ops::PoolCUDNNGradGradOpKernel); - -REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel); -REGISTER_OP_KERNEL(pool3d_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradOpKernel, - ops::PoolCUDNNGradOpKernel); -#endif diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index ae095c2fa7aaa95cf667898b63a90988eb83caf0..44f3d8090e565c1581a49387db4b834b1abf8b62 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -15,6 +15,12 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/pool_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" + #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -23,125 +29,6 @@ limitations under the License. */ namespace paddle { namespace operators { -int PoolOutputSize(int input_size, int filter_size, int padding_1, - int padding_2, int stride, bool ceil_mode) { - int output_size; - if (!ceil_mode) { - output_size = - (input_size - filter_size + padding_1 + padding_2) / stride + 1; - } else { - output_size = - (input_size - filter_size + padding_1 + padding_2 + stride - 1) / - stride + - 1; - } - PADDLE_ENFORCE_GT( - output_size, 0, - platform::errors::InvalidArgument( - "the output size must be greater than 0. But received: " - "output_size = %d due to the settings of input_size(%d), " - "padding(%d,%d), " - "k_size(%d) and stride(%d). Please check again!", - output_size, input_size, padding_1, padding_2, filter_size, stride)); - return output_size; -} - -void PoolOp::InferShape(framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), true, - platform::errors::NotFound("Input(X) of Pool operator is not found.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Out"), true, - platform::errors::NotFound("Output(Out) of Pool operator is not found.")); - - std::string pooling_type = ctx->Attrs().Get("pooling_type"); - std::vector ksize = ctx->Attrs().Get>("ksize"); - std::vector strides = ctx->Attrs().Get>("strides"); - std::vector paddings = ctx->Attrs().Get>("paddings"); - bool ceil_mode = ctx->Attrs().Get("ceil_mode"); - bool adaptive = ctx->Attrs().Get("adaptive"); - bool global_pooling = ctx->Attrs().Get("global_pooling"); - std::string data_format = ctx->Attrs().Get("data_format"); - std::string padding_algorithm = - ctx->Attrs().Get("padding_algorithm"); - - auto in_x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ( - in_x_dims.size() == 4 || in_x_dims.size() == 5, true, - platform::errors::InvalidArgument( - "the input of Op(pool) should be 4-D or 5-D Tensor. But " - "received: %u-D Tensor and it's shape is [%s].", - in_x_dims.size(), in_x_dims)); - - PADDLE_ENFORCE_EQ( - in_x_dims.size() - ksize.size(), 2U, - platform::errors::InvalidArgument( - "the dimension of input minus the size of " - "Attr(ksize) must be euqal to 2 in Op(pool). " - "But received: the dimension of input minus the size " - "of Attr(ksize) is %d, the " - "input's dimension is %d, the shape of input " - "is [%s], the Attr(ksize)'s size is %d, the Attr(ksize) is [%s].", - in_x_dims.size() - ksize.size(), in_x_dims.size(), in_x_dims, - ksize.size(), phi::make_ddim(ksize))); - - PADDLE_ENFORCE_EQ( - ksize.size(), strides.size(), - platform::errors::InvalidArgument( - "the size of Attr(ksize) and Attr(strides) in " - "Op(pool) must be equal. 
" - "But received: Attr(ksize)'s size is %d, Attr(strides)'s " - "size is %d, Attr(ksize) is [%s], Attr(strides)is [%s].", - ksize.size(), strides.size(), phi::make_ddim(ksize), - phi::make_ddim(strides))); - - // MKL-DNN Kernels are using NCHW order of dims description - // so we ignore data_format consideration for MKL-DNN kernel - const bool channel_last = (ctx->IsRunMKLDNNKernel() == false) && - (data_format == "NHWC" || data_format == "NDHWC"); - - // update paddings if "SAME" or global_pooling - framework::DDim data_dims; - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - - std::vector output_shape; - if (adaptive) { - output_shape.insert(output_shape.end(), ksize.begin(), ksize.end()); - } else { - for (int i = 0; i < data_dims.size(); ++i) { - if ((!ctx->IsRuntime()) && (data_dims[i] < 0)) { - output_shape.push_back(data_dims[i]); - } else { - output_shape.push_back( - PoolOutputSize(data_dims[i], ksize[i], paddings[2 * i], - paddings[2 * i + 1], strides[i], ceil_mode)); - } - } - } - - // output_N = input_N - output_shape.insert(output_shape.begin(), in_x_dims[0]); - // output_C = input_C - if (channel_last) { - output_shape.push_back(in_x_dims[in_x_dims.size() - 1]); - } else { - output_shape.insert(output_shape.begin() + 1, in_x_dims[1]); - } - - ctx->SetOutputDim("Out", phi::make_ddim(output_shape)); - ctx->ShareLoD("X", "Out"); -} - bool CanMKLDNNSupportPool(const framework::ExecutionContext& ctx) { if (ctx.Attr("adaptive") == false) return true; // (jczaja): oneDNN is supporting only unchangable in size pool window @@ -216,16 +103,6 @@ framework::OpKernelType PoolOp::GetKernelTypeForVar( tensor.place(), tensor.layout()); } -void PoolOpGrad::InferShape(framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::NotFound( - "Input(X) of Pool Gradoperator is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true, - platform::errors::NotFound( - "Input(X@GRAD) of Pool Gradoperator is not found.")); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); -} - framework::OpKernelType PoolOpGrad::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { framework::LibraryType library_{framework::LibraryType::kPlain}; @@ -471,7 +348,7 @@ class Pool2dOpGradGradMaker : public framework::SingleGradOpMaker { protected: void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("pool2d_grad_grad"); + grad_op->SetType("pool2d_double_grad"); grad_op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); grad_op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); grad_op->SetAttrMap(this->Attrs()); @@ -692,35 +569,34 @@ Example: namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(pool2d, Pool2dInferShapeFunctor, + PD_INFER_META(phi::PoolInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(pool2d_grad, Pool2dGradInferShapeFunctor, + PD_INFER_META(phi::PoolGradInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(pool2d_double_grad, + Pool2dDoubleGradInferShapeFunctor, + PD_INFER_META(phi::PoolInferMeta)); + REGISTER_OPERATOR( pool2d, ops::PoolOp, ops::Pool2dOpMaker, ops::PoolOpInferVarType, paddle::framework::DefaultGradOpMaker, - paddle::framework::DefaultGradOpMaker); + 
paddle::framework::DefaultGradOpMaker, + Pool2dInferShapeFunctor); REGISTER_OPERATOR(pool2d_grad, ops::PoolOpGrad, ops::Pool2dOpGradGradMaker, - ops::Pool2dOpGradGradMaker); -REGISTER_OPERATOR(pool2d_grad_grad, ops::PoolOp); - -REGISTER_OP_CPU_KERNEL( - pool2d, ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CPU_KERNEL( - pool2d_grad, ops::PoolGradKernel, - ops::PoolGradKernel); -REGISTER_OP_CPU_KERNEL( - pool2d_grad_grad, - ops::PoolGradGradKernel, - ops::PoolGradGradKernel); + ops::Pool2dOpGradGradMaker, + Pool2dGradInferShapeFunctor); +REGISTER_OPERATOR(pool2d_double_grad, ops::PoolOp, + Pool2dDoubleGradInferShapeFunctor); + +DECLARE_INFER_SHAPE_FUNCTOR(pool3d, Pool3dInferShapeFunctor, + PD_INFER_META(phi::PoolInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(pool3d_grad, Pool3dGradInferShapeFunctor, + PD_INFER_META(phi::PoolGradInferMeta)); REGISTER_OPERATOR( pool3d, ops::PoolOp, ops::Pool3dOpMaker, ops::PoolOpInferVarType, paddle::framework::DefaultGradOpMaker, - paddle::framework::DefaultGradOpMaker); -REGISTER_OPERATOR(pool3d_grad, ops::PoolOpGrad); - -REGISTER_OP_CPU_KERNEL( - pool3d, ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CPU_KERNEL( - pool3d_grad, ops::PoolGradKernel, - ops::PoolGradKernel); + paddle::framework::DefaultGradOpMaker, + Pool3dInferShapeFunctor); +REGISTER_OPERATOR(pool3d_grad, ops::PoolOpGrad, Pool3dGradInferShapeFunctor); diff --git a/paddle/fluid/operators/pool_op.cu b/paddle/fluid/operators/pool_op.cu deleted file mode 100644 index 069ce0c1fda853b943a7b414a7a33d9aa6405a89..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/pool_op.cu +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/pool_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - pool2d, ops::PoolKernel, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CUDA_KERNEL( - pool2d_grad, - ops::PoolGradKernel, - ops::PoolGradKernel, - ops::PoolGradKernel); - -REGISTER_OP_CUDA_KERNEL( - pool2d_grad_grad, - ops::PoolGradGradKernel, - ops::PoolGradGradKernel, - ops::PoolGradGradKernel); - -REGISTER_OP_CUDA_KERNEL( - pool3d, ops::PoolKernel, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CUDA_KERNEL( - pool3d_grad, - ops::PoolGradKernel, - ops::PoolGradKernel, - ops::PoolGradKernel); diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h index bea6506ee86dbfe3ac606a1e8e883bfbf2500f25..d48ac3bd358ef64271de69df4424399b427cfb82 100644 --- a/paddle/fluid/operators/pool_op.h +++ b/paddle/fluid/operators/pool_op.h @@ -12,19 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
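A worked example of the PoolOutputSize rule removed from pool_op.cc above (now expected from phi::PoolInferMeta): ceil_mode simply pads the numerator by stride - 1 before the floor division, so a trailing partial window still produces an output.

#include <cassert>

// output = (input - filter + pad1 + pad2 [+ stride - 1]) / stride + 1
int PoolOutputSize(int input, int filter, int pad1, int pad2, int stride,
                   bool ceil_mode) {
  const int numerator = input - filter + pad1 + pad2 + (ceil_mode ? stride - 1 : 0);
  return numerator / stride + 1;
}

int main() {
  // input 6, kernel 3, no padding, stride 2: floor mode -> 2, ceil mode -> 3.
  assert(PoolOutputSize(6, 3, 0, 0, 2, false) == 2);
  assert(PoolOutputSize(6, 3, 0, 0, 2, true) == 3);
}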
*/ -#pragma once +// NOTE(Ruibiao): Difficult to remove code from this header file because too +// many files rely on it through "mkldnn_reuse.h" -#include -#include -#include +#pragma once -#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/pooling.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#if defined(__HIPCC__) || defined(__NVCC__) -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#endif namespace paddle { namespace operators { @@ -35,8 +28,6 @@ class PoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override; - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override; @@ -50,8 +41,6 @@ class PoolOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override; - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override; @@ -71,292 +60,5 @@ class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override; }; -template -inline void UpdatePadding(std::vector* paddings, const bool global_pooling, - const bool adaptive, - const std::string padding_algorithm, - const framework::DDim data_dims, - const std::vector& strides, - const std::vector& ksize) { - // set padding size == data_dims.size() * 2 - auto data_shape = phi::vectorize(data_dims); - if (static_cast(paddings->size()) == data_dims.size()) { - for (int i = 0; i < data_dims.size(); ++i) { - T copy_pad = *(paddings->begin() + 2 * i); - paddings->insert(paddings->begin() + 2 * i + 1, copy_pad); - } - } else { - PADDLE_ENFORCE_EQ(data_dims.size() * 2, paddings->size(), - platform::errors::InvalidArgument( - "Paddings size %d should be the same or twice as the " - "pooling size %d.", - paddings->size(), data_dims.size() * 2)); - } - - // when padding_algorithm is "VALID" or "SAME" - if (padding_algorithm == "SAME") { - for (int i = 0; i < data_dims.size(); ++i) { - T out_size = (data_dims[i] + strides[i] - 1) / strides[i]; - T pad_sum = - std::max((out_size - 1) * strides[i] + ksize[i] - data_shape[i], - static_cast(0)); - T pad_0 = pad_sum / 2; - T pad_1 = pad_sum - pad_0; - *(paddings->begin() + i * 2) = pad_0; - *(paddings->begin() + i * 2 + 1) = pad_1; - } - } else if (padding_algorithm == "VALID") { - for (auto it = paddings->begin(); it != paddings->end(); it++) { - *it = 0; - } - } - - // if global_pooling == true or adaptive == true, padding will be ignore - if (global_pooling || adaptive) { - for (auto it = paddings->begin(); it != paddings->end(); it++) { - *it = 0; - } - } -} - -template -inline void UpdateKsize(std::vector* ksize, - const framework::DDim data_dims) { - ksize->resize(static_cast(data_dims.size())); - for (size_t i = 0; i < ksize->size(); ++i) { - *(ksize->begin() + i) = static_cast(data_dims[i]); - } -} - -inline int getReduceNum(const framework::Tensor& input, - const framework::Tensor* output, - const std::string data_format, - std::vector* reduce_dim) { - // data_format only can be NCHW - bool channel_last = (data_format == "NHWC"); - if (channel_last) { - return 0; - } - int reduce_num = 0; - const int output_height = output->dims()[2]; - const int output_width = output->dims()[3]; - if ((output_height == 1) && (output_width == 1)) { - 
reduce_dim->push_back(2); - reduce_dim->push_back(3); - reduce_num = input.dims()[2] * input.dims()[3]; - } - return reduce_num; -} - -template -class PoolKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - Tensor* out = context.Output("Out"); - - std::string pooling_type = context.Attr("pooling_type"); - std::vector ksize = context.Attr>("ksize"); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::string data_format = context.Attr("data_format"); - bool exclusive = context.Attr("exclusive"); - bool adaptive = context.Attr("adaptive"); - bool global_pooling = context.Attr("global_pooling"); - std::string padding_algorithm = - context.Attr("padding_algorithm"); - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // update paddings - auto in_x_dims = in_x->dims(); - framework::DDim data_dims; - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - } - - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); - if (data_dims.size() * 2 == static_cast(paddings.size())) { - for (int i = 0; i < data_dims.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - auto& dev_ctx = context.template device_context(); - switch (ksize.size()) { - case 2: { - if (pooling_type == "max") { - paddle::operators::math::Pool2dFunctor< - DeviceContext, paddle::operators::math::MaxPool, T> - pool2d_forward; - paddle::operators::math::MaxPool pool_process; - pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format, - true, false, out, pool_process); - - } else if (pooling_type == "avg") { - std::vector reduce_dim; - int reduce_num = getReduceNum(*in_x, out, data_format, &reduce_dim); - if (reduce_num > 0 && - adaptive) { // for adaptive_avg_pool2d && output_size == 1 -#if defined(__HIPCC__) || defined(__NVCC__) - auto stream = dev_ctx.stream(); - TensorReduceImpl>( - dev_ctx, *in_x, out, kps::DivideFunctor(reduce_num), - reduce_dim, stream); -#else // for cpu - paddle::operators::math::Pool2dFunctor< - DeviceContext, paddle::operators::math::AvgPool, T> - pool2d_forward; - paddle::operators::math::AvgPool pool_process; - pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, - data_format, exclusive, adaptive, out, pool_process); -#endif - } else { // avgpool_2d or adaptive_avg_pool2d && output_size != 1 - paddle::operators::math::Pool2dFunctor< - DeviceContext, paddle::operators::math::AvgPool, T> - pool2d_forward; - paddle::operators::math::AvgPool pool_process; - pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, - data_format, exclusive, adaptive, out, pool_process); - } - } - } break; - case 3: { - if (pooling_type == "max") { - paddle::operators::math::Pool3dFunctor< - DeviceContext, paddle::operators::math::MaxPool, T> - pool3d_forward; - paddle::operators::math::MaxPool pool_process; - pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format, - true, false, out, pool_process); - - } else if (pooling_type == "avg") { - paddle::operators::math::Pool3dFunctor< - DeviceContext, paddle::operators::math::AvgPool, T> - pool3d_forward; - paddle::operators::math::AvgPool pool_process; - pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format, 
- exclusive, adaptive, out, pool_process); - } - } break; - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op only supports 2D and 3D input.")); - } - } - } -}; - -template -class PoolGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - const Tensor* out = context.Input("Out"); - const Tensor* out_grad = - context.Input(framework::GradVarName("Out")); - Tensor* in_x_grad = context.Output(framework::GradVarName("X")); - - std::string pooling_type = context.Attr("pooling_type"); - std::vector ksize = context.Attr>("ksize"); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - bool exclusive = context.Attr("exclusive"); - bool adaptive = context.Attr("adaptive"); - std::string data_format = context.Attr("data_format"); - bool global_pooling = context.Attr("global_pooling"); - std::string padding_algorithm = - context.Attr("padding_algorithm"); - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // update paddings - auto in_x_dims = in_x->dims(); - framework::DDim data_dims; - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); - if (data_dims.size() * 2 == static_cast(paddings.size())) { - for (int i = 0; i < data_dims.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - - auto& dev_ctx = context.template device_context(); - if (in_x_grad) { - in_x_grad->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_constant; - set_constant(dev_ctx, in_x_grad, static_cast(0.0)); - - switch (ksize.size()) { - case 2: { - if (pooling_type == "max") { - paddle::operators::math::MaxPool2dGradFunctor - pool2d_backward; - pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, data_format, in_x_grad); - } else if (pooling_type == "avg") { - paddle::operators::math::Pool2dGradFunctor< - DeviceContext, paddle::operators::math::AvgPoolGrad, T> - pool2d_backward; - paddle::operators::math::AvgPoolGrad pool_process; - pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, data_format, exclusive, adaptive, - in_x_grad, pool_process); - } - } break; - case 3: { - if (pooling_type == "max") { - paddle::operators::math::MaxPool3dGradFunctor - pool3d_backward; - pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, data_format, in_x_grad); - } else if (pooling_type == "avg") { - paddle::operators::math::Pool3dGradFunctor< - DeviceContext, paddle::operators::math::AvgPoolGrad, T> - pool3d_backward; - paddle::operators::math::AvgPoolGrad pool_process; - pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, data_format, exclusive, adaptive, - in_x_grad, pool_process); - } - } break; - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op only supports 2D and 3D input.")); - } - } - } - } -}; - -template -class PoolGradGradKernel : public PoolKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - std::string pooling_type = context.Attr("pooling_type"); - if (pooling_type == "max") { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op grad grad 
only supports avgpool.")); - } else { - PoolKernel::Compute(context); - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/pool_op_mlu.cc b/paddle/fluid/operators/pool_op_mlu.cc index 08656e64231b61181583cb700f2cc3216e25e516..fa88d128a9a1d572414a6459933a8988cae1fda0 100644 --- a/paddle/fluid/operators/pool_op_mlu.cc +++ b/paddle/fluid/operators/pool_op_mlu.cc @@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/pool_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace operators { @@ -80,10 +81,10 @@ class MLUPoolOpKernel : public framework::OpKernel { data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, adaptive, + padding_algorithm, data_dims, strides, ksize); if (global_pooling) { - UpdateKsize(&ksize, data_dims); + phi::funcs::UpdateKernelSize(&ksize, data_dims); } MLUCnnlTensorDesc in_x_desc(*in_x, cnnl_layout, ToCnnlDataType()); @@ -191,10 +192,10 @@ class MLUPoolGradOpKernel : public framework::OpKernel { data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, adaptive, + padding_algorithm, data_dims, strides, ksize); if (global_pooling) { - UpdateKsize(&ksize, data_dims); + phi::funcs::UpdateKernelSize(&ksize, data_dims); } // inputs need with NHWC layout diff --git a/paddle/fluid/operators/pool_op_npu.cc b/paddle/fluid/operators/pool_op_npu.cc index bd26d6350d9c300949edb1a90b244a7c747dd7a9..0efcb8b7981c32e9f8d5a04f4fd4122d6725a49e 100644 --- a/paddle/fluid/operators/pool_op_npu.cc +++ b/paddle/fluid/operators/pool_op_npu.cc @@ -11,8 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/pool_op.h" + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace operators { @@ -68,8 +70,8 @@ class NPUPoolOpKernel : public framework::OpKernel { strides_vec[2] = strides[0]; strides_vec[3] = strides[1]; } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, adaptive, + padding_algorithm, data_dims, strides, ksize); PADDLE_ENFORCE_LT( std::max(paddings[0], paddings[1]), ksize[0], platform::errors::InvalidArgument( @@ -201,8 +203,8 @@ class NPUPoolGradOpKernel : public framework::OpKernel { strides_vec[2] = strides[0]; strides_vec[3] = strides[1]; } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, adaptive, + padding_algorithm, data_dims, strides, ksize); PADDLE_ENFORCE_LT( std::max(paddings[0], paddings[1]), ksize[0], diff --git a/paddle/fluid/operators/pool_op_xpu.cc b/paddle/fluid/operators/pool_op_xpu.cc index 402dd6c10803947f73e593d215d28246a81c6706..87c437d8a78e0122b0fc4f5a7dbf51612e40fbf2 100644 --- a/paddle/fluid/operators/pool_op_xpu.cc +++ b/paddle/fluid/operators/pool_op_xpu.cc @@ -8,13 +8,17 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/pool_op.h" + #include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" #ifdef PADDLE_WITH_XPU namespace paddle { namespace operators { +using framework::Tensor; + xpu::Pooling_t XPUPoolingType(const std::string& pooltype, bool exclusive, bool is_test) { if (pooltype == "max") { diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc index d061f9ae05613491cbdbff3793b57a3d89d7d6e5..e0341f4a4b4716d0ee82c9437ddc4d8bd1e35fb2 100644 --- a/paddle/fluid/operators/pool_with_index_op.cc +++ b/paddle/fluid/operators/pool_with_index_op.cc @@ -12,8 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/pool_with_index_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -28,71 +32,6 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of Pooling should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of Pooling should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Mask"), true, - platform::errors::InvalidArgument( - "Output(Mask) of Pooling should not be null.")); - - auto in_x_dims = ctx->GetInputDim("X"); - - std::vector ksize = ctx->Attrs().Get>("ksize"); - std::vector strides = ctx->Attrs().Get>("strides"); - std::vector paddings = ctx->Attrs().Get>("paddings"); - bool adaptive = ctx->Attrs().Get("adaptive"); - - PADDLE_ENFORCE( - in_x_dims.size() == 4 || in_x_dims.size() == 5, - platform::errors::InvalidArgument("Pooling intput should be 4-D or 5-D " - "tensor but received %dD-Tensor", - in_x_dims.size())); - - if (ctx->Attrs().Get("global_pooling")) { - ksize.resize(static_cast(in_x_dims.size()) - 2); - for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; - ksize[i] = static_cast(in_x_dims[i + 2]); - } - } - - PADDLE_ENFORCE_EQ( - in_x_dims.size() - ksize.size(), 2U, - platform::errors::InvalidArgument( - "The input size %d minus the kernel size %d should equal to 2.", - in_x_dims.size(), ksize.size())); - PADDLE_ENFORCE_EQ( - ksize.size(), strides.size(), - platform::errors::InvalidArgument( - "Strides size %d and pooling size %d should be the same.", - strides.size(), ksize.size())); - PADDLE_ENFORCE_EQ( - ksize.size(), paddings.size(), - platform::errors::InvalidArgument( - "Paddings size %d and pooling size %d should be the same.", - paddings.size(), ksize.size())); - - std::vector output_shape({in_x_dims[0], in_x_dims[1]}); - if (adaptive) { - output_shape.insert(output_shape.end(), ksize.begin(), ksize.end()); - } else { - for (size_t i = 0; i < ksize.size(); ++i) { - if ((!ctx->IsRuntime()) && (in_x_dims[i + 2] < 0)) { - output_shape.push_back(in_x_dims[i + 2]); - } else { - output_shape.push_back(MaxPoolOutputSize(in_x_dims[i + 2], ksize[i], - paddings[i], strides[i])); - } - } - } - ctx->SetOutputDim("Out", phi::make_ddim(output_shape)); - ctx->SetOutputDim("Mask", phi::make_ddim(output_shape)); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -106,22 +45,6 @@ class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("Mask"), true, - platform::errors::InvalidArgument("Input(Mask) must not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), true, - platform::errors::InvalidArgument("Input(X) must not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, - platform::errors::InvalidArgument( - "Input(Out@GRAD) should not be null.")); - 
PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true, - platform::errors::InvalidArgument( - "Output(X@GRAD) should not be null.")); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -335,40 +258,34 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER( namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(max_pool2d_with_index, + MaxPool2dWithIndexInferShapeFunctor, + PD_INFER_META(phi::MaxPoolWithIndexInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(max_pool2d_with_index_grad, + MaxPool2dWithIndexGradInferShapeFunctor, + PD_INFER_META(phi::MaxPoolWithIndexGradInferMeta)); + REGISTER_OPERATOR(max_pool2d_with_index, ops::MaxPoolWithIndexOp, ops::MaxPool2dWithIndexOpMaker, ops::MaxPoolWithIndexGradOpMaker, - ops::MaxPoolWithIndexGradOpMaker); + ops::MaxPoolWithIndexGradOpMaker, + MaxPool2dWithIndexInferShapeFunctor); REGISTER_OPERATOR(max_pool2d_with_index_grad, ops::MaxPoolWithIndexOpGrad, - ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer); + ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer, + MaxPool2dWithIndexGradInferShapeFunctor); -REGISTER_OP_CPU_KERNEL( - max_pool2d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_CPU_KERNEL( - max_pool2d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel); +DECLARE_INFER_SHAPE_FUNCTOR(max_pool3d_with_index, + MaxPool3dWithIndexInferShapeFunctor, + PD_INFER_META(phi::MaxPoolWithIndexInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(max_pool3d_with_index_grad, + MaxPool3dWithIndexGradInferShapeFunctor, + PD_INFER_META(phi::MaxPoolWithIndexGradInferMeta)); REGISTER_OPERATOR(max_pool3d_with_index, ops::MaxPoolWithIndexOp, ops::MaxPool3dWithIndexOpMaker, ops::MaxPoolWithIndexGradOpMaker, - ops::MaxPoolWithIndexGradOpMaker); + ops::MaxPoolWithIndexGradOpMaker, + MaxPool3dWithIndexInferShapeFunctor); REGISTER_OPERATOR(max_pool3d_with_index_grad, ops::MaxPoolWithIndexOpGrad, - ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer); - -REGISTER_OP_CPU_KERNEL( - max_pool3d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_CPU_KERNEL( - max_pool3d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel); + ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer, + MaxPool3dWithIndexGradInferShapeFunctor); diff --git a/paddle/fluid/operators/pool_with_index_op.cu.cc b/paddle/fluid/operators/pool_with_index_op.cu.cc deleted file mode 100644 index 5497dcbd9ce255f833df24989d7a76c40bcbca06..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/pool_with_index_op.cu.cc +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/pool_with_index_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - max_pool2d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_CUDA_KERNEL( - max_pool2d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel); - -REGISTER_OP_CUDA_KERNEL( - max_pool3d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_CUDA_KERNEL( - max_pool3d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel); diff --git a/paddle/fluid/operators/pool_with_index_op.h b/paddle/fluid/operators/pool_with_index_op.h deleted file mode 100644 index 6e51a833f5c89efc2621c0ccc3d08dc42b2733a1..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/pool_with_index_op.h +++ /dev/null @@ -1,121 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/pooling.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class MaxPoolWithIndexKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - Tensor* out = context.Output("Out"); - Tensor* mask = context.Output("Mask"); - - std::vector ksize = context.Attr>("ksize"); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - bool adaptive = context.Attr("adaptive"); - - auto& dev_ctx = context.template device_context(); - if (context.Attr("global_pooling")) { - for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; - ksize[i] = static_cast(in_x->dims()[i + 2]); - } - } - - switch (ksize.size()) { - case 2: { - paddle::operators::math::MaxPool2dWithIndexFunctor - pool2d_forward; - pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out, - mask); - } break; - case 3: { - paddle::operators::math::MaxPool3dWithIndexFunctor - pool3d_forward; - pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out, - mask); - } break; - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op only supports 2D and 3D input.")); - } - } - } -}; - -template -class MaxPoolWithIndexGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* mask = context.Input("Mask"); - const Tensor* out_grad = - context.Input(framework::GradVarName("Out")); - Tensor* in_x_grad = context.Output(framework::GradVarName("X")); - - std::vector ksize = context.Attr>("ksize"); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - bool adaptive = context.Attr("adaptive"); - if 
(context.Attr("global_pooling")) { - for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; - ksize[i] = static_cast(in_x_grad->dims()[i + 2]); - } - } - - if (in_x_grad) { - in_x_grad->mutable_data(context.GetPlace()); - auto& device_ctx = context.template device_context(); - phi::funcs::set_constant(device_ctx, in_x_grad, 0); - - switch (ksize.size()) { - case 2: { - paddle::operators::math::MaxPool2dWithIndexGradFunctor - pool2d_backward; - pool2d_backward(device_ctx, *out_grad, *mask, ksize, strides, - paddings, adaptive, in_x_grad); - } break; - case 3: { - paddle::operators::math::MaxPool3dWithIndexGradFunctor - pool3d_backward; - pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides, - paddings, adaptive, in_x_grad); - } break; - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op only supports 2D and 3D input.")); - } - } - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index 9bd6ae8bab829e2f200f697e10e7e54f398f8d73..de35f67405810180554bfd556f91b7501f9c4ba2 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -9,14 +9,39 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/prelu_op.h" - #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + +framework::OpKernelType innerGetKernelTypeForVar( + const Tensor &tensor, const framework::OpKernelType &expected_kernel_type) { +#ifdef PADDLE_WITH_MKLDNN + auto isOneDNNKernelChosen = + (expected_kernel_type.data_layout_ == framework::DataLayout::kMKLDNN); + auto isNotOneDNNTensor = (tensor.layout() != framework::DataLayout::kMKLDNN); + auto isModelNHWC = + (paddle::platform::MKLDNNDeviceContext::tls() + .get_cur_paddle_data_layout() == framework::DataLayout::kNHWC); + // All inputs (including alpha) need shape rotating + if (isOneDNNKernelChosen && isNotOneDNNTensor && isModelNHWC) { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), + framework::DataLayout::kNHWC); + } +#endif + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); +} + class PReluOp : public framework::OperatorWithKernel { public: PReluOp(const std::string &type, const framework::VariableNameMap &inputs, @@ -24,95 +49,6 @@ class PReluOp : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "prelu"); - OP_INOUT_CHECK(ctx->HasInput("Alpha"), "Input", "Alpha", "prelu"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "prelu"); - - auto x_dim = ctx->GetInputDim("X"); - std::string mode = ctx->Attrs().Get("mode"); - if (mode == "all") { - PADDLE_ENFORCE_EQ(phi::product(ctx->GetInputDim("Alpha")), 1, - platform::errors::InvalidArgument( - "For mode 'all', size of weight Alpha must be one. 
" - "But recevied alpha's size: %d.", - product(ctx->GetInputDim("Alpha")))); - } else if (mode == "channel") { - auto x_rank = x_dim.size(); - PADDLE_ENFORCE_GE(x_rank, 2, - platform::errors::InvalidArgument( - "For mode 'channel', rank of input X must be " - "equal or larger than 2. But recevied X's " - "rank: %d", - x_rank)); - const std::string data_format_str = - ctx->Attrs().Get("data_format"); - PADDLE_ENFORCE_EQ(data_format_str == "NCHW" || data_format_str == "NHWC", - true, - platform::errors::InvalidArgument( - "For mode 'channel', data_format must be one of " - "NCHW and NHWC. But recevied data_format: %s", - data_format_str)); - if (data_format_str == "NCHW") { - PADDLE_ENFORCE_EQ( - product(ctx->GetInputDim("Alpha")) == x_dim[1], true, - platform::errors::InvalidArgument( - "For mode 'channel', size of weight Alpha must be " - "equal to the number of channels of input(x). But " - "recevied alpha's size: %d, x_dim[1]: %d", - product(ctx->GetInputDim("Alpha")), x_dim[1])); - } else { - PADDLE_ENFORCE_EQ( - product(ctx->GetInputDim("Alpha")) == x_dim[x_rank - 1], true, - platform::errors::InvalidArgument( - "For mode 'channel', size of weight Alpha must be " - "equal to the number of channels of input(x). But " - "recevied alpha's size: %d, x_dim[%d]: %d", - product(ctx->GetInputDim("Alpha")), x_rank - 1, - x_dim[x_rank - 1])); - } - - } else if (mode == "element") { - auto alpha_dim = ctx->GetInputDim("Alpha"); - auto alpha_rank = alpha_dim.size(); - auto x_rank = x_dim.size(); - PADDLE_ENFORCE_GE(x_rank, 1, - platform::errors::InvalidArgument( - "For mode 'element', rank of input X must be " - "equal or larger than 2. But recevied X's " - "rank: %d", - x_rank)); - PADDLE_ENFORCE_EQ( - alpha_rank, x_rank, - platform::errors::InvalidArgument( - "For mode 'element', rank of weight Alpha must be ", - "equal to the rank of input(x). But recevied alpha's rank: %d, " - "x's rank: %d.", - alpha_rank, x_rank)); - size_t x_product = 1; - size_t alpha_product = 1; - for (int64_t i = x_rank - 1; i > 0; i--) { - x_product *= x_dim[i]; - alpha_product *= alpha_dim[i]; - } - PADDLE_ENFORCE_EQ( - alpha_product, x_product, - platform::errors::InvalidArgument( - "For mode 'element', the size of weight Alpha must be " - "equal to the size of input(x). But recevied alpha's size: %d, " - "x's size: %d.", - alpha_product, x_product)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Attr(mode) of prelu must be one of 'all', 'channel', or 'element'. 
" - "But recevied " - "mode: '%s'.", - mode)); - } - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -128,6 +64,12 @@ class PReluOp : public framework::OperatorWithKernel { #endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } + + framework::OpKernelType GetKernelTypeForVar( + const std::string &var_name, const Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const { + return innerGetKernelTypeForVar(tensor, expected_kernel_type); + } }; class PReluOpMaker : public framework::OpProtoAndCheckerMaker { @@ -212,6 +154,12 @@ class PReluGradOp : public framework::OperatorWithKernel { #endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } + + framework::OpKernelType GetKernelTypeForVar( + const std::string &var_name, const Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const { + return innerGetKernelTypeForVar(tensor, expected_kernel_type); + } }; template @@ -236,13 +184,10 @@ class PReluGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(prelu, PReluInferShapeFunctor, + PD_INFER_META(phi::PReluInferMeta)); REGISTER_OPERATOR(prelu, ops::PReluOp, ops::PReluOpMaker, ops::PReluGradOpMaker, - ops::PReluGradOpMaker); + ops::PReluGradOpMaker, + PReluInferShapeFunctor); REGISTER_OPERATOR(prelu_grad, ops::PReluGradOp); -REGISTER_OP_CPU_KERNEL( - prelu, ops::PReluKernel, - ops::PReluKernel); -REGISTER_OP_CPU_KERNEL( - prelu_grad, ops::PReluGradKernel, - ops::PReluGradKernel); diff --git a/paddle/fluid/operators/prelu_op.cu b/paddle/fluid/operators/prelu_op.cu deleted file mode 100644 index 12e55d042d7037606179cc06480e4f80f942d8a2..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/prelu_op.cu +++ /dev/null @@ -1,208 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/prelu.h" -#include "paddle/fluid/operators/prelu_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -#define CUDA_NUM_THREADS 1024 - -inline static int PADDLE_GET_BLOCKS(const int N) { - return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; -} - -template -class CUDAPReluKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* alpha = context.Input("Alpha"); - auto* out = context.Output("Out"); - - const T* x_ptr = x->data(); - T* o_ptr = out->mutable_data(context.GetPlace()); - - const T* alpha_ptr = alpha->data(); - auto& mode = context.Attr("mode"); - auto& data_format = context.Attr("data_format"); - - int numel = x->numel(); - auto dim = x->dims(); - auto x_rank = dim.size(); - - VLOG(4) << "dim[0]:" << dim[0] << ", dim[1]:" << dim[1] << ", dim[" - << x_rank - 1 << "]:" << dim[x_rank - 1] << ", numel:" << numel; - - if (mode == "channel") { - bool channel_last = data_format == "NHWC"; - size_t channel = channel_last ? dim[x_rank - 1] : dim[1]; - math::PreluChannelWiseDirectCUDAFunctor prelu_channel_wise; - prelu_channel_wise(context.cuda_device_context().stream(), x_ptr, - alpha_ptr, o_ptr, dim[0], channel, channel_last, - numel); - } else if (mode == "element") { - math::PreluElementWiseDirectCUDAFunctor prelu_element_wise; - prelu_element_wise(context.cuda_device_context().stream(), x_ptr, - alpha_ptr, o_ptr, dim[0], numel); - } else { - math::PreluScalarDirectCUDAFunctor prelu_scalar; - prelu_scalar(context.cuda_device_context().stream(), x_ptr, alpha_ptr, - o_ptr, numel); - } - } -}; - -enum PRELU_MODE { Element, ChannelFirst, ChannelLast, Scalar }; - -template -__global__ void PReluOpGradKernel(const T* x_ptr, const T* alpha_ptr, - const T* dy_ptr, T* dx_ptr, T* dalpha_ptr, - size_t channel_num, size_t plane_size, - size_t spatial_size, size_t numel, - PRELU_MODE mode) { - CUDA_KERNEL_LOOP(index, numel) { - T scale; - if (mode == Element) { - size_t element_index = index % spatial_size; - scale = alpha_ptr[element_index]; - } else if (mode == ChannelFirst) { - size_t temp = index / plane_size; - size_t channel_index = temp % channel_num; - scale = alpha_ptr[channel_index]; - } else if (mode == ChannelLast) { - size_t channel_index = index % channel_num; - scale = alpha_ptr[channel_index]; - } else { - scale = alpha_ptr[0]; - } - T x = x_ptr[index]; - T dy = dy_ptr[index]; - T zero = static_cast(0); - if (dx_ptr != nullptr) dx_ptr[index] = (x > zero) ? dy : scale * dy; - if (dalpha_ptr != nullptr) dalpha_ptr[index] = (x > zero) ? zero : x * dy; - } -} - -template -class PreluOpGradFunctor { - public: - void operator()(gpuStream_t stream, const T* x, const T* alpha, const T* dy, - T* dx, T* dalpha, const framework::DDim& input_dims, - PRELU_MODE mode) { - size_t numel = 1; - for (size_t i = 0; i < input_dims.size(); ++i) { - numel *= input_dims[i]; - } - size_t plane_size = numel / input_dims[0] / input_dims[1]; - size_t spatial_size = numel / input_dims[0]; - size_t channel = - mode == ChannelLast ? 
input_dims[input_dims.size() - 1] : input_dims[1]; - - PReluOpGradKernel< - T><<>>( - x, alpha, dy, dx, dalpha, channel, plane_size, spatial_size, numel, - mode); - } -}; - -template -class CUDAPReluGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* alpha = context.Input("Alpha"); - auto* dx = context.Output(framework::GradVarName("X")); - auto* dy = context.Input(framework::GradVarName("Out")); - auto* dalpha = context.Output(framework::GradVarName("Alpha")); - - const T* x_ptr = x->data(); - const T* alpha_ptr = alpha->data(); - const T* dy_ptr = dy->data(); - T* dx_ptr = dx ? dx->mutable_data(context.GetPlace()) : nullptr; - T* dalpha_ptr = - dalpha ? dalpha->mutable_data(context.GetPlace()) : nullptr; - - if (!dx && !dalpha) return; - - auto& mode = context.Attr("mode"); - auto& data_format = context.Attr("data_format"); - - int numel = x->numel(); - auto dim = x->dims(); - auto x_rank = dim.size(); - std::vector input_shape = phi::vectorize(dim); - auto stream = context.cuda_device_context().stream(); - - T* dalpha_tmp_ptr; - Tensor dalpha_tmp; - if (dalpha_ptr == nullptr) { - dalpha_tmp_ptr = dalpha_ptr; - } else { - auto& dev_ctx = context.template device_context(); - dalpha_tmp = context.AllocateTmpTensor(dim, dev_ctx); - dalpha_tmp_ptr = dalpha_tmp.mutable_data(context.GetPlace()); - } - - PRELU_MODE m; - bool channel_last = false; - if (mode == "element") { - m = Element; - } else if (mode == "channel") { - channel_last = data_format == "NHWC"; - m = channel_last ? ChannelLast : ChannelFirst; - } else { - m = Scalar; - } - PreluOpGradFunctor prelu_grad; - prelu_grad(stream, x_ptr, alpha_ptr, dy_ptr, dx_ptr, dalpha_tmp_ptr, dim, - m); - - if (dalpha_tmp_ptr == nullptr) return; - - std::vector reduce_dims; - for (size_t i = 0; i < dim.size(); i++) { - if (mode == "channel" && !channel_last && i == 1) continue; - if (mode == "channel" && channel_last && i == dim.size() - 1) continue; - if (mode == "element" && i != 0) continue; - reduce_dims.push_back(i); - } - - TensorReduceImpl>( - context.cuda_device_context(), dalpha_tmp, dalpha, - kps::IdentityFunctor(), reduce_dims, stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - prelu, ops::CUDAPReluKernel, - ops::CUDAPReluKernel, - ops::CUDAPReluKernel); -REGISTER_OP_CUDA_KERNEL( - prelu_grad, - ops::CUDAPReluGradKernel, - ops::CUDAPReluGradKernel, - ops::CUDAPReluGradKernel); diff --git a/paddle/fluid/operators/prelu_op.h b/paddle/fluid/operators/prelu_op.h deleted file mode 100644 index 384994eb37c2a955c383ddeebafe5f0e64d3c961..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/prelu_op.h +++ /dev/null @@ -1,172 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/transform.h" -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using platform::Transform; - -template -class PReluKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* alpha = context.Input("Alpha"); - auto* out = context.Output("Out"); - - const T* x_ptr = x->data(); - T* o_ptr = out->mutable_data(context.GetPlace()); - - const T* alpha_ptr = alpha->data(); - auto& mode = context.Attr("mode"); - auto& data_format = context.Attr("data_format"); - - int numel = x->numel(); - auto dim = x->dims(); - int index = 0; - int i = 0; - if (mode == "channel") { - if (data_format == "NCHW") { - int temp = 1; - for (int j = 2; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = (i / temp) % dim[1]; - o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - index = i % dim[dim.size() - 1]; - o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; - } - } - } else if (mode == "element") { - int temp = 1; - for (int j = 1; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = i % temp; - o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[0] * x_ptr[i]; - } - } - } -}; - -template -class PReluGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* dx = context.Output(framework::GradVarName("X")); - auto* dout = context.Input(framework::GradVarName("Out")); - auto* dalpha = context.Output(framework::GradVarName("Alpha")); - auto* alpha = context.Input("Alpha"); - const T* alpha_ptr = alpha->data(); - const T* x_ptr = x->data(); - const T* dout_ptr = dout->data(); - std::string mode = context.Attr("mode"); - auto& data_format = context.Attr("data_format"); - int numel = x->numel(); - auto dim = x->dims(); - int index = 0; - int i = 0; - if (dx) { - T* dx_ptr = dx->mutable_data(context.GetPlace()); - if (mode == "channel") { - if (data_format == "NCHW") { - int temp = 1; - for (int j = 2; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = (i / temp) % dim[1]; - dx_ptr[i] = - x_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[index] * dout_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - index = i % dim[dim.size() - 1]; - dx_ptr[i] = - x_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[index] * dout_ptr[i]; - } - } - } else if (mode == "element") { - int temp = 1; - for (int j = 1; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = i % temp; - dx_ptr[i] = - x_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[index] * dout_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - dx_ptr[i] = x_ptr[i] > 0 ? 
dout_ptr[i] : alpha_ptr[0] * dout_ptr[i]; - } - } - } - - index = 0; - if (dalpha) { - T* dalpha_ptr = dalpha->mutable_data(context.GetPlace()); - memset(dalpha_ptr, 0, sizeof(T) * dalpha->numel()); - - if (mode == "channel") { - if (data_format == "NCHW") { - int temp = 1; - for (int j = 2; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = (i / temp) % dim[1]; - dalpha_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - index = i % dim[dim.size() - 1]; - dalpha_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i]; - } - } - } else if (mode == "element") { - int temp = 1; - for (int j = 1; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = i % temp; - dalpha_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - dalpha_ptr[0] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i]; - } - } - } - - // TODO(Guanzhong): add GPU kernels - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/psroi_pool_op.cc b/paddle/fluid/operators/psroi_pool_op.cc index da637dfeb237dd4f17816e784882720dc2f2ff64..cfacffff234105ac9c6dc41b86f06594d319dcbb 100644 --- a/paddle/fluid/operators/psroi_pool_op.cc +++ b/paddle/fluid/operators/psroi_pool_op.cc @@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/psroi_pool_op.h" -#include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - class PSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -82,75 +82,6 @@ class PSROIPoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of PSROIPoolOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("ROIs"), true, - platform::errors::InvalidArgument( - "Input(ROIs) of PSROIPoolOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of PSROIPoolOp should not be null.")); - auto input_dims = ctx->GetInputDim("X"); - auto rois_dims = ctx->GetInputDim("ROIs"); - - PADDLE_ENFORCE_EQ(input_dims.size(), 4, - platform::errors::InvalidArgument( - "The format of input tensor is NCHW")); - PADDLE_ENFORCE_EQ( - rois_dims.size(), 2, - platform::errors::InvalidArgument( - "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " - "given as [(x1, y1, x2, y2), ...]")); - PADDLE_ENFORCE_EQ( - rois_dims[1], 4, - platform::errors::InvalidArgument( - "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " - "given as [(x1, y1, x2, y2), ...]")); - if (ctx->HasInput("RoisNum")) { - auto rois_num_dims = ctx->GetInputDim("RoisNum"); - PADDLE_ENFORCE_EQ(rois_num_dims.size(), 1, - platform::errors::InvalidArgument( - "The second dimension of RoisNum should " - "be 1, but received dimension is %d", - rois_num_dims.size())); - } - int 
pooled_height = ctx->Attrs().Get("pooled_height"); - int pooled_width = ctx->Attrs().Get("pooled_width"); - int output_channels = ctx->Attrs().Get("output_channels"); - float spatial_scale = ctx->Attrs().Get("spatial_scale"); - - PADDLE_ENFORCE_EQ( - input_dims[1], output_channels * pooled_height * pooled_width, - platform::errors::InvalidArgument( - "the channel of X(%d) " - "should be equal to the product of " - "output_channels(%d), pooled_height(%d) and pooled_width(%d)", - input_dims[1], output_channels, pooled_height, pooled_width)); - - PADDLE_ENFORCE_GT(pooled_height, 0, - platform::errors::InvalidArgument( - "The pooled output height must be greater than 0")); - PADDLE_ENFORCE_GT(pooled_width, 0, - platform::errors::InvalidArgument( - "The pooled output width must be greater than 0")); - PADDLE_ENFORCE_GT(output_channels, 1, - platform::errors::InvalidArgument( - "The pooled output channels must greater than 1")); - PADDLE_ENFORCE_GT(spatial_scale, 0.0f, - platform::errors::InvalidArgument( - "The spatial scale must greater than 0.")); - - auto out_dims = input_dims; - out_dims[0] = rois_dims[0]; - out_dims[1] = - output_channels; // input_dims[1] / (pooled_height * pooled_width); - out_dims[2] = pooled_height; - out_dims[3] = pooled_width; - ctx->SetOutputDim("Out", out_dims); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -164,16 +95,6 @@ class PSROIPoolGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, - platform::errors::InvalidArgument( - "The gradient of Out should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true, - platform::errors::InvalidArgument( - "The gradient of X should not be null.")); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -204,15 +125,13 @@ class PSROIPoolGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(psroi_pool, PsroiPoolInferShapeFunctor, + PD_INFER_META(phi::PsroiPoolInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(psroi_pool_grad, PsroiPoolGradInferShapeFunctor, + PD_INFER_META(phi::PsroiPoolGradInferMeta)); REGISTER_OPERATOR(psroi_pool, ops::PSROIPoolOp, ops::PSROIPoolOpMaker, ops::PSROIPoolGradMaker, - ops::PSROIPoolGradMaker); -REGISTER_OPERATOR(psroi_pool_grad, ops::PSROIPoolGradOp); -REGISTER_OP_CPU_KERNEL( - psroi_pool, - ops::CPUPSROIPoolOpKernel, - ops::CPUPSROIPoolOpKernel); -REGISTER_OP_CPU_KERNEL( - psroi_pool_grad, - ops::CPUPSROIPoolGradOpKernel, - ops::CPUPSROIPoolGradOpKernel); + ops::PSROIPoolGradMaker, + PsroiPoolInferShapeFunctor); +REGISTER_OPERATOR(psroi_pool_grad, ops::PSROIPoolGradOp, + PsroiPoolGradInferShapeFunctor); diff --git a/paddle/fluid/operators/psroi_pool_op.cu b/paddle/fluid/operators/psroi_pool_op.cu deleted file mode 100644 index c1917501db8b5afebf4b7951b0f04de69758b49d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/psroi_pool_op.cu +++ /dev/null @@ -1,350 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/psroi_pool_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -static constexpr int kNumCUDAThreads = 512; -static constexpr int kNumMaximumNumBlocks = 4096; - -static inline int NumBlocks(const int N) { - return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, - kNumMaximumNumBlocks); -} - -template -__global__ void GPUPSROIPoolForward( - const int nthreads, const T* input_data, const T* input_rois, - const float spatial_scale, const int input_channels, const int height, - const int width, const int output_channels, const int pooled_height, - const int pooled_width, const int* rois_batch_id_data, T* output_data) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (size_t i = index; i < nthreads; i += offset) { - // The output is in order (n, c, ph, pw) - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % output_channels; - int n = i / pooled_width / pooled_height / output_channels; - - // set roi_batch_id - int roi_batch_id = rois_batch_id_data[n]; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = static_cast(round(offset_input_rois[0])) * spatial_scale; - T roi_start_h = static_cast(round(offset_input_rois[1])) * spatial_scale; - T roi_end_w = - static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; - T roi_end_h = - static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; - - // Force too small ROIs to be 1x1 - T roi_height = max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 - T roi_width = max(roi_end_w - roi_start_w, (T)0.1); - - // Compute w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); - int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); - int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); - int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); - - // Add roi offsets and clip to input boundaries - hstart = min(max(hstart, 0), height); - hend = min(max(hend, 0), height); - wstart = min(max(wstart, 0), width); - wend = min(max(wend, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - int input_channel = (c * pooled_height + ph) * pooled_width + pw; - const T* offset_input_data = - input_data + - (roi_batch_id * input_channels + input_channel) * height * width; - T outsum = 0; - - for (int ih = hstart; ih < hend; ++ih) { - for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * width + iw; - outsum += offset_input_data[input_index]; - } - } - - T bin_area = static_cast((hend - hstart) * (wend - wstart)); - output_data[i] = is_empty ? 0. 
: outsum / bin_area; - } -} - -template -__global__ void GPUPSROIPoolBackward( - const int nthreads, const T* input_rois, const T* output_grad_data, - const float spatial_scale, const int input_channels, const int height, - const int width, const int output_channels, const int pooled_height, - const int pooled_width, const int* rois_batch_id_data, T* input_grad_data) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (int i = index; i < nthreads; i += offset) { - // The output is in order (n, c, ph, pw) - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % output_channels; - int n = i / pooled_width / pooled_height / output_channels; - - // set roi_batch_id - int roi_batch_id = rois_batch_id_data[n]; - int input_channel = (c * pooled_height + ph) * pooled_width + pw; - int input_offset = - (roi_batch_id * input_channels + input_channel) * height * width; - T* offset_input_grad_data = input_grad_data + input_offset; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = static_cast(round(offset_input_rois[0])) * spatial_scale; - T roi_start_h = static_cast(round(offset_input_rois[1])) * spatial_scale; - T roi_end_w = - static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; - T roi_end_h = - static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; - - // Force too small ROIs to be 1x1 - T roi_height = max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 - T roi_width = max(roi_end_w - roi_start_w, (T)0.1); - - // Compute w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); - int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); - int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); - int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); - - // Add roi offsets and clip to input boundaries - hstart = min(max(hstart, 0), height); - hend = min(max(hend, 0), height); - wstart = min(max(wstart, 0), width); - wend = min(max(wend, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - // Accumulate diff_val into input data - T bin_area = static_cast((hend - hstart) * (wend - wstart)); - T diff_val = is_empty ? 0. 
: output_grad_data[i] / bin_area; - for (int ih = hstart; ih < hend; ++ih) { - for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * width + iw; - platform::CudaAtomicAdd(offset_input_grad_data + input_index, diff_val); - } - } - } -} - -template -class GPUPSROIPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto output_channels = ctx.Attr("output_channels"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int input_channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - - PADDLE_ENFORCE_EQ( - input_channels, output_channels * pooled_height * pooled_width, - platform::errors::InvalidArgument( - "The channels %d of input X should equal the product of " - "output_channels %d x pooled_height %d x pooled_width %d.", - input_channels, output_channels, pooled_height, pooled_width)); - - int rois_num = rois->dims()[0]; - if (rois_num == 0) return; - int rois_batch_size; - framework::Tensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(platform::CPUPlace()); - - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - auto* rois_num_data = rois_num_t->data(); - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of input(ROIs) and input(X) must be " - "the same but received batch size of input(ROIs) and " - "input(X) is %d and %d respectively.", - rois_batch_size, batch_size)); - std::vector rois_num_list(rois_batch_size); - memory::Copy(platform::CPUPlace(), rois_num_list.data(), ctx.GetPlace(), - rois_num_data, sizeof(int) * rois_batch_size, 0); - int rois_num_count = 0; - for (int i = 0; i < rois_batch_size; ++i) { - rois_num_count += rois_num_list[i]; - } - PADDLE_ENFORCE_EQ( - rois_num_count, rois_num, - platform::errors::InvalidArgument( - "the rois_num from input and RoisNum must be the same")); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_list[n]; ++i) { - rois_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of input(ROIs) and input(X) must be " - "the same but received batch size of input(ROIs) and " - "input(X) is %d and %d respectively.", - rois_batch_size, batch_size)); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod, - platform::errors::InvalidArgument( - "The number of rois from input(ROIs) and its LOD " - "must be the same. 
Received rois %d of input(ROIs) " - "but the number of rois %d from its LOD is %d", - rois_num, rois_num_with_lod)); - - // set rois batch id - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - framework::Tensor rois_batch_id_list_gpu; - framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(), - ctx.device_context(), &rois_batch_id_list_gpu); - - int output_size = out->numel(); - int blocks = NumBlocks(output_size); - int threads = kNumCUDAThreads; - - // call cuda kernel function - GPUPSROIPoolForward< - T><<>>( - output_size, in->data(), rois->data(), spatial_scale, - input_channels, height, width, output_channels, pooled_height, - pooled_width, rois_batch_id_list_gpu.data(), - out->mutable_data(ctx.GetPlace())); - } -}; - -template -class GPUPSROIPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - - auto* output_grad = ctx.Input(framework::GradVarName("Out")); - auto* input_grad = ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto output_channels = ctx.Attr("output_channels"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - int rois_num = rois->dims()[0]; - int input_channels = in->dims()[1]; - int height = in->dims()[2]; - int width = in->dims()[3]; - - if (input_grad) { - // set roi batch id - framework::Tensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(platform::CPUPlace()); - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - std::vector rois_num_list(rois_batch_size); - memory::Copy(platform::CPUPlace(), rois_num_list.data(), ctx.GetPlace(), - rois_num_t->data(), sizeof(int) * rois_batch_size, 0); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_list[n]; ++i) { - rois_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - framework::Tensor rois_batch_id_list_gpu; - framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(), - ctx.device_context(), &rois_batch_id_list_gpu); - - input_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(ctx.cuda_device_context(), input_grad, static_cast(0)); - - int output_grad_size = output_grad->numel(); - int blocks = NumBlocks(output_grad_size); - int threads = kNumCUDAThreads; - - if (output_grad_size > 0) { - GPUPSROIPoolBackward< - T><<>>( - output_grad_size, rois->data(), output_grad->data(), - spatial_scale, input_channels, height, width, output_channels, - pooled_height, pooled_width, rois_batch_id_list_gpu.data(), - input_grad->mutable_data(ctx.GetPlace())); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - psroi_pool, - ops::GPUPSROIPoolOpKernel, - ops::GPUPSROIPoolOpKernel); -REGISTER_OP_CUDA_KERNEL( - psroi_pool_grad, - ops::GPUPSROIPoolGradOpKernel, - ops::GPUPSROIPoolGradOpKernel); diff --git 
a/paddle/fluid/operators/psroi_pool_op.h b/paddle/fluid/operators/psroi_pool_op.h deleted file mode 100644 index 3f020d93391b0e648898c1b83858a7bd9809aa03..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/psroi_pool_op.h +++ /dev/null @@ -1,295 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class CPUPSROIPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto output_channels = ctx.Attr("output_channels"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int input_channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - PADDLE_ENFORCE_EQ(input_channels, - output_channels * pooled_height * pooled_width, - platform::errors::InvalidArgument( - "the channels of input " - "X should equal the product of " - "output_channels x pooled_height x pooled_width")); - - auto in_stride = phi::stride(in_dims); - auto out_stride = phi::stride(out->dims()); - - const T* input_data = in->data(); - - framework::Tensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(ctx.GetPlace()); - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - auto* rois_num_data = rois_num_t->data(); - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of rois and the batch size of images " - " must be the same. 
But received the batch size of rois is %d, " - "and the batch size of images is %d", - rois_batch_size, batch_size)); - int rois_num_count = 0; - for (int i = 0; i < rois_batch_size; ++i) { - rois_num_count += rois_num_data[i]; - } - PADDLE_ENFORCE_EQ( - rois_num_count, rois_num, - platform::errors::InvalidArgument( - "the rois_num from input and RoisNum must be the same")); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - rois_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument("the rois_batch_size and input(X) " - "batch_size should be the same.")); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ( - rois_num_with_lod, rois_num, - platform::errors::InvalidArgument( - "the rois_num from input and lod must be the same")); - // calculate batch id index for each roi according to LoD - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - T* output_data = out->mutable_data(ctx.GetPlace()); - const T* input_rois = rois->data(); - - // calculate psroipooling, parallel processing can be implemented per ROI - for (int n = 0; n < rois_num; ++n) { - // set roi batch id - int roi_batch_id = rois_batch_id_data[n]; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = - static_cast(round(offset_input_rois[0])) * spatial_scale; - T roi_start_h = - static_cast(round(offset_input_rois[1])) * spatial_scale; - T roi_end_w = - static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; - T roi_end_h = - static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; - // Force too small rois to be 1 x 1 - T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 - T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1); - - // Compute bin size w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - // calculate each pixel of the output feature map. 
- int out_roi_offset = n * out_stride[0]; - for (int c = 0; c < output_channels; ++c) { - // per category - int out_plane_offset = out_roi_offset + c * out_stride[1]; - for (int ph = 0; ph < pooled_height; ++ph) { - int out_row_offset = out_plane_offset + ph * out_stride[2]; - for (int pw = 0; pw < pooled_width; ++pw) { - // calculate w and h at input feature map - int hstart = floor(static_cast(ph) * bin_size_h + roi_start_h); - int wstart = floor(static_cast(pw) * bin_size_w + roi_start_w); - int hend = ceil(static_cast(ph + 1) * bin_size_h + roi_start_h); - int wend = ceil(static_cast(pw + 1) * bin_size_w + roi_start_w); - // Add roi offsets and clip to input boundaries - hstart = std::min(std::max(hstart, 0), height); - wstart = std::min(std::max(wstart, 0), width); - hend = std::min(std::max(hend, 0), height); - wend = std::min(std::max(wend, 0), width); - - int output_index = out_row_offset + pw; - int input_channel = (c * pooled_height + ph) * pooled_width + pw; - int input_plane_offset = - roi_batch_id * in_stride[0] + input_channel * in_stride[1]; - const T* offset_input_data = input_data + input_plane_offset; - T out_sum = 0.; - bool is_empty = (hend <= hstart) || (wend <= wstart); - for (int ih = hstart; ih < hend; ++ih) { - for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * in_stride[2] + iw; - out_sum += offset_input_data[input_index]; - } - } - T bin_area = (hend - hstart) * (wend - wstart); - output_data[output_index] = is_empty ? 0. : out_sum / bin_area; - } - } - } - } - return; - } -}; - -template -class CPUPSROIPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* output_grad = - ctx.Input(framework::GradVarName("Out")); - auto* input_grad = - ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto output_channels = ctx.Attr("output_channels"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - if (input_grad) { - auto in_dims = in->dims(); - int input_channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - // set roi batch id - framework::Tensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(ctx.GetPlace()); - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - auto* rois_num_data = rois_num_t->data(); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - rois_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - // calculate batch id index for each roi according to LoD - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - const T* input_rois = rois->data(); - const T* output_grad_data = output_grad->data(); - T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); - - // set gradient of X to be 0. before backpropagate. 
- phi::funcs::SetConstant set_zero; - set_zero(ctx.template device_context(), input_grad, - static_cast(0)); - - // backpropagate gradient per output pixel - int output_grad_size = output_grad->numel(); - for (int i = 0; i < output_grad_size; ++i) { - // The output is in order (n, c, ph, pw) - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % output_channels; - int n = i / pooled_width / pooled_height / output_channels; - - // set roi_batch_id - int roi_batch_id = rois_batch_id_data[n]; - int input_channel = (c * pooled_height + ph) * pooled_width + pw; - int input_offset = - (roi_batch_id * input_channels + input_channel) * height * width; - T* offset_input_grad_data = input_grad_data + input_offset; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = - static_cast(round(offset_input_rois[0])) * spatial_scale; - T roi_start_h = - static_cast(round(offset_input_rois[1])) * spatial_scale; - T roi_end_w = - static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; - T roi_end_h = - static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; - - // Force too small ROIs to be 1x1 - T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 - T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1); - - // Compute w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); - int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); - int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); - int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); - - // Add roi offsets and clip to input boundaries - hstart = std::min(std::max(hstart, 0), height); - hend = std::min(std::max(hend, 0), height); - wstart = std::min(std::max(wstart, 0), width); - wend = std::min(std::max(wend, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - // Accumulate diff_val into input data - T bin_area = static_cast((hend - hstart) * (wend - wstart)); - T diff_val = is_empty ? 0. 
: output_grad_data[i] / bin_area; - for (int ih = hstart; ih < hend; ++ih) { - for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * width + iw; - offset_input_grad_data[input_index] += diff_val; - } - } - } - } - return; - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/qr_op.cc b/paddle/fluid/operators/qr_op.cc index 40e3cbde3b00917ee5952b8aebd412b357683018..82fc9ef1b7858992c49f537ce8608856ef6b6fde 100644 --- a/paddle/fluid/operators/qr_op.cc +++ b/paddle/fluid/operators/qr_op.cc @@ -145,8 +145,6 @@ REGISTER_OPERATOR(qr, ops::QrOp, ops::QrOpMaker, REGISTER_OPERATOR(qr_grad, ops::QrGradOp); -REGISTER_OP_CPU_KERNEL(qr, ops::QrCPUKernel, ops::QrCPUKernel); - REGISTER_OP_CPU_KERNEL( qr_grad, ops::QrGradKernel, ops::QrGradKernel); diff --git a/paddle/fluid/operators/qr_op.h b/paddle/fluid/operators/qr_op.h index f09a07e96cd34e1b631ef9484fe23b12a3b58543..5ef02d8942797a720d18358d425cf45f77be82ad 100644 --- a/paddle/fluid/operators/qr_op.h +++ b/paddle/fluid/operators/qr_op.h @@ -48,85 +48,6 @@ static inline std::tuple _parse_qr_mode(std::string mode) { return std::make_tuple(compute_q, reduced); } -template -class QrCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - bool compute_q; - bool reduced_mode; - const Tensor& x = *context.Input("X"); - Tensor& q = *context.Output("Q"); - Tensor& r = *context.Output("R"); - std::string mode = context.Attr("mode"); - std::tie(compute_q, reduced_mode) = _parse_qr_mode(mode); - - auto numel = x.numel(); - PADDLE_ENFORCE_GT(numel, 0, platform::errors::PreconditionNotMet( - "The input of QR is empty.")); - auto x_dims = x.dims(); - int x_rank = x_dims.size(); - int m = x_dims[x_rank - 2]; - int n = x_dims[x_rank - 1]; - int min_mn = std::min(m, n); - int k = reduced_mode ? 
min_mn : m; - int batch_size = numel / (m * n); - int x_stride = m * n; - int q_stride = m * k; - int r_stride = k * n; - - auto* x_data = x.data>(); - T* q_data = nullptr; - if (compute_q) { - q_data = q.mutable_data>( - context.GetPlace(), - size_t(batch_size * m * k * sizeof(phi::dtype::Real))); - memset(q_data, 0, - size_t(batch_size * m * k * sizeof(phi::dtype::Real))); - } - auto* r_data = r.mutable_data>( - context.GetPlace(), - size_t(batch_size * k * n * sizeof(phi::dtype::Real))); - memset(r_data, 0, size_t(batch_size * k * n * sizeof(phi::dtype::Real))); - - // Implement QR by calling Eigen - for (int i = 0; i < batch_size; ++i) { - const T* x_matrix_ptr = x_data + i * x_stride; - T* r_matrix_ptr = r_data + i * r_stride; - using EigenDynamicMatrix = - Eigen::Matrix; - auto x_matrix = Eigen::Map(x_matrix_ptr, m, n); - Eigen::HouseholderQR qr(x_matrix); - if (reduced_mode) { - auto qr_top_matrix = qr.matrixQR().block(0, 0, min_mn, n); - auto r_matrix_view = - qr_top_matrix.template triangularView(); - auto r_matrix = EigenDynamicMatrix(r_matrix_view); - memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T)); - } else { - auto r_matrix_view = - qr.matrixQR().template triangularView(); - auto r_matrix = EigenDynamicMatrix(r_matrix_view); - memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T)); - } - - if (compute_q) { - T* q_matrix_ptr = q_data + i * q_stride; - if (reduced_mode) { - auto q_matrix = - qr.householderQ() * EigenDynamicMatrix::Identity(m, min_mn); - q_matrix.transposeInPlace(); - memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T)); - } else { - auto q_matrix = - qr.householderQ() * EigenDynamicMatrix::Identity(m, m); - q_matrix.transposeInPlace(); - memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T)); - } - } - } - } -}; - template class QrGradKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc index 955cf8d4448c1b23319fa3e0c10dbd12ae3bf49c..9115d21b195e1b615f43b01af61bbdebd1e70294 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc @@ -14,6 +14,10 @@ #include "paddle/fluid/operators/reduce_ops/reduce_all_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace framework { class OpDesc; @@ -28,9 +32,17 @@ class CPUDeviceContext; } // namespace platform } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(reduce_all, ReduceAllInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); +class ReduceAllOpMaker : public ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "reduce_all"; } + virtual std::string GetOpType() const { return "Reduce reduce_all"; } +}; // kernel's device type is decided by input tensor place, to be consistent with // compare and logical ops -REGISTER_REDUCE_OP_WITHOUT_GRAD(reduce_all, UseInputPlace); -REGISTER_OP_CPU_KERNEL(reduce_all, - ops::BoolReduceKernel); +REGISTER_OPERATOR( + reduce_all, ops::ReduceOpUseInputPlace, ReduceAllOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + ReduceAllInferShapeFunctor); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc index fa3800dd3c9e46c20df54d748a61166a75be492b..69561b93498883bdf2adcfa3982d24bc1e727be0 100644 --- 
a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc @@ -14,6 +14,9 @@ #include "paddle/fluid/operators/reduce_ops/reduce_any_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace framework { class OpDesc; @@ -28,9 +31,18 @@ class CPUDeviceContext; } // namespace platform } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(reduce_any, ReduceAnyInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); + +class ReduceAnyOpMaker : public ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "reduce_any"; } + virtual std::string GetOpType() const { return "Reduce reduce_any"; } +}; // kernel's device type is decided by input tensor place, to be consistent with // compare and logical ops -REGISTER_REDUCE_OP_WITHOUT_GRAD(reduce_any, UseInputPlace); -REGISTER_OP_CPU_KERNEL(reduce_any, - ops::BoolReduceKernel); +REGISTER_OPERATOR( + reduce_any, ops::ReduceOpUseInputPlace, ReduceAnyOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + ReduceAnyInferShapeFunctor); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc index d057ee8f5d798f61c13d5c5c166c9d71b6716d6f..e327d19ab3be8daff08b4e358081d2792fd30835 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc @@ -35,7 +35,7 @@ namespace p = paddle::platform; using Tensor = paddle::framework::Tensor; -USE_OP(reduce_any); +USE_OP_ITSELF(reduce_any); USE_OP_DEVICE_KERNEL(reduce_any, NPU); template diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc index 41df8e4a15f093a40a31c70eea98dfb7e575f4cd..15812778e0023e30a29f259bbd14b4c564ea8d46 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc @@ -35,13 +35,3 @@ REGISTER_OPERATOR( paddle::framework::DefaultGradOpMaker, ReduceMaxInferShapeFunctor); REGISTER_OPERATOR(reduce_max_grad, ops::ReduceGradOp) - -REGISTER_OP_CPU_KERNEL( - reduce_max_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.part.cu deleted file mode 100644 index 5ee38b8fa46290c86cd44ef1bcc71bd2fcd9bcd4..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.part.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" - -REGISTER_OP_CUDA_KERNEL( - reduce_max_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc index 4a18330913803f822436118a35fb957b7e31b391..dc41979defb9314f2efb942f0f530c3b5da3bb8b 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc @@ -107,12 +107,3 @@ REGISTER_OPERATOR(reduce_mean_grad, ops::ReduceGradOp, ops::ReduceMeanDoubleGradDescMaker, ops::ReduceMeanDoubleGradOpBaseMaker, ops::ReduceMeanGradNoNeedBufferVarInferer); - -template -using CPUReduceMeanGradKernel = - ops::ReduceGradKernel; - -REGISTER_OP_CPU_KERNEL(reduce_mean_grad, CPUReduceMeanGradKernel, - CPUReduceMeanGradKernel, - CPUReduceMeanGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cc b/paddle/fluid/operators/reduce_ops/reduce_min_op.cc index 11aa78382e319331dc65ec22927f0d5762adfb43..5e5b04d57b002d8e8ecab9ddaf8186118f4bf187 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cc @@ -14,21 +14,24 @@ #include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" -REGISTER_REDUCE_OP(reduce_min); -REGISTER_OP_CPU_KERNEL( - reduce_min, ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); -REGISTER_OP_CPU_KERNEL( - reduce_min_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + +namespace ops = paddle::operators; + +class ReduceMinOpMaker : public ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "reduce_min"; } + virtual std::string GetOpType() const { return "Reduce reduce_min"; } +}; + +DECLARE_INFER_SHAPE_FUNCTOR(reduce_min, ReduceMinInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); + +REGISTER_OPERATOR( + reduce_min, ops::ReduceOp, ReduceMinOpMaker, + paddle::framework::DefaultGradOpMaker, + paddle::framework::DefaultGradOpMaker, + ReduceMinInferShapeFunctor); +REGISTER_OPERATOR(reduce_min_grad, ops::ReduceGradOp) diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu deleted file mode 100644 index 44548b8d2e778e4a570d085be6f2538b64ab7824..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" - -// reduce_min -REGISTER_OP_CUDA_KERNEL( - reduce_min, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.part.cu deleted file mode 100644 index bf886063786a8c36884ed20fef41c99468156c01..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.part.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" - -REGISTER_OP_CUDA_KERNEL( - reduce_min_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc index eb745ab9c56c5b3cfa62eb36713ebc2485282d6d..b1abdf9e8a758008dff49176c2d6b6682de5b622 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc @@ -14,6 +14,10 @@ #include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace framework { class OpDesc; @@ -26,14 +30,20 @@ class CPUDeviceContext; } // namespace platform } // namespace paddle -REGISTER_REDUCE_OP(reduce_prod); +namespace ops = paddle::operators; + +class ReduceProdOpMaker : public ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "reduce_prod"; } + virtual std::string GetOpType() const { return "Reduce reduce_prod"; } +}; + +DECLARE_INFER_SHAPE_FUNCTOR(reduce_prod, ReduceProdInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); -REGISTER_OP_CPU_KERNEL(reduce_prod_grad, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); +REGISTER_OPERATOR( + reduce_prod, ops::ReduceOp, ReduceProdOpMaker, + paddle::framework::DefaultGradOpMaker, + paddle::framework::DefaultGradOpMaker, + ReduceProdInferShapeFunctor); +REGISTER_OPERATOR(reduce_prod_grad, ops::ReduceGradOp); diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.part.cu deleted file mode 100644 index 0610cdd94f89c0371988fac7955d07fc5498a69f..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.part.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" - -REGISTER_OP_CUDA_KERNEL( - reduce_prod_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc index 5627b4f229e100d9979663e8688b8694188bab0f..bf78b6a696559cab152a6de2c4730a32dfdbb780 100644 --- a/paddle/fluid/operators/roi_align_op.cc +++ b/paddle/fluid/operators/roi_align_op.cc @@ -9,9 +9,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/roi_align_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -23,79 +26,6 @@ class ROIAlignOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::NotFound("Input(X) of ROIAlignOp " - "is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("ROIs"), true, - platform::errors::NotFound("Input(ROIs) of ROIAlignOp " - "is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::NotFound("Output(Out) of ROIAlignOp " - "is not found.")); - auto input_dims = ctx->GetInputDim("X"); - auto rois_dims = ctx->GetInputDim("ROIs"); - - if (ctx->HasInput("RoisNum")) { - auto rois_num_dims = ctx->GetInputDim("RoisNum"); - PADDLE_ENFORCE_EQ( - rois_num_dims.size(), 1, - platform::errors::InvalidArgument("The size of RoisNum should be 1" - ", but received size = %d", - rois_num_dims.size())); - } - PADDLE_ENFORCE_EQ( - input_dims.size(), 4, - platform::errors::InvalidArgument( - "The format of Input(X) in" - "RoIAlignOp is NCHW. And the rank of input must be 4. " - "But received rank = %d", - input_dims.size())); - PADDLE_ENFORCE_EQ(rois_dims.size(), 2, platform::errors::InvalidArgument( - "The rank of Input(ROIs) " - "in RoIAlignOp should be 2. " - "But the rank of RoIs is %d", - rois_dims.size())); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(rois_dims[1], 4, - platform::errors::InvalidArgument( - "The second dimension " - "of Input(ROIs) should be 4. But received the " - "dimension = %d", - rois_dims[1])); - } - int pooled_height = ctx->Attrs().Get("pooled_height"); - int pooled_width = ctx->Attrs().Get("pooled_width"); - float spatial_scale = ctx->Attrs().Get("spatial_scale"); - - PADDLE_ENFORCE_GT(pooled_height, 0, - platform::errors::InvalidArgument( - "The 'pooled_height' attribute in RoIAlignOp is " - "invalid. The height must be greater than 0. 
But " - "received 'pooled_height' = %d", - pooled_height)); - PADDLE_ENFORCE_GT(pooled_width, 0, - platform::errors::InvalidArgument( - "The 'pooled_width' attribute in RoIAlignOp is " - "invalid. The width must be greater than 0. But " - "received 'pooled_width' = %d", - pooled_width)); - PADDLE_ENFORCE_GT(spatial_scale, 0.0f, - platform::errors::InvalidArgument( - "The 'spatial_scale' attribute in RoIAlignOp is " - "invalid. The scale must be greater than 0. But " - "received 'spatial_scale' = %f", - spatial_scale)); - - auto out_dims = input_dims; - out_dims[0] = rois_dims[0]; - out_dims[1] = input_dims[1]; - out_dims[2] = pooled_height; - out_dims[3] = pooled_width; - - ctx->SetOutputDim("Out", out_dims); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -221,21 +151,16 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(RoiAlignGradNoNeedBufVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(roi_align, RoiAlignInferShapeFunctor, + PD_INFER_META(phi::RoiAlignInferMeta)); + REGISTER_OPERATOR(roi_align, ops::ROIAlignOp, ops::ROIAlignOpMaker, ops::ROIAlignGradMaker, - ops::ROIAlignGradMaker); + ops::ROIAlignGradMaker, + RoiAlignInferShapeFunctor); REGISTER_OPERATOR(roi_align_grad, ops::ROIAlignGradOp, ops::RoiAlignGradNoNeedBufVarsInferer); -REGISTER_OP_CPU_KERNEL( - roi_align, - ops::CPUROIAlignOpKernel, - ops::CPUROIAlignOpKernel, - ops::CPUROIAlignOpKernel); -REGISTER_OP_CPU_KERNEL( - roi_align_grad, - ops::CPUROIAlignGradOpKernel, - ops::CPUROIAlignGradOpKernel, - ops::CPUROIAlignGradOpKernel); + REGISTER_OP_VERSION(roi_align) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu deleted file mode 100644 index 18941d10e937d3c28e5793384f00d9d97225a128..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/roi_align_op.cu +++ /dev/null @@ -1,426 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/operators/roi_align_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -static constexpr int kNumCUDAThreads = 512; -static constexpr int kNumMaxinumNumBlocks = 4096; -static constexpr int kROISize = 4; - -static inline int NumBlocks(const int N) { - return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, - kNumMaxinumNumBlocks); -} - -template -__device__ T BilinearInterpolate(const T* input_data, const int height, - const int width, T y, T x) { - if (y < -1.0 || y > height || x < -1.0 || x > width) { - return 0; - } - y = y <= 0 ? 0 : y; - x = x <= 0 ? 
0 : x; - int y_low = static_cast(y); - int x_low = static_cast(x); - int y_high; - int x_high; - if (y_low >= height - 1) { - y_high = y_low = height - 1; - y = static_cast(y_low); - } else { - y_high = y_low + 1; - } - if (x_low >= width - 1) { - x_high = x_low = width - 1; - x = static_cast(x_low); - } else { - x_high = x_low + 1; - } - T ly = y - y_low, lx = x - x_low; - T hy = 1. - ly, hx = 1. - lx; - - T v1 = input_data[y_low * width + x_low]; - T v2 = input_data[y_low * width + x_high]; - T v3 = input_data[y_high * width + x_low]; - T v4 = input_data[y_high * width + x_high]; - T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; - - T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); - return val; -} - -template -__device__ void BilinearInterpolateGradient(const int height, const int width, - T y, T x, T* w1, T* w2, T* w3, - T* w4, int* x_low, int* x_high, - int* y_low, int* y_high) { - if (y < -1.0 || y > height || x < -1.0 || x > width) { - return; - } - - y = y <= 0 ? 0 : y; - x = x <= 0 ? 0 : x; - *y_low = static_cast(y); - *x_low = static_cast(x); - if (*y_low >= height - 1) { - *y_high = *y_low = height - 1; - y = static_cast(*y_low); - } else { - *y_high = *y_low + 1; - } - if (*x_low >= width - 1) { - *x_high = *x_low = width - 1; - x = static_cast(*x_low); - } else { - *x_high = *x_low + 1; - } - T ly = y - *y_low, lx = x - *x_low; - T hy = 1. - ly, hx = 1. - lx; - *w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx; - - return; -} - -template -__global__ void GPUROIAlignForward( - const int nthreads, const T* input_data, const T* input_rois, - const float spatial_scale, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int sampling_ratio, int* roi_batch_id_data, T* output_data, - const bool continuous_coordinate) { - CUDA_KERNEL_LOOP(i, nthreads) { - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % channels; - int n = i / pooled_width / pooled_height / channels; - - const T* offset_input_rois = input_rois + n * kROISize; - int roi_batch_ind = roi_batch_id_data[n]; - - T roi_offset = continuous_coordinate ? static_cast(0.5) : 0; - T roi_xmin = offset_input_rois[0] * spatial_scale - roi_offset; - T roi_ymin = offset_input_rois[1] * spatial_scale - roi_offset; - T roi_xmax = offset_input_rois[2] * spatial_scale - roi_offset; - T roi_ymax = offset_input_rois[3] * spatial_scale - roi_offset; - - T roi_width = roi_xmax - roi_xmin; - T roi_height = roi_ymax - roi_ymin; - if (!continuous_coordinate) { - roi_width = max(roi_width, static_cast(1.)); - roi_height = max(roi_height, static_cast(1.)); - } - - T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); - T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - - const T* offset_input_data = - input_data + (roi_batch_ind * channels + c) * height * width; - - int roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_height / pooled_height); - int roi_bin_grid_w = - (sampling_ratio > 0) ? 
sampling_ratio : ceil(roi_width / pooled_width); - const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); - T output_val = 0; - for (int iy = 0; iy < roi_bin_grid_h; iy++) { - const T y = roi_ymin + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - const T x = roi_xmin + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); - T val = BilinearInterpolate(offset_input_data, height, width, y, x); - output_val += val; - } - } - output_val /= count; - output_data[i] = output_val; - } -} - -template -__global__ void GPUROIAlignBackward( - const int nthreads, const T* input_rois, const T* out_grad, - const int num_rois, const float spatial_scale, const int channels, - const int height, const int width, const int pooled_height, - const int pooled_width, const int sampling_ratio, int* roi_batch_id_data, - T* input_grad, const bool continuous_coordinate) { - CUDA_KERNEL_LOOP(i, nthreads) { - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % channels; - int n = i / pooled_width / pooled_height / channels; - const T* offset_input_rois = input_rois + n * kROISize; - int roi_batch_ind = roi_batch_id_data[n]; - - T roi_offset = continuous_coordinate ? T(0.5) : 0; - T roi_xmin = offset_input_rois[0] * spatial_scale - roi_offset; - T roi_ymin = offset_input_rois[1] * spatial_scale - roi_offset; - T roi_xmax = offset_input_rois[2] * spatial_scale - roi_offset; - T roi_ymax = offset_input_rois[3] * spatial_scale - roi_offset; - - T roi_width = roi_xmax - roi_xmin; - T roi_height = roi_ymax - roi_ymin; - if (!continuous_coordinate) { - roi_width = max(roi_width, static_cast(1.)); - roi_height = max(roi_height, static_cast(1.)); - } - T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); - T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - - T* offset_input_grad = - input_grad + (roi_batch_ind * channels + c) * height * width; - - const T* offset_out_grad = - out_grad + (n * channels + c) * pooled_height * pooled_width; - const T out_grad_this_bin = offset_out_grad[ph * pooled_width + pw]; - - int roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_height / pooled_height); - int roi_bin_grid_w = - (sampling_ratio > 0) ? 
sampling_ratio : ceil(roi_width / pooled_width); - - const T count = roi_bin_grid_h * roi_bin_grid_w; - for (int iy = 0; iy < roi_bin_grid_h; iy++) { - const T y = roi_ymin + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - const T x = roi_xmin + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); - T w1 = 0, w2 = 0, w3 = 0, w4 = 0; - int x_low = -1, x_high = -1, y_low = -1, y_high = -1; - BilinearInterpolateGradient(height, width, y, x, &w1, &w2, &w3, &w4, - &x_low, &x_high, &y_low, &y_high); - T diff1 = out_grad_this_bin * w1 / count; - T diff2 = out_grad_this_bin * w2 / count; - T diff3 = out_grad_this_bin * w3 / count; - T diff4 = out_grad_this_bin * w4 / count; - if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { - platform::CudaAtomicAdd(offset_input_grad + y_low * width + x_low, - diff1); - platform::CudaAtomicAdd(offset_input_grad + y_low * width + x_high, - diff2); - platform::CudaAtomicAdd(offset_input_grad + y_high * width + x_low, - diff3); - platform::CudaAtomicAdd(offset_input_grad + y_high * width + x_high, - diff4); - } - } - } - } -} - -template -class GPUROIAlignOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto sampling_ratio = ctx.Attr("sampling_ratio"); - auto aligned = ctx.Attr("aligned"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - - int rois_num = rois->dims()[0]; - - if (rois_num == 0) return; - - int output_size = out->numel(); - int blocks = NumBlocks(output_size); - int threads = kNumCUDAThreads; -#ifdef WITH_NV_JETSON - platform::ChangeThreadNum(ctx.cuda_device_context(), &threads, 256); -#endif - Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - auto cplace = platform::CPUPlace(); - int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); - auto& dev_ctx = ctx.cuda_device_context(); - auto gplace = ctx.GetPlace(); - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - int rois_batch_size = rois_num_t->numel(); - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The rois_batch_size and imgs " - "batch_size must be the same. But received rois_batch_size = %d, " - "batch_size = %d", - rois_batch_size, batch_size)); - - std::vector rois_num_list(rois_batch_size); - memory::Copy(cplace, rois_num_list.data(), gplace, - rois_num_t->data(), sizeof(int) * rois_batch_size, 0); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_list[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } else { - auto lod = rois->lod(); - PADDLE_ENFORCE_EQ( - lod.empty(), false, - platform::errors::InvalidArgument("Input(ROIs) in ROIAlignOp does " - "not contain LoD information.")); - auto rois_lod = lod.back(); - int rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of rois and batch size " - "of images must be the same. 
But received rois batch size = %d, " - "and images batch size = %d", - rois_batch_size, batch_size)); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ( - rois_num, rois_num_with_lod, - platform::errors::InvalidArgument( - "The actual number of rois and the number of rois " - "provided from Input(RoIsLoD) in RoIAlign must be the same." - " But received actual number of rois is %d, and the number " - "of rois from RoIsLoD is %d", - rois_num, rois_num_with_lod)); - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - int bytes = roi_batch_id_list.numel() * sizeof(int); - auto roi_ptr = memory::Alloc(dev_ctx, bytes); - int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, - dev_ctx.stream()); - GPUROIAlignForward<<>>( - output_size, in->data(), rois->data(), spatial_scale, channels, - height, width, pooled_height, pooled_width, sampling_ratio, roi_id_data, - out->mutable_data(ctx.GetPlace()), aligned); - } -}; - -template -class GPUROIAlignGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* in_grad = ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto sampling_ratio = ctx.Attr("sampling_ratio"); - auto aligned = ctx.Attr("aligned"); - - int rois_num = rois->dims()[0]; - int channels = in->dims()[1]; - int height = in->dims()[2]; - int width = in->dims()[3]; - - if (!in_grad) { - return; - } - Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - auto cplace = platform::CPUPlace(); - int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); - - auto& dev_ctx = ctx.cuda_device_context(); - auto gplace = ctx.GetPlace(); - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - int rois_batch_size = rois_num_t->numel(); - std::vector rois_num_list(rois_batch_size); - memory::Copy(cplace, rois_num_list.data(), gplace, - rois_num_t->data(), sizeof(int) * rois_batch_size, 0); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = start; i < start + rois_num_list[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } else { - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - auto roi_ptr = - memory::Alloc(dev_ctx, roi_batch_id_list.numel() * sizeof(int)); - int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - int bytes = roi_batch_id_list.numel() * sizeof(int); - memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, - dev_ctx.stream()); - in_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, in_grad, static_cast(0)); - - int output_grad_size = out_grad->numel(); - int blocks = NumBlocks(output_grad_size); - int threads = kNumCUDAThreads; - - if (output_grad_size > 0) { - GPUROIAlignBackward<<>>( - output_grad_size, rois->data(), out_grad->data(), rois_num, - spatial_scale, channels, height, width, pooled_height, pooled_width, - sampling_ratio, 
roi_id_data, in_grad->mutable_data(ctx.GetPlace()), - aligned); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - roi_align, - ops::GPUROIAlignOpKernel, - ops::GPUROIAlignOpKernel); -REGISTER_OP_CUDA_KERNEL( - roi_align_grad, - ops::GPUROIAlignGradOpKernel, - ops::GPUROIAlignGradOpKernel); diff --git a/paddle/fluid/operators/roi_align_op.h b/paddle/fluid/operators/roi_align_op.h deleted file mode 100644 index e71099ed99f00f5846e6e23d5d39b3b2f8997531..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/roi_align_op.h +++ /dev/null @@ -1,465 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -namespace { // NOLINT -constexpr size_t get_offset(size_t x, size_t y, size_t width) { - return y * width + x; -} - -template -struct offsets_and_ratios { - offsets_and_ratios() = default; - offsets_and_ratios(std::size_t xy, std::size_t xY, std::size_t Xy, - std::size_t XY, T xy_ratio, T xY_ratio, T Xy_ratio, - T XY_ratio) - : xy(xy), - xY(xY), - Xy(Xy), - XY(XY), - xy_ratio(xy_ratio), - xY_ratio(xY_ratio), - Xy_ratio(Xy_ratio), - XY_ratio(XY_ratio) {} - - std::size_t xy = 0; - std::size_t xY = 0; - std::size_t Xy = 0; - std::size_t XY = 0; - T xy_ratio = 0.0f; - T xY_ratio = 0.0f; - T Xy_ratio = 0.0f; - T XY_ratio = 0.0f; -}; - -template -std::vector> get_indexes_and_ratios( - std::size_t width, std::size_t height, const T roi_width, - const T roi_height, const T roi_xmin, const T roi_ymin, - std::size_t pooled_width, std::size_t roi_bin_grid_w, - std::size_t pooled_height, std::size_t roi_bin_grid_h) { - const auto ind_num = - pooled_width * roi_bin_grid_w * pooled_height * roi_bin_grid_h; - - std::vector> interpolation_cords; - interpolation_cords.reserve(ind_num); - - const auto bin_w = roi_width / pooled_width; - const auto bin_h = roi_height / pooled_height; - - for (std::size_t py = 0; py < pooled_height; py++) { - for (std::size_t px = 0; px < pooled_width; px++) { - for (std::size_t iy = 0; iy < roi_bin_grid_h; iy++) { - // calculate x of sample points - auto y = - roi_ymin + - bin_h * (py + - static_cast(iy + .5f) / static_cast(roi_bin_grid_h)); - for (std::size_t ix = 0; ix < roi_bin_grid_w; ix++) { - // calculate x of sample points - auto x = roi_xmin + - bin_w * (px + - static_cast(ix + .5f) / - static_cast(roi_bin_grid_w)); - - // deal with elements out of map - if (y < -1.0 || y > height || x < -1.0 || x > width) { - interpolation_cords.emplace_back(); - continue; - } - y = y <= 0 ? 0 : y; - x = x <= 0 ? 
0 : x; - - std::size_t x_low_index = static_cast(x); - std::size_t x_high_index; - if (x_low_index >= width - 1) { - x_high_index = x_low_index = width - 1; - x = static_cast(x_low_index); - } else { - x_high_index = x_low_index + 1; - } - T x_ratio = x_high_index - x; - - std::size_t y_low_index = static_cast(y); - std::size_t y_high_index; - if (y_low_index >= height - 1) { - y_high_index = y_low_index = height - 1; - y = static_cast(y_low_index); - } else { - y_high_index = y_low_index + 1; - } - T y_ratio = y_high_index - y; - - auto xy = get_offset(x_low_index, y_low_index, width); - auto xY = get_offset(x_low_index, y_high_index, width); - auto Xy = get_offset(x_high_index, y_low_index, width); - auto XY = get_offset(x_high_index, y_high_index, width); - - auto xy_ratio = x_ratio * y_ratio; - auto xY_ratio = x_ratio * (1 - y_ratio); - auto Xy_ratio = (1 - x_ratio) * y_ratio; - auto XY_ratio = (1 - x_ratio) * (1 - y_ratio); - - interpolation_cords.emplace_back(xy, xY, Xy, XY, xy_ratio, xY_ratio, - Xy_ratio, XY_ratio); - } - } - } - } - return interpolation_cords; -} // namespace - -template -void interpolate(std::vector& interpolated_values, // NOLINT - const std::vector>& interpolation_cords, - const T* data) { - for (auto& ic : interpolation_cords) { - auto xlyl_offset = ic.xy; - auto xhyl_offset = ic.Xy; - auto xlyh_offset = ic.xY; - auto xhyh_offset = ic.XY; - - auto xlyl_ratio = ic.xy_ratio; - auto xhyl_ratio = ic.Xy_ratio; - auto xlyh_ratio = ic.xY_ratio; - auto xhyh_ratio = ic.XY_ratio; - - interpolated_values.emplace_back( - xlyl_ratio * data[xlyl_offset] + xhyl_ratio * data[xhyl_offset] + - xlyh_ratio * data[xlyh_offset] + xhyh_ratio * data[xhyh_offset]); - } -} - -template -void avg_pool(const std::vector& interpolated_values, T* output_data, - int roi_bin_grid_w, int roi_bin_grid_h, int pooled_width, - int pooled_height) { - const auto data_amount = pooled_width * pooled_height; - const auto grid_points = roi_bin_grid_w * roi_bin_grid_h; - const T count = 1.0 / grid_points; - auto val_begin = interpolated_values.cbegin(); - for (auto i = 0; i < data_amount; ++i) { - T sum = 0.0; - auto val_end = val_begin + grid_points; - sum = std::accumulate(val_begin, val_end, sum); - val_begin = val_end; - output_data[i] = sum * count; - } -} -} // NOLINT - -template -void bilinear_interpolate_gradient(const int height, const int width, T y, T x, - const T out_grad_this_bin, const T count, - T* batch_grad_data) { - int x_low, y_low, x_high, y_high; - T w1, w2, w3, w4; - if (y < -1.0 || y > height || x < -1.0 || x > width) { - w1 = w2 = w3 = w4 = 0; - x_low = x_high = y_low = y_high = -1; - return; - } - y = y <= 0 ? 0 : y; - x = x <= 0 ? 0 : x; - y_low = static_cast(y); - x_low = static_cast(x); - if (y_low >= height - 1) { - y_high = y_low = height - 1; - y = static_cast(y_low); - } else { - y_high = y_low + 1; - } - - if (x_low >= width - 1) { - x_high = x_low = width - 1; - x = static_cast(x_low); - } else { - x_high = x_low + 1; - } - - T ly = y - y_low, lx = x - x_low; - T hy = 1. - ly, hx = 1. 
- lx; - w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; - T diff1 = out_grad_this_bin * w1 / count; - T diff2 = out_grad_this_bin * w2 / count; - T diff3 = out_grad_this_bin * w3 / count; - T diff4 = out_grad_this_bin * w4 / count; - if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { - *(batch_grad_data + y_low * width + x_low) += diff1; - *(batch_grad_data + y_low * width + x_high) += diff2; - *(batch_grad_data + y_high * width + x_low) += diff3; - *(batch_grad_data + y_high * width + x_high) += diff4; - } -} - -template -class CPUROIAlignOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto sampling_ratio = ctx.Attr("sampling_ratio"); - auto aligned = ctx.Attr("aligned"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - auto in_stride = phi::stride(in_dims); - auto roi_stride = phi::stride(rois->dims()); - auto out_stride = phi::stride(out->dims()); - - const T* input_data = in->data(); - framework::Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(ctx.GetPlace()); - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of rois and the batch size of images " - " must be the same. But received the batch size of rois is %d, " - "and the batch size of images is %d", - rois_batch_size, batch_size)); - auto* rois_num_data = rois_num_t->data(); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto lod = rois->lod(); - PADDLE_ENFORCE_EQ(lod.empty(), false, - platform::errors::InvalidArgument( - "Input(ROIs) Tensor of ROIAlignOp " - "does not contain LoD information.")); - auto rois_lod = lod.back(); - int rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The rois_batch_size and imgs " - "batch_size must be the same. But received rois_batch_size = %d, " - "batch_size = %d", - rois_batch_size, batch_size)); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ( - rois_num, rois_num_with_lod, - platform::errors::InvalidArgument( - "The actual number of rois and the number of rois " - "provided from Input(RoIsLoD) in RoIAlign must be the same." - " But received actual number of rois is %d, and the number " - "of rois from RoIsLoD is %d", - rois_num, rois_num_with_lod)); - for (int n = 0; n < rois_batch_size; ++n) { - for (std::size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - T* output_data = out->mutable_data(ctx.GetPlace()); - const T* rois_data = rois->data(); - T roi_offset = aligned ? 
T(0.5) : 0; - for (int n = 0; n < rois_num; ++n) { - int roi_batch_id = roi_batch_id_data[n]; - T roi_xmin = rois_data[0] * spatial_scale - roi_offset; - T roi_ymin = rois_data[1] * spatial_scale - roi_offset; - T roi_xmax = rois_data[2] * spatial_scale - roi_offset; - T roi_ymax = rois_data[3] * spatial_scale - roi_offset; - - T roi_width = roi_xmax - roi_xmin; - T roi_height = roi_ymax - roi_ymin; - if (!aligned) { - roi_width = std::max(roi_width, static_cast(1.)); - roi_height = std::max(roi_height, static_cast(1.)); - } - - const T* batch_data = input_data + roi_batch_id * in_stride[0]; - - int roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_height / pooled_height); - int roi_bin_grid_w = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_width / pooled_width); - - auto interpolation_cords = get_indexes_and_ratios( - width, height, roi_width, roi_height, roi_xmin, roi_ymin, - pooled_width, roi_bin_grid_w, pooled_height, roi_bin_grid_h); - - std::vector interpolated_values; - interpolated_values.reserve(interpolation_cords.size()); - for (auto channel = 0; channel < channels; ++channel) { - interpolate(interpolated_values, interpolation_cords, batch_data); - avg_pool(interpolated_values, output_data, roi_bin_grid_w, - roi_bin_grid_h, pooled_width, pooled_height); - batch_data += in_stride[1]; - output_data += out_stride[1]; - interpolated_values.clear(); - } - rois_data += roi_stride[0]; - } - } -}; - -template -class CPUROIAlignGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out_grad = - ctx.Input(framework::GradVarName("Out")); - auto* in_grad = ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto sampling_ratio = ctx.Attr("sampling_ratio"); - auto in_dims = in->dims(); - auto aligned = ctx.Attr("aligned"); - - int channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - if (!in_grad) { - return; - } - Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(ctx.GetPlace()); - - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - auto* rois_num_data = rois_num_t->data(); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (std::size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - in_grad->mutable_data(ctx.GetPlace()); - auto& dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, in_grad, static_cast(0)); - - int output_grad_size = out_grad->numel(); - - if ((!out_grad->IsInitialized()) || (output_grad_size <= 0)) { - return; - } - - const T* rois_data = rois->data(); - const T* out_grad_data = out_grad->data(); - T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); - - auto in_stride = phi::stride(in->dims()); - auto roi_stride = phi::stride(rois->dims()); - auto out_stride = 
phi::stride(out_grad->dims()); - - T roi_offset = aligned ? T(0.5) : 0; - for (int n = 0; n < rois_num; ++n) { - int roi_batch_idx = roi_batch_id_data[n]; - T roi_xmin = rois_data[0] * spatial_scale - roi_offset; - T roi_ymin = rois_data[1] * spatial_scale - roi_offset; - T roi_xmax = rois_data[2] * spatial_scale - roi_offset; - T roi_ymax = rois_data[3] * spatial_scale - roi_offset; - - T roi_width = roi_xmax - roi_xmin; - T roi_height = roi_ymax - roi_ymin; - roi_width = std::max(roi_width, static_cast(1.)); - roi_height = std::max(roi_height, static_cast(1.)); - if (!aligned) { - roi_width = std::max(roi_width, static_cast(1.)); - roi_height = std::max(roi_height, static_cast(1.)); - } - - T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); - T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - for (int c = 0; c < channels; ++c) { - T* batch_grad_data = - in_grad_data + roi_batch_idx * in_stride[0] + c * in_stride[1]; - const T* batch_out_grad_data = - out_grad_data + n * out_stride[0] + c * out_stride[1]; - for (int ph = 0; ph < pooled_height; ++ph) { - for (int pw = 0; pw < pooled_width; ++pw) { - int pool_index = ph * pooled_width + pw; - T out_grad_this_bin = batch_out_grad_data[pool_index]; - int roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_height / pooled_height); - int roi_bin_grid_w = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_width / pooled_width); - T count = roi_bin_grid_h * roi_bin_grid_w; - for (int iy = 0; iy < roi_bin_grid_h; iy++) { - const T y = roi_ymin + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - const T x = roi_xmin + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); - bilinear_interpolate_gradient(height, width, y, x, - out_grad_this_bin, count, - batch_grad_data); - } - } - } - } - } - rois_data += roi_stride[0]; - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/roi_align_op_npu.cc b/paddle/fluid/operators/roi_align_op_npu.cc index d5b63854d99053ac0620a32cfaba267c7262d515..78509e4299b80ee44610ce3d10f9c57afa0cde18 100644 --- a/paddle/fluid/operators/roi_align_op_npu.cc +++ b/paddle/fluid/operators/roi_align_op_npu.cc @@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/roi_align_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/roi_align_op_xpu.cc b/paddle/fluid/operators/roi_align_op_xpu.cc index 09d2d906653e8c71ddeca7fa606cf5adac8cc596..13490d6fcde3a22e7299db21969d7de6f9a6582c 100644 --- a/paddle/fluid/operators/roi_align_op_xpu.cc +++ b/paddle/fluid/operators/roi_align_op_xpu.cc @@ -13,13 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. 
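The CPU ROI Align kernels deleted above sample each pooled bin on a regular grid and bilinearly interpolate the feature map at every sample point; the backward helper bilinear_interpolate_gradient redistributes each bin gradient to the four neighbouring pixels with the weights w1 = hy*hx, w2 = hy*lx, w3 = ly*hx, w4 = ly*lx. A minimal self-contained sketch of that sampling rule, with the hypothetical helper name BilinearSample (not part of this patch):

// Bilinearly sample a H x W feature map at (x, y), mirroring the clamping
// and weight computation of the deleted roi_align_op.h above.
template <typename T>
T BilinearSample(const T* data, int height, int width, T y, T x) {
  // Points more than one pixel outside the map contribute nothing.
  if (y < -1.0 || y > height || x < -1.0 || x > width) return T(0);
  y = y <= 0 ? T(0) : y;
  x = x <= 0 ? T(0) : x;
  int y_low = static_cast<int>(y);
  int x_low = static_cast<int>(x);
  int y_high, x_high;
  if (y_low >= height - 1) {
    y_high = y_low = height - 1;
    y = static_cast<T>(y_low);
  } else {
    y_high = y_low + 1;
  }
  if (x_low >= width - 1) {
    x_high = x_low = width - 1;
    x = static_cast<T>(x_low);
  } else {
    x_high = x_low + 1;
  }
  T ly = y - y_low, lx = x - x_low;
  T hy = 1 - ly, hx = 1 - lx;
  // hy*hx, hy*lx, ly*hx, ly*lx are exactly the w1..w4 weights used by
  // bilinear_interpolate_gradient in the deleted header.
  return hy * hx * data[y_low * width + x_low] +
         hy * lx * data[y_low * width + x_high] +
         ly * hx * data[y_high * width + x_low] +
         ly * lx * data[y_high * width + x_high];
}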
*/ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/roi_align_op.h" #include #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + template class XPUROIAlignOpKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc index a512e7dcd682b517f64e3b14e2f35c4c539ec8b4..12e33d56c0020858ba44709572ee8e526bc949df 100644 --- a/paddle/fluid/operators/roi_pool_op.cc +++ b/paddle/fluid/operators/roi_pool_op.cc @@ -12,9 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/roi_pool_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -26,74 +29,6 @@ class ROIPoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "roi_pool"); - OP_INOUT_CHECK(ctx->HasInput("ROIs"), "Input", "ROIs", "roi_pool"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "roi_pool"); - OP_INOUT_CHECK(ctx->HasOutput("Argmax"), "Output", "Argmax", "roi_pool"); - - auto input_dims = ctx->GetInputDim("X"); - auto rois_dims = ctx->GetInputDim("ROIs"); - - if (ctx->HasInput("RoisNum")) { - auto rois_num_dims = ctx->GetInputDim("RoisNum"); - PADDLE_ENFORCE_EQ(rois_num_dims.size(), 1, - platform::errors::InvalidArgument( - "The second dimension of RoisNum should " - "be 1, but received dimension is %d", - rois_num_dims.size())); - } - PADDLE_ENFORCE_EQ(input_dims.size(), 4, - platform::errors::InvalidArgument( - "The input data should be a four-dimensional " - "tensor with [N,C,H,W], but received input data with " - " %d dimension", - input_dims.size())); - PADDLE_ENFORCE_EQ( - rois_dims.size(), 2, - platform::errors::InvalidArgument( - "ROIs should be a 2-D LoDTensor with shape (num_rois, 4)" - "given as [[x1, y1, x2, y2], ...], but received ROIs is " - "%d-dimensional LoDTensor", - rois_dims.size())); - PADDLE_ENFORCE_EQ( - rois_dims[1], kROISize, - platform::errors::InvalidArgument( - "ROIs should be a 2-D LoDTensor with shape (num_rois, 4)" - "given as [[x1, y1, x2, y2], ...]. 
But the second dimension of " - "the received data is %d", - rois_dims[1])); - - int pooled_height = ctx->Attrs().Get("pooled_height"); - int pooled_width = ctx->Attrs().Get("pooled_width"); - float spatial_scale = ctx->Attrs().Get("spatial_scale"); - - PADDLE_ENFORCE_GT(pooled_height, 0, - platform::errors::OutOfRange( - "The pooled output height must be greater than 0" - "but received height is %d", - pooled_height)); - PADDLE_ENFORCE_GT(pooled_width, 0, - platform::errors::OutOfRange( - "The pooled output width must be greater than 0" - "but received width is %d", - pooled_width)); - PADDLE_ENFORCE_GT(spatial_scale, 0.0f, - platform::errors::OutOfRange( - "The spatial scale must be greater than 0, " - "but received spatial scale is %f", - spatial_scale)); - - auto out_dims = input_dims; - out_dims[0] = rois_dims[0]; - out_dims[1] = input_dims[1]; - out_dims[2] = pooled_height; - out_dims[3] = pooled_width; - - ctx->SetOutputDim("Out", out_dims); - ctx->SetOutputDim("Argmax", out_dims); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -212,20 +147,15 @@ class ROIPoolGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(roi_pool, RoiPoolInferShapeFunctor, + PD_INFER_META(phi::RoiPoolInferMeta)); + REGISTER_OPERATOR(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, ops::ROIPoolGradMaker, - ops::ROIPoolGradMaker); + ops::ROIPoolGradMaker, + RoiPoolInferShapeFunctor); REGISTER_OPERATOR(roi_pool_grad, ops::ROIPoolGradOp); -REGISTER_OP_CPU_KERNEL( - roi_pool, - ops::CPUROIPoolOpKernel, - ops::CPUROIPoolOpKernel, - ops::CPUROIPoolOpKernel); -REGISTER_OP_CPU_KERNEL( - roi_pool_grad, - ops::CPUROIPoolGradOpKernel, - ops::CPUROIPoolGradOpKernel, - ops::CPUROIPoolGradOpKernel); + REGISTER_OP_VERSION(roi_pool) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu deleted file mode 100644 index b907b1114bbc0402fb253ec00610abefe83051c3..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/roi_pool_op.cu +++ /dev/null @@ -1,306 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/operators/roi_pool_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -static constexpr int kNumCUDAThreads = 512; -static constexpr int kNumMaxinumNumBlocks = 4096; - -static inline int NumBlocks(const int N) { - return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, - kNumMaxinumNumBlocks); -} - -template -__global__ void GPUROIPoolForward( - const int nthreads, const T* input_data, const T* input_rois, - const float spatial_scale, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - int* roi_batch_id_data, T* output_data, int64_t* argmax_data) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (size_t i = index; i < nthreads; i += offset) { - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % channels; - int n = i / pooled_width / pooled_height / channels; - - const T* offset_input_rois = input_rois + n * kROISize; - int roi_batch_ind = roi_batch_id_data[n]; - int roi_start_w = round(offset_input_rois[0] * spatial_scale); - int roi_start_h = round(offset_input_rois[1] * spatial_scale); - int roi_end_w = round(offset_input_rois[2] * spatial_scale); - int roi_end_h = round(offset_input_rois[3] * spatial_scale); - - int roi_width = max(roi_end_w - roi_start_w + 1, 1); - int roi_height = max(roi_end_h - roi_start_h + 1, 1); - - int hstart = static_cast(floor(static_cast(ph) * - static_cast(roi_height) / - static_cast(pooled_height))); - int wstart = static_cast(floor(static_cast(pw) * - static_cast(roi_width) / - static_cast(pooled_width))); - int hend = static_cast(ceil(static_cast(ph + 1) * - static_cast(roi_height) / - static_cast(pooled_height))); - int wend = static_cast(ceil(static_cast(pw + 1) * - static_cast(roi_width) / - static_cast(pooled_width))); - hstart = min(max(hstart + roi_start_h, 0), height); - hend = min(max(hend + roi_start_h, 0), height); - wstart = min(max(wstart + roi_start_w, 0), width); - wend = min(max(wend + roi_start_w, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - T maxval = is_empty ? 
0 : -std::numeric_limits::max(); - int maxidx = -1; - const T* offset_input_data = - input_data + (roi_batch_ind * channels + c) * height * width; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - int input_data_index = h * width + w; - if (offset_input_data[input_data_index] > maxval) { - maxval = offset_input_data[input_data_index]; - maxidx = input_data_index; - } - } - } - output_data[i] = maxval; - if (argmax_data) { - argmax_data[i] = maxidx; - } - } -} - -template -__global__ void GPUROIPoolBackward( - const int nthreads, const T* input_rois, const T* output_grad, - const int64_t* argmax_data, const int num_rois, const float spatial_scale, - const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, int* roi_batch_id_data, - T* input_grad) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (int i = index; i < nthreads; i += offset) { - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % channels; - int n = i / pooled_width / pooled_height / channels; - - int roi_batch_ind = roi_batch_id_data[n]; - int input_offset = (roi_batch_ind * channels + c) * height * width; - int output_offset = (n * channels + c) * pooled_height * pooled_width; - const T* offset_output_grad = output_grad + output_offset; - T* offset_input_grad = input_grad + input_offset; - const int64_t* offset_argmax_data = argmax_data + output_offset; - - int argmax = offset_argmax_data[ph * pooled_width + pw]; - if (argmax != -1) { - platform::CudaAtomicAdd( - offset_input_grad + argmax, - static_cast(offset_output_grad[ph * pooled_width + pw])); - } - } -} - -template -class GPUROIPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - auto* argmax = ctx.Output("Argmax"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - auto in_stride = phi::stride(in_dims); - int channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - - int rois_num = rois->dims()[0]; - - if (rois_num == 0) return; - - int output_size = out->numel(); - int blocks = NumBlocks(output_size); - int threads = kNumCUDAThreads; - - framework::Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - auto cplace = platform::CPUPlace(); - int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); - auto& dev_ctx = ctx.cuda_device_context(); - auto gplace = ctx.GetPlace(); - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - int rois_batch_size = rois_num_t->numel(); - - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of input(ROIs) and input(X) must be the same but " - "received batch size of input(ROIs) and input(X) is %d and %d " - "respectively.", - rois_batch_size, batch_size)); - std::vector rois_num_list(rois_batch_size); - memory::Copy(cplace, rois_num_list.data(), gplace, - rois_num_t->data(), sizeof(int) * rois_batch_size, 0); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_list[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } 
else { - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of input(ROIs) and input(X) must be the same but " - "received batch size of input(ROIs) and input(X) is %d and %d " - "respectively.", - rois_batch_size, batch_size)); - - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod, - platform::errors::InvalidArgument( - "The number of rois from input(ROIs) and its LOD " - "must be the same. Received rois %d of input(ROIs) " - "but the number of rois %d from its LOD is %d", - rois_num, rois_num_with_lod)); - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - int bytes = roi_batch_id_list.numel() * sizeof(int); - auto roi_ptr = memory::Alloc(dev_ctx, bytes); - int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, - dev_ctx.stream()); - - GPUROIPoolForward<<>>( - output_size, in->data(), rois->data(), spatial_scale, channels, - height, width, pooled_height, pooled_width, roi_id_data, - out->mutable_data(ctx.GetPlace()), - argmax->mutable_data(ctx.GetPlace())); - } -}; - -template -class GPUROIPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* rois_lod = ctx.Input("RoisNum"); - auto* argmax = ctx.Input("Argmax"); - - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* x_grad = ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - int rois_num = rois->dims()[0]; - int channels = in->dims()[1]; - int height = in->dims()[2]; - int width = in->dims()[3]; - - if (x_grad) { - framework::Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - auto cplace = platform::CPUPlace(); - int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); - - auto& dev_ctx = ctx.cuda_device_context(); - auto gplace = ctx.GetPlace(); - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - int rois_batch_size = rois_num_t->numel(); - std::vector rois_num_list(rois_batch_size); - memory::Copy(cplace, rois_num_list.data(), gplace, - rois_num_t->data(), sizeof(int) * rois_batch_size, 0); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_list[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } else { - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - int bytes = roi_batch_id_list.numel() * sizeof(int); - auto roi_ptr = memory::Alloc(dev_ctx, bytes); - int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, - dev_ctx.stream()); - - x_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, x_grad, static_cast(0)); - - int output_grad_size = out_grad->numel(); - int blocks = NumBlocks(output_grad_size); - int threads = kNumCUDAThreads; - - if (output_grad_size > 0) { - 
GPUROIPoolBackward<<>>( - output_grad_size, rois->data(), out_grad->data(), - argmax->data(), rois_num, spatial_scale, channels, height, - width, pooled_height, pooled_width, roi_id_data, - x_grad->mutable_data(ctx.GetPlace())); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - roi_pool, - ops::GPUROIPoolOpKernel, - ops::GPUROIPoolOpKernel); -REGISTER_OP_CUDA_KERNEL( - roi_pool_grad, - ops::GPUROIPoolGradOpKernel, - ops::GPUROIPoolGradOpKernel); diff --git a/paddle/fluid/operators/roi_pool_op.h b/paddle/fluid/operators/roi_pool_op.h deleted file mode 100644 index a104fd49eb3e0b6d842ab6052e1181e6480a6f65..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/roi_pool_op.h +++ /dev/null @@ -1,250 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/memory/memcpy.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -static constexpr int kROISize = 4; - -template -class CPUROIPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - auto* argmax = ctx.Output("Argmax"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - auto in_stride = phi::stride(in_dims); - auto argmax_stride = phi::stride(argmax->dims()); - auto roi_stride = phi::stride(rois->dims()); - auto out_stride = phi::stride(out->dims()); - - const T* input_data = in->data(); - - framework::Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(ctx.GetPlace()); - - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument("The rois_batch_size and imgs " - "batch_size must be the same.")); - auto* rois_num_data = rois_num_t->data(); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument("The rois_batch_size and imgs " - "batch_size must be the same.")); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ( - rois_num, 
rois_num_with_lod, - platform::errors::InvalidArgument("The rois_num from input " - "and lod must be the same.")); - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - - T* output_data = out->mutable_data(ctx.GetPlace()); - int64_t* argmax_data = argmax->mutable_data(ctx.GetPlace()); - - const T* rois_data = rois->data(); - for (int n = 0; n < rois_num; ++n) { - int roi_batch_id = roi_batch_id_data[n]; - int roi_start_w = round(rois_data[0] * spatial_scale); - int roi_start_h = round(rois_data[1] * spatial_scale); - int roi_end_w = round(rois_data[2] * spatial_scale); - int roi_end_h = round(rois_data[3] * spatial_scale); - - // Force malformed ROIs to be 1x1 - int roi_height = std::max(roi_end_h - roi_start_h + 1, 1); - int roi_width = std::max(roi_end_w - roi_start_w + 1, 1); - - const float bin_size_h = - static_cast(roi_height) / static_cast(pooled_height); - const float bin_size_w = - static_cast(roi_width) / static_cast(pooled_width); - - const T* batch_data = input_data + roi_batch_id * in_stride[0]; - - for (int c = 0; c < channels; ++c) { - for (int ph = 0; ph < pooled_height; ++ph) { - for (int pw = 0; pw < pooled_width; ++pw) { - // Compute pooling region for this output unit: - // start (included) = floor(ph * roi_height / pooled_height_) - // end (excluded) = ceil((ph + 1) * roi_height / pooled_height_) - int hstart = - static_cast(floor(static_cast(ph) * bin_size_h)); - int wstart = - static_cast(floor(static_cast(pw) * bin_size_w)); - int hend = - static_cast(ceil(static_cast(ph + 1) * bin_size_h)); - int wend = - static_cast(ceil(static_cast(pw + 1) * bin_size_w)); - - hstart = std::min(std::max(hstart + roi_start_h, 0), height); - hend = std::min(std::max(hend + roi_start_h, 0), height); - wstart = std::min(std::max(wstart + roi_start_w, 0), width); - wend = std::min(std::max(wend + roi_start_w, 0), width); - - const int pool_index = ph * pooled_width + pw; - - // Define an empty pooling region to be zero - bool is_empty = (hend <= hstart) || (wend <= wstart); - output_data[pool_index] = - is_empty ? 
0 : -std::numeric_limits::max(); - argmax_data[pool_index] = -1; - - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - const int index = h * width + w; - if (batch_data[index] > output_data[pool_index]) { - output_data[pool_index] = batch_data[index]; - argmax_data[pool_index] = index; - } - } - } - } - } - - batch_data += in_stride[1]; - output_data += out_stride[1]; - argmax_data += argmax_stride[1]; - } - // Increment ROI data pointer - rois_data += roi_stride[0]; - } - return; - } -}; - -template -class CPUROIPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* argmax = ctx.Input("Argmax"); - auto* out_grad = - ctx.Input(framework::GradVarName("Out")); - auto* in_grad = ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - - if (in_grad) { - int rois_num = rois->dims()[0]; - framework::Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(ctx.GetPlace()); - - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - auto* rois_num_data = rois_num_t->data(); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - - const T* rois_data = rois->data(); - const T* out_grad_data = out_grad->data(); - const int64_t* argmax_data = argmax->data(); - T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(ctx.template device_context(), in_grad, - static_cast(0)); - - auto in_stride = phi::stride(in->dims()); - auto argmax_stride = phi::stride(argmax->dims()); - auto roi_stride = phi::stride(rois->dims()); - auto out_stride = phi::stride(out_grad->dims()); - - int channels = in->dims()[1]; - - for (int n = 0; n < rois_num; ++n) { - int roi_batch_idx = roi_batch_id_data[n]; - T* batch_grad_data = in_grad_data + roi_batch_idx * in_stride[0]; - for (int c = 0; c < channels; ++c) { - for (int ph = 0; ph < pooled_height; ++ph) { - for (int pw = 0; pw < pooled_width; ++pw) { - int pool_index = ph * pooled_width + pw; - if (argmax_data[pool_index] >= 0) { - auto index = argmax_data[pool_index]; - batch_grad_data[index] += out_grad_data[pool_index]; - } - } - } - batch_grad_data += in_stride[1]; - out_grad_data += out_stride[1]; - argmax_data += argmax_stride[1]; - } - rois_data += roi_stride[0]; - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/roll_op.cc b/paddle/fluid/operators/roll_op.cc index f82510556fde87fbf4aeb1904e29325358598791..898db4c22fed9cc97baa261b5b512a889290aff3 100644 --- a/paddle/fluid/operators/roll_op.cc +++ b/paddle/fluid/operators/roll_op.cc @@ -12,13 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
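The deleted CPU ROI Pool kernels above compute, for every output bin, the maximum over the clipped window [hstart, hend) x [wstart, wend) and store its flat index in Argmax; the gradient kernel then routes the whole bin gradient back to that single argmax position. A hedged standalone sketch of the per-bin forward and backward step (the helper names are illustrative, not Paddle API):

#include <cstdint>
#include <limits>

// Max-pool one bin of a H x W feature map slice, recording the argmax index
// the same way the deleted CPUROIPoolOpKernel does.
template <typename T>
void PoolBin(const T* batch_data, int width, int hstart, int hend, int wstart,
             int wend, T* out_val, int64_t* out_argmax) {
  bool is_empty = (hend <= hstart) || (wend <= wstart);
  *out_val = is_empty ? T(0) : -std::numeric_limits<T>::max();
  *out_argmax = -1;
  for (int h = hstart; h < hend; ++h) {
    for (int w = wstart; w < wend; ++w) {
      int index = h * width + w;
      if (batch_data[index] > *out_val) {
        *out_val = batch_data[index];
        *out_argmax = index;
      }
    }
  }
}

// Backward: the entire output gradient of a bin flows to its argmax position,
// as in the deleted CPUROIPoolGradOpKernel.
template <typename T>
void PoolBinGrad(T out_grad, int64_t argmax, T* in_grad) {
  if (argmax >= 0) in_grad[argmax] += out_grad;
}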
-#include "paddle/fluid/operators/roll_op.h" - #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/platform/complex.h" +#include "paddle/fluid/operators/utils.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -29,43 +32,6 @@ class RollOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of RollOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of RollOp should not be null.")); - - auto dims = ctx->Attrs().Get>("axis"); - auto shifts = ctx->Attrs().Get>("shifts"); - - if (!ctx->HasInput("ShiftsTensor")) { - if (dims.size() != 0) { - PADDLE_ENFORCE_EQ(dims.size(), shifts.size(), - platform::errors::InvalidArgument( - "When dims.size() != 0, dims.size() " - "should be equal to " - "shifts.size(). But received " - "dims.size() = %d, shifts.size() = %d", - dims.size(), shifts.size())); - } else { - PADDLE_ENFORCE_EQ(shifts.size(), 1, - platform::errors::InvalidArgument( - "When dims.size() == 0, shifts.size() " - "should be equal to 1, But received " - "shifts.size() = %d", - shifts.size())); - } - } - - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - auto type = ctx->GetInputsVarType("X")[0]; - if (type == framework::proto::VarType::LOD_TENSOR) { - ctx->ShareLoD("X", /*->*/ "Out"); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -149,29 +115,15 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(RollGradNoNeedBufferVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(roll, RollInferShapeFunctor, + PD_INFER_META(phi::RollInferMeta)); + REGISTER_OPERATOR(roll, ops::RollOp, ops::RollOpMaker, ops::RollGradMaker, - ops::RollGradMaker); + ops::RollGradMaker, + RollInferShapeFunctor); REGISTER_OPERATOR(roll_grad, ops::RollGradOp, ops::RollGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - roll, ops::RollKernel, - ops::RollKernel, - ops::RollKernel, - ops::RollKernel, - ops::RollKernel>, - ops::RollKernel>); -REGISTER_OP_CPU_KERNEL( - roll_grad, ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel>, - ops::RollGradKernel>); REGISTER_OP_VERSION(roll) .AddCheckpoint( diff --git a/paddle/fluid/operators/roll_op.cu b/paddle/fluid/operators/roll_op.cu deleted file mode 100644 index b9064c5450f9fbed64bcb65a2f9d15be2b56fbcf..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/roll_op.cu +++ /dev/null @@ -1,225 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/roll_op.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/phi/core/utils/array.h" - -namespace paddle { -namespace operators { - -using platform::PADDLE_CUDA_NUM_THREADS; -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -template -__global__ void RollCudaKernel(const T* input, T* output, int64_t N, - phi::Array shifts, - phi::Array strides, - phi::Array sizes) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N) { - return; - } - - int64_t output_idx = idx; - int64_t new_dim_idx = 0; - -#pragma unroll - for (size_t i = 0; i < Rank; i++) { - new_dim_idx = (idx / strides[i]) % sizes[i] + shifts[i]; - if (new_dim_idx >= sizes[i]) { - output_idx += (shifts[i] - sizes[i]) * strides[i]; - } else { - output_idx += shifts[i] * strides[i]; - } - } - output[output_idx] = input[idx]; -} - -template -class RollKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - std::vector shifts = context.Attr>("shifts"); - if (context.HasInput("ShiftsTensor")) { - const auto* shifts_tensor = - context.Input("ShiftsTensor"); - PADDLE_ENFORCE_EQ( - shifts_tensor->dims().size(), 1, - platform::errors::InvalidArgument( - "The rank of ShiftsTensor is expected to be 1, got %s", - shifts_tensor->dims().size())); - shifts = GetDataFromTensor(shifts_tensor); - } - std::vector dims = context.Attr>("axis"); - - auto* in_data = in->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - int64_t numel = in->numel(); - auto stream = - context.template device_context().stream(); - - size_t nums = shifts.size(); - auto input_dim = in->dims(); - auto stride_dim = phi::stride(input_dim); - - std::vector strides(nums), sizes(nums); - if (dims.size() == 0) { - strides[0] = 1; - sizes[0] = numel; - shifts[0] = (shifts[0] % numel + numel) % numel; - } else { - for (size_t i = 0; i < nums; i++) { - int dim = dims[i] >= 0 ? 
dims[i] : dims[i] + input_dim.size(); - int64_t size = input_dim[dim]; - - if (size != 0) { - shifts[i] = (shifts[i] % size + size) % size; - strides[i] = stride_dim[dim]; - sizes[i] = size; - } - } - } - -#define CALL_ROLL_CUDA_KERNEL(N) \ - case N: { \ - phi::Array _strides; \ - phi::Array _shifts; \ - phi::Array _sizes; \ - for (size_t idx = 0; idx < N; ++idx) { \ - _strides[idx] = strides[idx]; \ - _shifts[idx] = shifts[idx]; \ - _sizes[idx] = sizes[idx]; \ - } \ - RollCudaKernel< \ - T, \ - N><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, \ - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, numel, \ - _shifts, _strides, _sizes); \ - break; \ - } - - switch (nums) { - CALL_ROLL_CUDA_KERNEL(1); - CALL_ROLL_CUDA_KERNEL(2); - CALL_ROLL_CUDA_KERNEL(3); - CALL_ROLL_CUDA_KERNEL(4); - CALL_ROLL_CUDA_KERNEL(5); - CALL_ROLL_CUDA_KERNEL(6); - CALL_ROLL_CUDA_KERNEL(7); - CALL_ROLL_CUDA_KERNEL(8); - CALL_ROLL_CUDA_KERNEL(9); - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "shifts.size() should be less than 10, But received shifts.size() " - "= %d", - shifts.size())); - } - } -}; - -template -class RollGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input(framework::GradVarName("Out")); - auto* out = context.Output(framework::GradVarName("X")); - std::vector shifts = context.Attr>("shifts"); - if (context.HasInput("ShiftsTensor")) { - const auto* shifts_tensor = - context.Input("ShiftsTensor"); - PADDLE_ENFORCE_EQ( - shifts_tensor->dims().size(), 1, - platform::errors::InvalidArgument( - "The rank of ShiftsTensor is expected to be 1, got %s", - shifts_tensor->dims().size())); - shifts = GetDataFromTensor(shifts_tensor); - } - std::vector dims = context.Attr>("axis"); - - auto* in_data = in->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - int64_t numel = in->numel(); - auto stream = - context.template device_context().stream(); - size_t nums = shifts.size(); - auto input_dim = in->dims(); - auto stride_dim = phi::stride(input_dim); - - std::vector strides(nums), sizes(nums); - if (dims.size() == 0) { - strides[0] = 1; - sizes[0] = numel; - shifts[0] = ((-shifts[0]) % numel + numel) % numel; - } else { - for (size_t i = 0; i < nums; i++) { - int dim = dims[i] >= 0 ? 
dims[i] : dims[i] + input_dim.size(); - int64_t size = input_dim[dim]; - if (size != 0) { - shifts[i] = ((-shifts[i]) % size + size) % size; - strides[i] = stride_dim[dim]; - sizes[i] = size; - } - } - } - - switch (nums) { - CALL_ROLL_CUDA_KERNEL(1); - CALL_ROLL_CUDA_KERNEL(2); - CALL_ROLL_CUDA_KERNEL(3); - CALL_ROLL_CUDA_KERNEL(4); - CALL_ROLL_CUDA_KERNEL(5); - CALL_ROLL_CUDA_KERNEL(6); - CALL_ROLL_CUDA_KERNEL(7); - CALL_ROLL_CUDA_KERNEL(8); - CALL_ROLL_CUDA_KERNEL(9); - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "shifts.size() should be less than 10, But received shifts.size() " - "= %d", - shifts.size())); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - roll, ops::RollKernel, - ops::RollKernel, - ops::RollKernel, - ops::RollKernel, - ops::RollKernel>, - ops::RollKernel>); -REGISTER_OP_CUDA_KERNEL( - roll_grad, ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel>, - ops::RollGradKernel>); diff --git a/paddle/fluid/operators/roll_op.h b/paddle/fluid/operators/roll_op.h deleted file mode 100644 index 413c7bcfc15eb1cae86c3fedf47ea4f677d1248c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/roll_op.h +++ /dev/null @@ -1,169 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
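The deleted roll_op.cu kernels above first normalize every shift into [0, size) with (shift % size + size) % size and then move each element forward by shift * stride along every rolled axis, wrapping around at the end of the dimension. A small host-side sketch of the same arithmetic for a single axis, assuming a plain std::vector container (illustrative only, not part of the patch):

#include <cstdint>
#include <vector>

// Roll a contiguous 1-D sequence by `shift` positions, using the same shift
// normalization as the deleted RollCudaKernel: out[(i + shift) % n] = in[i].
template <typename T>
std::vector<T> Roll1D(const std::vector<T>& in, int64_t shift) {
  const int64_t n = static_cast<int64_t>(in.size());
  std::vector<T> out(in.size());
  if (n == 0) return out;
  shift = (shift % n + n) % n;  // map negative shifts into [0, n)
  for (int64_t i = 0; i < n; ++i) {
    out[(i + shift) % n] = in[i];
  }
  return out;
}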
- -#pragma once -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/utils.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using DDim = framework::DDim; - -template -inline void shift_along_dim(T* data, const DDim& input_dim, int64_t dim, - int64_t shift) { - if (dim < 0) { - dim += input_dim.size(); - } - if (input_dim[dim] == 0) { - return; - } - shift = shift % input_dim[dim]; - if (shift < 0) { - shift += input_dim[dim]; - } - - auto outer_loops = 1; - for (auto i = 0; i < dim; i++) { - outer_loops *= input_dim[i]; - } - auto slice_width = 1; - for (auto i = dim + 1; i < input_dim.size(); i++) { - slice_width *= input_dim[i]; - } - - VLOG(3) << "shift_along_dim_debug: input_dim: " << input_dim - << "; dim: " << dim << "; shift: " << shift - << "; outer_loops: " << outer_loops - << "; slice_width: " << slice_width; - if (shift == 0) { - return; - } - - std::vector head; - auto head_size = slice_width * (input_dim[dim] - shift); - head.resize(head_size); - - for (auto i = 0; i < outer_loops; i++) { - for (auto j = 0; j < head_size; j++) { - head[j] = data[i * input_dim[dim] * slice_width + j]; - } - for (auto j = input_dim[dim] - shift; j < input_dim[dim]; j++) { - auto dst_pos = j - input_dim[dim] + shift; - for (auto k = 0; k < slice_width; k++) { - data[(i * input_dim[dim] + dst_pos) * slice_width + k] = - data[(i * input_dim[dim] + j) * slice_width + k]; - } - } - for (auto j = 0; j < head_size; j++) { - data[(i * input_dim[dim] + shift) * slice_width + j] = head[j]; - } - } -} - -template -class RollKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input_var = context.InputVar("X"); - auto* output_var = context.OutputVar("Out"); - auto& input = input_var->Get(); - auto* output = output_var->GetMutable(); - std::vector shifts = context.Attr>("shifts"); - if (context.HasInput("ShiftsTensor")) { - const auto* shifts_tensor = - context.Input("ShiftsTensor"); - PADDLE_ENFORCE_EQ( - shifts_tensor->dims().size(), 1, - platform::errors::InvalidArgument( - "The rank of ShiftsTensor is expected to be 1, got %s", - shifts_tensor->dims().size())); - shifts = GetDataFromTensor(shifts_tensor); - } - std::vector dims = context.Attr>("axis"); - - std::vector out_vec; - paddle::framework::TensorToVector(input, context.device_context(), - &out_vec); - - size_t nums = shifts.size(); - DDim input_dim = input.dims(); - - // axis = none, reshape to 1-D tensor - if (dims.size() == 0) { - dims.push_back(0l); - input_dim = framework::Dim<1>(out_vec.size()); - } - - for (size_t i = 0; i < nums; i++) { - PADDLE_ENFORCE_EQ( - dims[i] < input_dim.size() && dims[i] >= (0 - input_dim.size()), true, - platform::errors::OutOfRange( - "Attr(axis[%d]) is out of range, It's expected " - "to be in range of [-%d, %d]. 
But received Attr(axis[%d]) = %d.", - i, input_dim.size(), input_dim.size() - 1, i, dims[i])); - shift_along_dim(out_vec.data(), input_dim, dims[i], shifts[i]); - } - output->mutable_data(context.GetPlace()); - framework::TensorFromVector(out_vec, context.device_context(), output); - output->Resize(input.dims()); - } -}; - -template -class RollGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input_var = context.InputVar(framework::GradVarName("Out")); - auto* output_var = context.OutputVar(framework::GradVarName("X")); - auto& input = input_var->Get(); - auto* output = output_var->GetMutable(); - std::vector shifts = context.Attr>("shifts"); - if (context.HasInput("ShiftsTensor")) { - const auto* shifts_tensor = - context.Input("ShiftsTensor"); - shifts = GetDataFromTensor(shifts_tensor); - } - std::vector dims = context.Attr>("axis"); - - std::vector out_vec; - paddle::framework::TensorToVector(input, context.device_context(), - &out_vec); - - size_t nums = shifts.size(); - DDim input_dim = input.dims(); - - // axis = none, reshape to 1-D tensor - if (dims.size() == 0) { - dims.push_back(0l); - input_dim = framework::Dim<1>(out_vec.size()); - } - - for (size_t i = 0; i < nums; i++) { - shift_along_dim(out_vec.data(), input_dim, dims[i], 0 - shifts[i]); - } - output->mutable_data(context.GetPlace()); - framework::TensorFromVector(out_vec, context.device_context(), output); - output->Resize(input.dims()); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/scatter_op_npu.cc b/paddle/fluid/operators/scatter_op_npu.cc index 815984ac307fdce14a64f01a661b4b7f7ce1d616..d5ef95269b48a1a7e7b9c3e75af4f9b595580ad3 100644 --- a/paddle/fluid/operators/scatter_op_npu.cc +++ b/paddle/fluid/operators/scatter_op_npu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include #include -#include "paddle/fluid/operators/kron_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/searchsorted_op.cc b/paddle/fluid/operators/searchsorted_op.cc index bbd5b9c4e7db914d63c9c803c52d44f9350c1d41..3a6fdbaa2613d1f87a84f7175d7d5b507c3479ab 100644 --- a/paddle/fluid/operators/searchsorted_op.cc +++ b/paddle/fluid/operators/searchsorted_op.cc @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/searchsorted_op.h" - +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -22,60 +23,6 @@ namespace operators { class SearchSortedOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - static bool SearchsortedDimsMatchedBeforeLastDim( - const framework::DDim& sequences_dims, - const framework::DDim& values_dims) { - if (sequences_dims.size() != values_dims.size()) { - return false; - } - const auto& sequences_dims_size = sequences_dims.size(); - for (int64_t dim = 0; dim < sequences_dims_size - 1; ++dim) { - if (sequences_dims[dim] != values_dims[dim]) { - return false; - } - } - return true; - } - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("SortedSequence"), "Input", "SortedSequence", - "searchsorted"); - OP_INOUT_CHECK(ctx->HasInput("Values"), "Input", "Values", "searchsorted"); - - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "searchsorted"); - - auto sequences_dims = ctx->GetInputDim("SortedSequence"); - auto values_dims = ctx->GetInputDim("Values"); - auto out_int32 = ctx->Attrs().Get("out_int32"); - - if (sequences_dims.size() != 1) { - PADDLE_ENFORCE_EQ( - SearchsortedDimsMatchedBeforeLastDim(sequences_dims, values_dims), - true, - platform::errors::Unavailable( - "The dimensions of sorted_sequence tensor ( %s ) and values " - "tensor ( %s ) can not match. Because the input sorted_sequence " - "tensor must be 1 dimension or the first N-1 dimensions of " - "sorted_sequence tensor and input values tensor must match. " - "Please input appropriate sorted_sequence and values again! ", - sequences_dims, values_dims)); - } - - if (out_int32) { - PADDLE_ENFORCE_LT( - sequences_dims[sequences_dims.size() - 1], - std::numeric_limits::max(), - platform::errors::Unavailable( - "The size of sorted_sequence %d exceed the maximum limit d%. " - "Because the size of sorted_sequence should be less than the " - "output maximum value for int32 bit. Please set appropriate " - "sorted_sequence to meet this requirement! ", - sequences_dims[sequences_dims.size() - 1], - std::numeric_limits::max())); - } - - ctx->SetOutputDim("Out", values_dims); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -116,11 +63,7 @@ class SearchSortedOpMaker : public framework::OpProtoAndCheckerMaker { namespace ops = paddle::operators; -REGISTER_OPERATOR(searchsorted, ops::SearchSortedOp, ops::SearchSortedOpMaker); - -REGISTER_OP_CPU_KERNEL( - searchsorted, - ops::SearchSortedKernel, - ops::SearchSortedKernel, - ops::SearchSortedKernel, - ops::SearchSortedKernel); +DECLARE_INFER_SHAPE_FUNCTOR(searchsorted, SearchsortedInferShapeFunctor, + PD_INFER_META(phi::SearchsortedInferMeta)); +REGISTER_OPERATOR(searchsorted, ops::SearchSortedOp, ops::SearchSortedOpMaker, + SearchsortedInferShapeFunctor); diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index 7d0d782b837c4c828996e993634373ab38d88eac..73655bcb18500e54564936eac4400a0c7b49af62 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -13,9 +13,15 @@ // limitations under the License. 
#include "paddle/fluid/operators/set_value_op.h" + #include + +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace framework { class InferShapeContext; @@ -34,6 +40,8 @@ class CPUDeviceContext; namespace paddle { namespace operators { +using Tensor = framework::Tensor; + class SetValue : public framework::OperatorWithKernel { public: SetValue(const std::string &type, const framework::VariableNameMap &inputs, @@ -41,17 +49,6 @@ class SetValue : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "SetValue"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "SetValue"); - auto in_dims = ctx->GetInputDim("Input"); - PADDLE_ENFORCE_LT( - in_dims.size(), 7, - platform::errors::InvalidArgument( - "The rank of input should be less than 7, but received %d.", - in_dims.size())); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -236,21 +233,16 @@ DECLARE_INPLACE_OP_INFERER(SetValueOpInplaceInferer, {"Input", "Out"}); namespace ops = paddle::operators; namespace plat = paddle::platform; +DECLARE_INFER_SHAPE_FUNCTOR(set_value, SetValueInferShapeFunctor, + PD_INFER_META(phi::SetValueInferMeta)); + REGISTER_OPERATOR(set_value, ops::SetValue, ops::SetValueMaker, ops::SetValueGradMaker, ops::SetValueGradMaker, - ops::SetValueOpInplaceInferer); + ops::SetValueOpInplaceInferer, SetValueInferShapeFunctor); REGISTER_OPERATOR(set_value_grad, ops::SetValueGrad); -REGISTER_OP_CPU_KERNEL( - set_value_grad, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel); - REGISTER_OP_VERSION(set_value) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/set_value_op.cu b/paddle/fluid/operators/set_value_op.cu deleted file mode 100644 index 9f291a863c067ae0210f44befb89191678291441..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/set_value_op.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/set_value_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - set_value_grad, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel); diff --git a/paddle/fluid/operators/set_value_op.h b/paddle/fluid/operators/set_value_op.h index a5ef7e8efbe7764a7d8292c07ad1047190500402..17b406916cf570b82f2c1c59879da4a0220c6f5f 100644 --- a/paddle/fluid/operators/set_value_op.h +++ b/paddle/fluid/operators/set_value_op.h @@ -19,7 +19,6 @@ #include #include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/assign_value_op.h" @@ -36,23 +35,6 @@ namespace operators { using Tensor = framework::Tensor; using DDim = framework::DDim; -inline void GetOffsets(const DDim& big_dim, const DDim& small_dim, - DDim start_offset, int cur_dim, - std::vector* offsets) { - if (cur_dim == big_dim.size()) { - offsets->push_back(start_offset); - return; - } - if (small_dim[cur_dim] == big_dim[cur_dim]) { - GetOffsets(big_dim, small_dim, start_offset, cur_dim + 1, offsets); - } else { - for (int i = 0; i < big_dim[cur_dim]; i++) { - GetOffsets(big_dim, small_dim, start_offset, cur_dim + 1, offsets); - start_offset[cur_dim] += 1; - } - } -} - inline std::string GetValueName(framework::proto::VarType::Type data_type) { std::string value_name; switch (data_type) { @@ -121,253 +103,6 @@ inline void CheckIsDimsMatch(const framework::DDim first, "of target shape: %d, but now shape is %d.", second.to_str(), first.to_str())); } -template -class SetValueGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - int rank = ctx.Input(framework::GradVarName("Out"))->dims().size(); - - switch (rank) { - case 1: - SetValueGradCompute<1>(ctx); - break; - case 2: - SetValueGradCompute<2>(ctx); - break; - case 3: - SetValueGradCompute<3>(ctx); - break; - case 4: - SetValueGradCompute<4>(ctx); - break; - case 5: - SetValueGradCompute<5>(ctx); - break; - case 6: - SetValueGradCompute<6>(ctx); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "The rank of set_value_grad's input should be less than 7, but " - "received %d.", - rank)); - } - } - - private: - template - void SetValueGradCompute(const framework::ExecutionContext& context) const { - auto starts = context.Attr>("starts"); - auto ends = context.Attr>("ends"); - auto steps = context.Attr>("steps"); - - auto axes_int64 = context.Attr>("axes"); - std::vector axes(axes_int64.begin(), axes_int64.end()); - - auto starts_indices = Eigen::DSizes(); - auto ends_indices = Eigen::DSizes(); - auto steps_indices = Eigen::DSizes(); - auto reverse_axis = Eigen::array(); - - auto list_new_ends_tensor = - context.MultiInput("EndsTensorList"); - auto list_new_starts_tensor = - context.MultiInput("StartsTensorList"); - auto list_new_steps_tensor = - context.MultiInput("StepsTensorList"); - - if (list_new_starts_tensor.size() > 0) { - starts = GetDataFromTensorList(list_new_starts_tensor); - } - - if (list_new_ends_tensor.size() > 0) { - ends = GetDataFromTensorList(list_new_ends_tensor); - } - - if (list_new_steps_tensor.size() > 0) { - steps = GetDataFromTensorList(list_new_steps_tensor); - } - - auto in = context.Input(framework::GradVarName("Out")); - PADDLE_ENFORCE_EQ( - in->IsInitialized(), true, - 
platform::errors::PermissionDenied( - "The input of `set_value_grad`(%s) has not been initialized", - framework::GradVarName("Out"))); - auto grad_value = context.Output( - framework::GradVarName("ValueTensor")); - auto grad_input = - context.Output(framework::GradVarName("Input")); - auto in_dims = in->dims(); - - auto decrease_axis_int64 = - context.Attr>("decrease_axes"); - std::vector decrease_axis(decrease_axis_int64.begin(), - decrease_axis_int64.end()); - std::vector infer_flags(axes.size(), 1); - std::vector out_dims_vector(in_dims.size(), -1); - StridedSliceOutDims(starts, ends, steps, axes, infer_flags, in_dims, - decrease_axis, out_dims_vector.data(), axes.size(), - false); - - framework::DDim out_dims(phi::make_ddim(out_dims_vector)); - - std::vector reverse_vector(starts.size(), 0); - StridedSliceFunctor(starts.data(), ends.data(), steps.data(), axes.data(), - reverse_vector.data(), in_dims, infer_flags, - decrease_axis, starts.size()); - - for (size_t axis = 0; axis < D; axis++) { - starts_indices[axis] = 0; - ends_indices[axis] = out_dims[axis]; - steps_indices[axis] = 1; - reverse_axis[axis] = false; - } - - for (size_t axis = 0; axis < axes.size(); axis++) { - int axis_index = axes[axis]; - starts_indices[axis_index] = starts[axis]; - ends_indices[axis_index] = ends[axis]; - steps_indices[axis_index] = steps[axis]; - reverse_axis[axis_index] = (reverse_vector[axis] == 1) ? true : false; - } - - bool need_reverse = false; - for (size_t axis = 0; axis < axes.size(); axis++) { - if (reverse_vector[axis] == 1) { - need_reverse = true; - break; - } - } - - auto& dev_ctx = context.template device_context(); - auto& place = - *context.template device_context().eigen_device(); - phi::funcs::SetConstant set_zero; - - if (grad_input) { - // Set gradient of `Input` - paddle::framework::TensorCopy(*in, context.GetPlace(), grad_input); - - auto grad_input_t = - framework::EigenTensor::From(*grad_input); - - framework::Tensor tmp(grad_input->dtype()); - tmp.mutable_data(out_dims, context.GetPlace()); - set_zero(dev_ctx, &tmp, static_cast(0)); - auto tmp_t = framework::EigenTensor::From(tmp); - - grad_input_t.stridedSlice(starts_indices, ends_indices, steps_indices) - .device(place) = tmp_t; - } - if (grad_value) { - grad_value->mutable_data(context.GetPlace()); - set_zero(dev_ctx, grad_value, static_cast(0)); - - auto in_t = framework::EigenTensor::From(*in); - - if (grad_value->dims() == out_dims) { - auto grad_value_t = - framework::EigenTensor::From(*grad_value); - if (need_reverse) { - framework::Tensor tmp(grad_value->dtype()); - tmp.mutable_data(out_dims, context.GetPlace()); - set_zero(dev_ctx, &tmp, static_cast(0)); - auto tmp_t = framework::EigenTensor::From(tmp); - - tmp_t.device(place) = - in_t.stridedSlice(starts_indices, ends_indices, steps_indices); - grad_value_t.device(place) = tmp_t.reverse(reverse_axis); - } else { - grad_value_t.device(place) = - in_t.stridedSlice(starts_indices, ends_indices, steps_indices); - } - } else { - int out_dims_size = out_dims.size(); - auto grad_value_dims = grad_value->dims(); - auto fake_grad_value_dims = out_dims; - - // Create an extented shape according to the rules of broadcast. 
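// Illustrative sketch, not part of the patch: the deleted kernel code below
// aligns the value gradient's shape with the sliced output's shape using
// numpy-style broadcasting (and additionally reinserts axes named in
// decrease_axes). The hypothetical helper here shows only the core
// right-alignment rule: pad the lower-rank shape with leading 1s and require
// every remaining axis to match the output axis or be 1.
#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<int64_t> ExtendToOutRank(
    const std::vector<int64_t>& value_dims,
    const std::vector<int64_t>& out_dims) {
  assert(value_dims.size() <= out_dims.size());
  std::vector<int64_t> extended(out_dims.size(), 1);
  const size_t shift = out_dims.size() - value_dims.size();
  for (size_t i = 0; i < value_dims.size(); ++i) {
    // Each trailing axis must equal the output axis or be broadcastable (1).
    assert(value_dims[i] == out_dims[shift + i] || value_dims[i] == 1);
    extended[shift + i] = value_dims[i];
  }
  return extended;
}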
- auto grad_value_dims_size = grad_value_dims.size(); - - int num_decrease = 0; - - int decrease_axis_size = decrease_axis.size(); - for (int i = 0; i < out_dims_size; i++) { - if (decrease_axis.end() != - std::find(decrease_axis.begin(), decrease_axis.end(), i)) { - fake_grad_value_dims[i] = 1; - num_decrease++; - } else if (i < out_dims_size - (grad_value_dims_size + - decrease_axis_size - num_decrease)) { - fake_grad_value_dims[i] = 1; - } else { - auto index_grad = - i - (out_dims_size - (grad_value_dims_size + - decrease_axis_size - num_decrease)); - fake_grad_value_dims[i] = grad_value_dims[index_grad]; - - PADDLE_ENFORCE_EQ((out_dims[i] == grad_value_dims[index_grad]) || - (grad_value_dims[index_grad] == 1), - true, - platform::errors::InvalidArgument( - "An error occurred while calculating %s: " - "[%s] can not be accumulated into [%s].", - framework::GradVarName("ValueTensor"), - out_dims, grad_value_dims)); - } - } - - VLOG(3) << "Dimensions of " << framework::GradVarName("ValueTensor") - << "([" << grad_value_dims << "])is broadcasted into [" - << fake_grad_value_dims << "]."; - - auto extent = Eigen::DSizes(); - auto offset = out_dims; - for (int i = 0; i < out_dims_size; i++) { - offset[i] = 0; - extent[i] = fake_grad_value_dims[i]; - } - std::vector offsets; - GetOffsets(out_dims, fake_grad_value_dims, offset, 0, &offsets); - - auto grad_value_t = - framework::EigenTensor:: - From(*grad_value, fake_grad_value_dims); - - framework::Tensor tmp(grad_value->dtype()); - tmp.mutable_data(out_dims, context.GetPlace()); - set_zero(dev_ctx, &tmp, static_cast(0)); - auto tmp_t = framework::EigenTensor::From(tmp); - - tmp_t.device(place) = - in_t.stridedSlice(starts_indices, ends_indices, steps_indices); - - // accumulate gradient - for (auto offset : offsets) { - grad_value_t.device(place) = - grad_value_t + - tmp_t.slice(framework::EigenDim::From(offset), extent); - } - if (need_reverse) { - framework::Tensor tmp_value(grad_value->dtype()); - tmp_value.mutable_data(fake_grad_value_dims, context.GetPlace()); - auto tmp_value_t = - framework::EigenTensor::From(tmp_value); - tmp_value_t.device(place) = grad_value_t.reverse(reverse_axis); - grad_value_t.device(place) = tmp_value_t; - } - } - } - } -}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc index e2c8359beb1290f7b1b592c1ff24b15986f41f73..9001ce5d51dece5c6cee481f3f6f92e69c302c2b 100644 --- a/paddle/fluid/operators/shape_op.cc +++ b/paddle/fluid/operators/shape_op.cc @@ -13,7 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -22,17 +25,6 @@ class ShapeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true, - platform::errors::InvalidArgument( - "Input (Input) of get_shape op should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output (Out) of get_shape op should not be null.")); - auto in_dim = ctx->GetInputDim("Input"); - ctx->SetOutputDim("Out", {in_dim.size()}); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto input_data_type = @@ -89,7 +81,12 @@ Return the shape of the input. namespace ops = paddle::operators; namespace plat = paddle::platform; + +DECLARE_INFER_SHAPE_FUNCTOR(shape, ShapeInferShapeFunctor, + PD_INFER_META(phi::ShapeInferMeta)); + REGISTER_OPERATOR( shape, ops::ShapeOp, ops::ShapeOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + ShapeInferShapeFunctor); diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 374992096605bfef0433992193e54306c3a12858..3840b99dd176d5b348533f3e50f7f90fc3250ea1 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" @@ -23,6 +24,10 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { @@ -30,30 +35,6 @@ class SoftmaxOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), true, - platform::errors::NotFound("Input(X) of SoftmaxOp is not found.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Out"), true, - platform::errors::NotFound("Output(Out) of SoftmaxOp is not found.")); - - auto dim_x = ctx->GetInputDim("X"); - auto rank_x = dim_x.size(); - auto axis = ctx->Attrs().Get("axis"); - PADDLE_ENFORCE_GE(axis, -rank_x, - platform::errors::InvalidArgument( - "Attr(axis) value should be in range [-R, R-1], " - "R is the rank of Input(X).")); - PADDLE_ENFORCE_LT(axis, rank_x, - platform::errors::InvalidArgument( - "Attr(axis) value should be in range [-R, R-1], " - "R is the rank of Input(X).")); - - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -168,23 +149,6 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("Out"), true, - platform::errors::InvalidArgument("Input(Out) is not found.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput(framework::GradVarName("Out")), true, - platform::errors::InvalidArgument("Input(Out@GRAD) is not found.")); - PADDLE_ENFORCE_EQ( - ctx->GetInputDim("Out"), - ctx->GetInputDim(framework::GradVarName("Out")), - platform::errors::InvalidArgument("Input(Out) and its gradients " - "should have a same shape.")); - - ctx->SetOutputDim(framework::GradVarName("X"), - ctx->GetInputDim(framework::GradVarName("Out"))); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -244,9 +208,14 @@ DECLARE_INPLACE_OP_INFERER(SoftmaxInplaceInferer, {"X", "Out"}); namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(softmax, SoftmaxInferShapeFunctor, + PD_INFER_META(phi::SoftmaxInferMeta)); REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, ops::SoftmaxOpInferVarType, ops::SoftmaxOpGradMaker, ops::SoftmaxOpGradMaker, - ops::SoftmaxInplaceInferer); -REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad); + ops::SoftmaxInplaceInferer, SoftmaxInferShapeFunctor); +DECLARE_INFER_SHAPE_FUNCTOR(softmax_grad, SoftmaxGradInferShapeFunctor, + PD_INFER_META(phi::GeneralUnaryGradInferMeta)); +REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad, + SoftmaxGradInferShapeFunctor); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 19a395e72314db52d52cf704a567dce8dd58318a..41545a1ca20b267e79f43c2af4c58ea64dd479b2 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -760,8 +760,9 @@ static void SoftmaxWithCrossEntropyHardLabel( */ template __global__ void SoftmaxWithCrossEntropyGradHardLabel( - T* logits_grad, const T* loss_grad, const LabelT* labels, const int64_t n, - const int64_t dim, const int64_t d, const int 
ignore_index) { + T* logits_grad, const T* loss_grad, const T* softmax, const LabelT* labels, + const int64_t n, const int64_t dim, const int64_t d, + const int ignore_index) { int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; int64_t idx_n = idx / (d * dim); int64_t idx_dim = (idx / d) % dim; @@ -773,10 +774,9 @@ __global__ void SoftmaxWithCrossEntropyGradHardLabel( if (lbl == ignore_index) { logits_grad[idx] = static_cast(0.0); } else if (lbl == idx_dim) { - logits_grad[idx] = - (logits_grad[idx] - static_cast(1.0)) * loss_grad[ids]; + logits_grad[idx] = (softmax[idx] - static_cast(1.0)) * loss_grad[ids]; } else { - logits_grad[idx] *= loss_grad[ids]; + logits_grad[idx] = softmax[idx] * loss_grad[ids]; } } } @@ -1395,11 +1395,20 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { Tensor* logit_grad = context.Output(framework::GradVarName("Logits")); const Tensor* softmax = context.Input("Softmax"); - if (logit_grad != softmax) { + auto stream = context.cuda_device_context().stream(); + auto ignore_index = context.Attr("ignore_index"); + auto use_softmax = context.Attr("use_softmax"); + + T* logit_grad_data = nullptr; + bool copy_flag = (logit_grad != softmax && (!use_softmax || soft_label)); + if (copy_flag) { framework::TensorCopy(*softmax, context.GetPlace(), context.device_context(), logit_grad); + logit_grad_data = logit_grad->template data(); + } else { + logit_grad_data = + logit_grad->template mutable_data(context.GetPlace()); } - T* logit_grad_data = logit_grad->template data(); const int rank = logit_grad->dims().size(); const int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); @@ -1414,9 +1423,6 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { #else int block = 512; #endif - auto stream = context.cuda_device_context().stream(); - auto ignore_index = context.Attr("ignore_index"); - auto use_softmax = context.Attr("use_softmax"); // do not with softmax op, and input is softmax if (!use_softmax) { @@ -1451,11 +1457,12 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { SoftCrossEntropyGradientKernel<<>>( logit_grad_data, loss_grad_data, label_data, n, d, remain); } else { + const T* softmax_data = softmax->template data(); const auto* label_data = labels.template data(); int grid = (n * d + block - 1) / block; SoftmaxWithCrossEntropyGradHardLabel<<>>( - logit_grad_data, loss_grad_data, label_data, n, d / remain, remain, - ignore_index); + logit_grad_data, loss_grad_data, softmax_data, label_data, n, + d / remain, remain, ignore_index); } } }; diff --git a/paddle/fluid/operators/spp_op.h b/paddle/fluid/operators/spp_op.h index bff8061814ae66f243ca9d863cf866821ede4a32..aa944cfcfbb1713aeb27b501083853abb4ffed40 100644 --- a/paddle/fluid/operators/spp_op.h +++ b/paddle/fluid/operators/spp_op.h @@ -16,9 +16,10 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/pooling.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace operators { @@ -53,14 +54,20 @@ class SppKernel : public framework::OpKernel { out_level.mutable_data(output_shape, context.GetPlace()); // pooling if (pooling_type == "max") { - math::Pool2dFunctor, T> pool_forward; - math::MaxPool max_process; + phi::funcs::Pool2dFunctor< + typename framework::ConvertToPhiContext::TYPE, + phi::funcs::MaxPool, T> + pool_forward; + phi::funcs::MaxPool max_process; pool_forward(context.template device_context(), *in_x, kernel_size, strides, paddings, true, false, &out_level, max_process); } else if (pooling_type == "avg") { - math::Pool2dFunctor, T> pool_forward; - math::AvgPool avg_process; + phi::funcs::Pool2dFunctor< + typename framework::ConvertToPhiContext::TYPE, + phi::funcs::AvgPool, T> + pool_forward; + phi::funcs::AvgPool avg_process; pool_forward(context.template device_context(), *in_x, kernel_size, strides, paddings, true, false, &out_level, avg_process); @@ -95,7 +102,9 @@ class SppGradKernel : public framework::OpKernel { std::string pooling_type = context.template Attr("pooling_type"); auto& device_ctx = context.template device_context(); - phi::funcs::SetConstant zero; + phi::funcs::SetConstant< + typename framework::ConvertToPhiContext::TYPE, T> + zero; in_x_grad->mutable_data(context.GetPlace()); zero(device_ctx, in_x_grad, static_cast(0)); auto out_stride = phi::stride(out->dims()); @@ -145,14 +154,18 @@ class SppGradKernel : public framework::OpKernel { outgrad_level.Resize(out_shape); // pooling backward if (pooling_type == "max") { - math::MaxPool2dGradFunctor pool2d_backward; + phi::funcs::MaxPool2dGradFunctor< + typename framework::ConvertToPhiContext::TYPE, T> + pool2d_backward; pool2d_backward(context.template device_context(), *in_x, *&out_level, *&outgrad_level, kernel_size, strides, paddings, in_x_grad); } else if (pooling_type == "avg") { - math::Pool2dGradFunctor, T> + phi::funcs::Pool2dGradFunctor< + typename framework::ConvertToPhiContext::TYPE, + phi::funcs::AvgPoolGrad, T> pool_backward; - math::AvgPoolGrad avg_process; + phi::funcs::AvgPoolGrad avg_process; pool_backward(context.template device_context(), *in_x, *&out_level, *&outgrad_level, kernel_size, strides, paddings, true, false, in_x_grad, avg_process); diff --git a/paddle/fluid/operators/squeeze_op.h b/paddle/fluid/operators/squeeze_op.h index 58e5440689926497705624a0c64e6cc3d43dbab1..a776a78616b8d6dbac66ccab0d59433b98ae65e4 100644 --- a/paddle/fluid/operators/squeeze_op.h +++ b/paddle/fluid/operators/squeeze_op.h @@ -17,7 +17,6 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/sync_batch_norm_op.cc b/paddle/fluid/operators/sync_batch_norm_op.cc index d198992abde7dc79f0732928a3cb0cb0e6549ded..0c178b02d03099c6e9df4c3eae5ce95352982d7e 100644 --- a/paddle/fluid/operators/sync_batch_norm_op.cc +++ b/paddle/fluid/operators/sync_batch_norm_op.cc @@ -50,6 +50,7 @@ class SyncBatchNormGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + REGISTER_OPERATOR(sync_batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, ops::BatchNormOpInferVarType, ops::SyncBatchNormGradMaker, diff --git a/paddle/fluid/operators/top_k_v2_op.cc b/paddle/fluid/operators/top_k_v2_op.cc index d1add111e1d24cb711955a9aff06eb19feb35dc9..0a9ae789b01eea8a14952afe0d998005c59c0659 100644 --- a/paddle/fluid/operators/top_k_v2_op.cc +++ b/paddle/fluid/operators/top_k_v2_op.cc @@ -14,7 +14,9 @@ limitations under the License. */ #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -23,56 +25,6 @@ class TopkV2Op : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "topk_v2"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "topk_v2"); - OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "topk_v2"); - - auto input_dims = ctx->GetInputDim("X"); - const int& dim_size = input_dims.size(); - int axis = static_cast(ctx->Attrs().Get("axis")); - PADDLE_ENFORCE_EQ( - (axis < dim_size) && (axis >= (-1 * dim_size)), true, - paddle::platform::errors::InvalidArgument( - "the axis of topk must be [-%d, %d), but you set axis is %d", - dim_size, dim_size, axis)); - - if (axis < 0) axis += dim_size; - - int k; - auto k_is_tensor = ctx->HasInput("K"); - if (k_is_tensor) { - k = -1; - } else { - k = static_cast(ctx->Attrs().Get("k")); - PADDLE_ENFORCE_EQ(k >= 1, true, - paddle::platform::errors::InvalidArgument( - "the attribute of k in the topk must >= 1 or be a " - "Tensor, but received %d .", - k)); - } - - PADDLE_ENFORCE_GE(input_dims.size(), 1, - paddle::platform::errors::InvalidArgument( - "input of topk must have >= 1d shape")); - - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_GE( - input_dims[axis], k, - paddle::platform::errors::InvalidArgument( - "input of topk op must have >= %d columns in axis of %d", k, - axis)); - } - - framework::DDim dims = input_dims; - - dims[axis] = k; - ctx->SetOutputDim("Out", dims); - ctx->SetOutputDim("Indices", dims); - ctx->ShareLoD("X", "Out"); - ctx->ShareLoD("X", "Indices"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -169,8 +121,11 @@ class TopkV2GradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(top_k_v2, TopKInferShapeFunctor, + PD_INFER_META(phi::TopKInferMeta)); REGISTER_OPERATOR(top_k_v2, ops::TopkV2Op, ops::TopkV2OpMaker, ops::TopkV2GradOpMaker, - ops::TopkV2GradOpMaker); + ops::TopkV2GradOpMaker, + TopKInferShapeFunctor); 
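// The same migration appears throughout this patch (set_value, shape, softmax,
// top_k_v2, trace): the hand-written InferShape override is deleted and shape
// inference is generated from a phi InferMeta function, then attached to the
// operator at registration time. A minimal sketch of the pattern, using a
// hypothetical my_op and phi::MyInferMeta; only the macro names are taken from
// this diff.
DECLARE_INFER_SHAPE_FUNCTOR(my_op, MyOpInferShapeFunctor,
                            PD_INFER_META(phi::MyInferMeta));
REGISTER_OPERATOR(my_op, ops::MyOp, ops::MyOpMaker,
                  MyOpInferShapeFunctor);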
REGISTER_OPERATOR(top_k_v2_grad, ops::TopkV2OpGrad); diff --git a/paddle/fluid/operators/trace_op.cc b/paddle/fluid/operators/trace_op.cc index 0590b66f6f868858d66e95382f96c8ad42ac64c2..c6c0fa3c0019eac742a9c70ea53a438f5a474895 100644 --- a/paddle/fluid/operators/trace_op.cc +++ b/paddle/fluid/operators/trace_op.cc @@ -61,7 +61,7 @@ the 2-D planes specified by dim1 and dim2. )DOC"); } }; -class TraceOpGrad : public framework::OperatorWithKernel { +class TraceGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -114,7 +114,7 @@ REGISTER_OPERATOR(trace, ops::TraceOp, ops::TraceOpMaker, ops::TraceGradOpMaker, TraceInferShapeFunctor); -REGISTER_OPERATOR(trace_grad, ops::TraceOpGrad, +REGISTER_OPERATOR(trace_grad, ops::TraceGradOp, ops::TraceGradNoNeedBufferVarsInferer); /* ========================== register checkpoint ===========================*/ diff --git a/paddle/fluid/operators/triangular_solve_op.cc b/paddle/fluid/operators/triangular_solve_op.cc index df84659a00f4c4220853404a8b28c6ccc93623a3..35b925ca172b7ccb665978010dbcdd2cb10c9678 100644 --- a/paddle/fluid/operators/triangular_solve_op.cc +++ b/paddle/fluid/operators/triangular_solve_op.cc @@ -12,10 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/triangular_solve_op.h" #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/solve_op.h" #include "paddle/phi/infermeta/binary.h" namespace paddle { diff --git a/paddle/fluid/operators/triangular_solve_op.h b/paddle/fluid/operators/triangular_solve_op.h deleted file mode 100644 index fd46aca456cd9bd883cf9d1ce3576b307794b1a5..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/triangular_solve_op.h +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "glog/logging.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -#include "paddle/fluid/operators/solve_op.h" -#include "paddle/fluid/operators/tril_triu_op.h" -#include "paddle/phi/core/ddim.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -static void triangular_solve(const DeviceContext &context, const Tensor &x, - const Tensor &y, Tensor *out, bool upper, - bool transpose, bool unitriangular) { - // Tensor broadcast use eigen library - std::vector x_bst_dims_vec; - std::vector y_bst_dims_vec; - std::tie(x_bst_dims_vec, y_bst_dims_vec) = get_broadcast_dims(x, y); - - Tensor x_bst(x.type()); - TensorExpand(context, x, &x_bst, x_bst_dims_vec); - - Tensor y_bst(y.type()); - TensorExpand(context, y, &y_bst, y_bst_dims_vec); - - // TriangularSolveFunctor performs calculations in-place - // x_clone should be a copy of 'x' after broadcast - // out should be a copy of 'y' after broadcast - Tensor x_clone(x.type()); - x_clone.Resize(phi::make_ddim(x_bst_dims_vec)); - x_clone.mutable_data(context.GetPlace()); - framework::TensorCopy(x_bst, context.GetPlace(), context, &x_clone); - - out->Resize(phi::make_ddim(y_bst_dims_vec)); - out->mutable_data(context.GetPlace()); - framework::TensorCopy(y_bst, context.GetPlace(), context, out); - - math::TriangularSolveFunctor functor; - functor(context, &x_clone, out, /*left=*/true, upper, transpose, - unitriangular); -} - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/tril_triu_op.cc b/paddle/fluid/operators/tril_triu_op.cc index 3e943c62e1ce17857e78e140efeb50e627e80a4e..c8010e8a128e0b2483c93ed38047b17060bfb0e9 100644 --- a/paddle/fluid/operators/tril_triu_op.cc +++ b/paddle/fluid/operators/tril_triu_op.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/tril_triu_op.h" #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -104,19 +104,3 @@ REGISTER_OPERATOR(tril_triu, ops::TrilTriuOp, ops::TrilTriuOpMaker, ops::TrilTriuGradOpMaker, ops::TrilTriuGradOpMaker); REGISTER_OPERATOR(tril_triu_grad, ops::TrilTriuGradOp); -REGISTER_OP_CPU_KERNEL( - tril_triu, ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel); -REGISTER_OP_CPU_KERNEL( - tril_triu_grad, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel); diff --git a/paddle/fluid/operators/tril_triu_op.cu b/paddle/fluid/operators/tril_triu_op.cu deleted file mode 100644 index 9cbbdeeb2ce28453f2c22d063975fa82aae5d3b3..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/tril_triu_op.cu +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/tril_triu_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - tril_triu, ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel); -REGISTER_OP_CUDA_KERNEL( - tril_triu_grad, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel); diff --git a/paddle/fluid/operators/tril_triu_op.h b/paddle/fluid/operators/tril_triu_op.h deleted file mode 100644 index 3150b7617d10a8f9c2f60dd2e74ab2cbbb2d655e..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/tril_triu_op.h +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/for_range.h" - -namespace paddle { -namespace operators { - -template -class TrilTriuCompute { - public: - HOSTDEVICE TrilTriuCompute(const T* in, const int diagonal, const bool lower, - const int64_t H, const int64_t W, T* out) - : in_(in), diagonal_(diagonal), lower_(lower), H_(H), W_(W), out_(out) {} - - HOSTDEVICE void operator()(int64_t idx) { - const int64_t row = (idx / W_) % H_; - const int64_t col = idx % W_; - const bool mask = - lower_ ? (col - row > diagonal_) : (col - row < diagonal_); - out_[idx] = mask ? 
static_cast(0) : in_[idx]; - } - - private: - const T* in_; - const int diagonal_; - const bool lower_; - const int64_t H_; - const int64_t W_; - T* out_; -}; - -template -class TrilTriuOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const auto* x = context.Input("X"); - const auto* x_data = x->data(); - auto* out = context.Output("Out"); - auto* out_data = out->mutable_data(context.GetPlace()); - - const int diagonal = context.Attr("diagonal"); - const bool lower = context.Attr("lower"); - - const auto& dims = x->dims(); - const auto H = dims[dims.size() - 2]; - const auto W = dims[dims.size() - 1]; - - platform::ForRange for_range( - context.template device_context(), - static_cast(x->numel())); - - paddle::operators::TrilTriuCompute tril_triu_computer( - x_data, diagonal, lower, H, W, out_data); - for_range(tril_triu_computer); - } -}; - -template -class TrilTriuGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const auto* d_out = - context.Input(framework::GradVarName("Out")); - const auto* dout_data = d_out->data(); - auto* d_x = context.Output(framework::GradVarName("X")); - auto* dx_data = d_x->mutable_data(context.GetPlace()); - - const int diagonal = context.Attr("diagonal"); - const bool lower = context.Attr("lower"); - - const auto& dims = d_out->dims(); - const auto H = dims[dims.size() - 2]; - const auto W = dims[dims.size() - 1]; - - platform::ForRange for_range( - context.template device_context(), - static_cast(d_out->numel())); - - paddle::operators::TrilTriuCompute tril_triu_grad_computer( - dout_data, diagonal, lower, H, W, dx_data); - for_range(tril_triu_grad_computer); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/tril_triu_op_npu.cc b/paddle/fluid/operators/tril_triu_op_npu.cc index ad1c1814c05cdf7f96a6f3c05a5cf1a00d2a2e93..4145730357d6007368d26c46d2b1bd47c9085982 100644 --- a/paddle/fluid/operators/tril_triu_op_npu.cc +++ b/paddle/fluid/operators/tril_triu_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/tril_triu_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/tril_triu_op_xpu.cc b/paddle/fluid/operators/tril_triu_op_xpu.cc index e36cbcf228cfbf30c8fcd5562ac40f38a5467cdb..a44ea8ff689b85d9f718572c45b4f8fafaf1565d 100644 --- a/paddle/fluid/operators/tril_triu_op_xpu.cc +++ b/paddle/fluid/operators/tril_triu_op_xpu.cc @@ -11,7 +11,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/tril_triu_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { diff --git a/paddle/fluid/operators/unsqueeze_op.h b/paddle/fluid/operators/unsqueeze_op.h index 7f676cbb65ee460cdf639641330d49b5774f95a5..f6112fb59c12252255861825ff9d7b534c542665 100644 --- a/paddle/fluid/operators/unsqueeze_op.h +++ b/paddle/fluid/operators/unsqueeze_op.h @@ -16,7 +16,6 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h index c5dff84723ccf4f40065f5a1d13cf5cdce8b3a0f..ce9b09f60ca352a0cc33d2e477134ca2e10c2ad2 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h +++ b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h @@ -56,6 +56,9 @@ XPUOpMap& get_kp_ops() { {"hard_shrink", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"hard_sigmoid", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"swish", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"thresholded_relu", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, }; return s_xpu_kp_kernels; diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index ce2dba4db02a00398965f0ed656fa64e4fc828df..4001fd744e67784d736cb157743e4b4d7fa4517e 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -559,6 +559,34 @@ inline void GetGroupConvWeightsTz(std::vector& weights_tz, // NOLINT } } +inline void RegisterModelLayout( + std::vector>& ops, + const platform::Place& place) { + if (platform::is_cpu_place(place)) { + auto check_attrib = [](std::unique_ptr& op, + const std::string& attrib_name) -> bool { + if (op->HasAttr(attrib_name)) { + auto data_format = op->Attr(attrib_name); + platform::MKLDNNDeviceContext::tls().set_cur_paddle_data_layout( + data_format.compare("NHWC") == 0 ? framework::DataLayout::kNHWC + : framework::DataLayout::kNCHW); + return true; + } else { + return false; + } + }; + + for (auto& op : ops) { + if (check_attrib(op, std::string("data_format"))) { + return; + } + if (check_attrib(op, std::string("data_layout"))) { + return; + } + } + } +} + inline bool HasOpINT8DataType(const paddle::framework::OpDesc* op) { return (op->GetAttrIfExists("mkldnn_data_type") == "int8" || op->GetAttrIfExists("use_quantizer")); diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 01de7349f4823a66b2d180f3d1493477f361273a..1254331835bbdf4dfc698021a52208d846651dd5 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -618,7 +618,7 @@ class BinaryMKLDNNHandler const dnnl::engine engine, platform::Place cpu_place, const Tensor* x, const Tensor* y, Tensor* z, float scale_x, float scale_y, float scale_z, - const dnnl::post_ops& post_ops = dnnl::post_ops()) + const dnnl::post_ops& post_ops = dnnl::post_ops{}) : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { PADDLE_ENFORCE_EQ( x->layout(), DataLayout::kMKLDNN, @@ -676,8 +676,8 @@ class BinaryMKLDNNHandler const auto dst_md = memory::desc(dst_tz, platform::MKLDNNGetDataType(), MKLDNNMemoryFormat::any); - auto attributes = CreateAttributes(algo, scale_x, scale_y, scale_z); - attributes.set_post_ops(post_ops); + auto attributes = + CreateAttributes(algo, scale_x, scale_y, scale_z, post_ops); this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md, src1_md, dst_md); @@ -690,10 +690,9 @@ class BinaryMKLDNNHandler } private: - static inline dnnl::primitive_attr CreateAttributes(dnnl::algorithm op, - float scale_x, - float scale_y, - float scale_z) { + static inline dnnl::primitive_attr CreateAttributes( + dnnl::algorithm 
op, float scale_x, float scale_y, float scale_z, + dnnl::post_ops post_ops = dnnl::post_ops{}) { // Scales set in attributes for inputs contibute to the output equation // in the following way (assuming no broadcasting takes place): // output_i = scale_0 * x_i <+ or *> scale_1 * y_i; @@ -718,6 +717,7 @@ class BinaryMKLDNNHandler {scale_0}); attributes.set_scales(/* input_y_id = */ DNNL_ARG_SRC_1, /* mask = */ 0, {scale_1}); + if (post_ops.len() > 0) attributes.set_post_ops(post_ops); return attributes; } }; diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index feb72bce72bf8c9c13260d53d65020a68ba85eb8..940fc98d3b32021ae8b278305c54d8819292daaf 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -77,7 +77,9 @@ RecordEvent::RecordEvent(const char *name, const TracerEventType type, #endif #endif if (FLAGS_enable_host_event_recorder_hook == false) { - OriginalConstruct(name, role, "none"); + if (g_state != ProfilerState::kDisabled) { // avoid temp string + OriginalConstruct(name, role, "none"); + } return; } if (UNLIKELY(HostTraceLevel::GetInstance().NeedTrace(level) == false)) { @@ -165,8 +167,8 @@ void RecordEvent::End() { } #endif #endif - uint64_t end_ns = PosixInNsec(); if (LIKELY(FLAGS_enable_host_event_recorder_hook && is_enabled_)) { + uint64_t end_ns = PosixInNsec(); if (LIKELY(shallow_copy_name_ != nullptr)) { HostEventRecorder::GetInstance().RecordEvent( shallow_copy_name_, start_ns_, end_ns, role_, type_); @@ -190,6 +192,7 @@ void RecordEvent::End() { // lock is not needed, the code below is thread-safe DeviceTracer *tracer = GetDeviceTracer(); if (tracer) { + uint64_t end_ns = PosixInNsec(); tracer->AddCPURecords(CurAnnotationName(), start_ns_, end_ns, BlockDepth(), g_thread_id); } diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 6c8fc450cd496ce2fc2eb42f9a2b95a1272715ff..7b223f7ed27e2249d84539a81312658f8c2260f0 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -44,6 +44,9 @@ endif() if(NOT WIN32) set(PYBIND_DEPS ${PYBIND_DEPS} data_loader) set(PYBIND_DEPS ${PYBIND_DEPS} mmap_allocator) + if (WITH_GPU) + set(PYBIND_DEPS ${PYBIND_DEPS} cuda_ipc_allocator) + endif() if (WITH_NCCL OR WITH_RCCL) set(PYBIND_DEPS ${PYBIND_DEPS} nccl_context) set(PYBIND_DEPS ${PYBIND_DEPS} heter_ccl_context) @@ -92,7 +95,7 @@ if(NOT ON_INFER) if (WITH_GLOO) set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_gloo) endif() - if(WITH_ASCEND) + if(WITH_ASCEND_CL) set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_hccl) endif() set(PYBIND_SRCS ${PYBIND_SRCS} distributed_py.cc) @@ -348,7 +351,7 @@ if(WITH_PYTHON) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) cc_library(paddle_eager SRCS eager.cc eager_functions.cc eager_method.cc eager_properties.cc eager_utils.cc - DEPS eager_api autograd_meta backward grad_node_info phi op_function_common final_dygraph_function final_dygraph_node dygraph_function dygraph_node accumulation_node global_utils utils python) + DEPS eager_api autograd_meta backward grad_node_info phi op_function_common final_dygraph_function final_dygraph_node dygraph_function dygraph_node accumulation_node global_utils utils python custom_operator custom_operator_node) add_dependencies(paddle_eager eager_codegen) add_dependencies(paddle_eager eager_op_function_generator_cmd) list(APPEND PYBIND_DEPS paddle_eager) diff --git a/paddle/fluid/pybind/custom_handwrite_op_funcs.h b/paddle/fluid/pybind/custom_handwrite_op_funcs.h index 
7a276df0d5bdc95b6c925b7c620d7931b6aaf0ec..3b898ce77ce6fb43ca9aaba38e5db9e01a1d19d3 100644 --- a/paddle/fluid/pybind/custom_handwrite_op_funcs.h +++ b/paddle/fluid/pybind/custom_handwrite_op_funcs.h @@ -31,7 +31,6 @@ static PyObject *eager_api_run_program(PyObject *self, PyObject *args, tstate = PyEval_SaveThread(); run_program_dygraph_function(X, Params, Out, OutScope, DOut, attrs); - std::cout << "end run_program_dygraph_function" << std::endl; PyEval_RestoreThread(tstate); tstate = nullptr; } catch (...) { diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index 0b1796703817c28526172a542ae9253578f44ee2..e89d8d96342e723724bb867a14bc4262c6ab7b16 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -51,6 +51,18 @@ namespace pybind { using Tensor = paddle::experimental::Tensor; +std::shared_ptr CreateEagerReducer( + py::handle py_tensors, + const std::vector> &group_indices, + const std::vector &is_sparse_gradient, + std::shared_ptr process_group, + const std::vector &group_size_limits, bool find_unused_parameters) { + auto params = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0); + return std::make_shared( + params, group_indices, is_sparse_gradient, process_group, + group_size_limits, find_unused_parameters); +} + #if defined(PADDLE_WITH_GLOO) using ProcessGroupGloo = paddle::distributed::ProcessGroupGloo; using GlooStore = paddle::distributed::ProcessGroupGloo::GlooStore; @@ -223,25 +235,13 @@ void BindDistributed(py::module *m) { py::call_guard()); #if defined(PADDLE_WITH_GLOO) - py::class_(*m, "GlooOptions") - .def(py::init<>()) - .def_readwrite("_device", &GlooOptions::device) - .def_static("create", &GlooOptions::create); - - py::class_>(*m, "GlooStore") - .def(py::init( - [](const std::shared_ptr &store) { - return std::make_shared(store); - }), - py::call_guard()); - py::class_>( *m, "ProcessGroupGloo", ProcessGroup) - .def(py::init &, int, int, - std::shared_ptr &>(), + .def(py::init &, int, + int, std::shared_ptr &>(), py::call_guard()) - .def(py::init([](const std::shared_ptr &store, int rank, - int world_size) { + .def(py::init([](const std::shared_ptr &store, + int rank, int world_size) { auto opts = GlooOptions::create(); char *ifname = getenv(GLOO_SOCKET_IFNAME_ENV.c_str()); if (ifname && strlen(ifname) > 1) { @@ -271,6 +271,17 @@ void BindDistributed(py::module *m) { py::arg("group_size_limits") = std::vector{25 * 1024 * 1024}, py::arg("tensor_indices") = std::vector{}, py::call_guard()); + + py::class_>(*m, "EagerReducer", + R"DOC()DOC") + .def(py::init(&CreateEagerReducer)) + .def("prepare_for_backward", + [](distributed::EagerReducer &self, py::handle py_tensors) { + auto params = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0); + self.PrepareForBackward(params); + }, + py::arg("tensors"), py::call_guard()); } } // end namespace pybind diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 0b04dc7347ce78f87d6f8d81e30eb4135fd965ed..528bd75eb0013b95057d7549e083b2fa1318cac1 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -21,21 +21,25 @@ limitations under the License. 
*/ #include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/backward.h" +#include "paddle/fluid/eager/custom_operator/custom_operator_node.h" #include "paddle/fluid/eager/utils.h" #include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/custom_operator.h" +#include "paddle/fluid/framework/op_meta_info_helper.h" #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/exception.h" +#include "paddle/phi/api/ext/op_meta_info.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/api/lib/utils/tensor_utils.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" - namespace paddle { namespace pybind { @@ -118,13 +122,33 @@ static PyObject* eager_api_run_backward(PyObject* self, PyObject* args, EAGER_TRY auto tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 0), 0); auto grad_tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 1), 1); - egr::RunBackward(tensors, grad_tensors, - CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2)); + egr::Backward(tensors, grad_tensors, + CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2)); Py_INCREF(Py_None); return Py_None; EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* eager_api_run_partial_grad(PyObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + auto tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 0), 0); + auto inputs = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 1), 1); + auto grad_tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 2), 2); + auto retain_graph = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3); + auto create_graph = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 4), 4); + auto only_inputs = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 5), 5); + auto allow_unused = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 6), 6); + auto no_grad_vars = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 7), 7); + + std::vector result = + egr::Grad(tensors, inputs, grad_tensors, retain_graph, create_graph, + only_inputs, allow_unused, no_grad_vars); + VLOG(1) << " in eager_api_run_partial_grad, after runing egr::Grad"; + return ToPyObject(result, true /* return_py_none_if_not_initialize */); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + static PyObject* eager_api_tensor_copy(PyObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY @@ -168,7 +192,284 @@ static PyObject* eager_api_read_next_tensor_list(PyObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } +static void ConstructFwdAndBwdMap( + const std::vector& vec_map, + const std::string& op_type) { + auto& in_out_map = egr::Controller::Instance().GetCustomEdgesSlotMap(); + if (in_out_map.find(op_type) != in_out_map.end()) { + VLOG(7) << "Find Exist CustomEdgesSlotMap Skip >>>> "; + return; + } else { + VLOG(7) << "Construct CustomEdgesSlotMap "; + auto inputs_names = + paddle::framework::OpMetaInfoHelper::GetInputs(vec_map[0]); + auto outputs_names = + paddle::framework::OpMetaInfoHelper::GetOutputs(vec_map[0]); + auto attrs_names = + paddle::framework::OpMetaInfoHelper::GetAttrs(vec_map[0]); + auto grad_outputs_names = + 
paddle::framework::OpMetaInfoHelper::GetOutputs(vec_map[1]); + auto grad_inputs_names = + paddle::framework::OpMetaInfoHelper::GetInputs(vec_map[1]); + auto grad_attrs_names = + paddle::framework::OpMetaInfoHelper::GetAttrs(vec_map[1]); + std::vector> res(5); + in_out_map.insert({op_type, res}); + // Prepare pos map for grad_outputs + VLOG(7) << "Prepare pos map for grad_outputs"; + PADDLE_ENFORCE_LE( + grad_outputs_names.size(), inputs_names.size(), + paddle::platform::errors::InvalidArgument( + "Grad outputs num should be less equal than forward inputs num.")); + for (size_t i = 0; i < grad_outputs_names.size(); i++) { + size_t end = grad_outputs_names[i].find("@GRAD"); + PADDLE_ENFORCE_NE( + end, std::string::npos, + paddle::platform::errors::NotFound( + "All Grad outputs should be grad and we got %s is not grad var, " + "please check your op and change to fit the rule.", + grad_outputs_names[i])); + for (size_t j = 0; j < inputs_names.size(); j++) { + if (grad_outputs_names[i].substr(0, end) == inputs_names[j]) { + VLOG(7) << " ==== Custom Operator: " << op_type << "'s No." << j + << " inputs: " << inputs_names[j] << " related to No." << i + << " grad_outputs: " << grad_outputs_names[i]; + in_out_map[op_type][0][j] = i; + } + } + } + // Prepare pos map for grad_inputs + for (size_t i = 0; i < grad_inputs_names.size(); i++) { + size_t end = grad_inputs_names[i].find("@GRAD"); + if (end != std::string::npos) { + for (size_t j = 0; j < outputs_names.size(); j++) { + if (grad_inputs_names[i].substr(0, end) == outputs_names[j]) { + VLOG(7) << " ==== Custom Operator: " << op_type << "'s No." << j + << " outputs: " << outputs_names[j] << " related to No." + << i << " grad_inputs's grad: " << grad_inputs_names[i]; + in_out_map[op_type][1][j] = i; + } + } + } else { + if (std::find(outputs_names.begin(), outputs_names.end(), + grad_inputs_names[i]) != outputs_names.end()) { + for (size_t j = 0; j < outputs_names.size(); j++) { + if (grad_inputs_names[i] == outputs_names[j]) { + VLOG(7) << " ==== Custom Operator: " << op_type << "'s No." << j + << " outputs: " << outputs_names[j] << " related to No." + << i + << " grad_inputs fwd outputs: " << grad_inputs_names[i]; + in_out_map[op_type][2][j] = i; + } + } + } else { + for (size_t j = 0; j < inputs_names.size(); j++) { + if (grad_inputs_names[i] == inputs_names[j]) { + VLOG(7) << " ==== Custom Operator: " << op_type << "'s No." << j + << " inputs: " << inputs_names[j] << " related to No." + << i + << " grad_inputs fwd inputs: " << grad_inputs_names[i]; + in_out_map[op_type][3][j] = i; + } + } + } + } + } + + // Prepare pos map for grad attrs_ + for (size_t i = 0; i < grad_attrs_names.size(); i++) { + auto end = std::find(attrs_names.begin(), attrs_names.end(), + grad_attrs_names[i]); + PADDLE_ENFORCE_NE(end, attrs_names.end(), + paddle::platform::errors::NotFound( + "All Grad attrs should be one of forward attrs and " + "we got %s is not one of them, please check your " + "op and change to fit the rule.", + grad_attrs_names[i])); + for (size_t j = 0; j < attrs_names.size(); j++) { + if (grad_attrs_names[i] == attrs_names[j]) { + VLOG(7) << " ==== Custom Operator: " << op_type << "'s No." << j + << " attrs: " << attrs_names[j] << " related to No." 
<< i + << " grad_attrs: " << grad_attrs_names[i]; + in_out_map[op_type][4][j] = i; + } + } + } + } +} + +static std::vector CastAttrsToTragetType( + const std::vector& src, + const std::vector& attrs_names) { + std::vector res; + PADDLE_ENFORCE_EQ(src.size(), attrs_names.size(), + paddle::platform::errors::InvalidArgument( + "We Expected same size of attrs and attrs_name list, " + "if u got this error indicate your custom op setting " + "%s attrs, but you just give %s", + attrs_names.size(), src.size())); + for (size_t i = 0; i < src.size(); i++) { + size_t end = attrs_names[i].find(": "); + std::string type_name = + attrs_names[i].substr(end + 2, attrs_names.size() - end - 2); + if (type_name == "int") { + if (src[i].type() == typeid(bool)) { + res.emplace_back(static_cast(paddle::any_cast(src[i]))); + } else if (src[i].type() == typeid(int)) { + res.emplace_back(src[i]); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Your No. %s attrs should only can be bool or int32, other type is " + "forbidden for now but we got %s. Check your code first please", + i, src[i].type().name())); + } + } else if (type_name == "int64_t") { + if (src[i].type() == typeid(bool)) { + res.emplace_back(static_cast(paddle::any_cast(src[i]))); + } else if (src[i].type() == typeid(int)) { + res.emplace_back(static_cast(paddle::any_cast(src[i]))); + } else if (src[i].type() == typeid(int64_t)) { + res.emplace_back(src[i]); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Your No. %s attrs should only can be bool or int32 or int64_t, " + "other type is forbidden for now but we got %s. Check your code " + "first please", + i, src[i].type().name())); + } + } else { + res.emplace_back(src[i]); + } + } + return res; +} + +static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + paddle::CustomOpKernelContext ctx = + CastPyArg2CustomOpKernelContext(PyTuple_GET_ITEM(args, 0), 0); + std::string op_type = CastPyArg2AttrString(PyTuple_GET_ITEM(args, 1), 1); + bool trace_backward = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2); + VLOG(7) << "Get things for python for Custom Op: " << op_type + << ", trace_backward is: " << trace_backward; + auto meta_info_map = egr::Controller::Instance().GetOpMetaInfoMap(); + PADDLE_ENFORCE_NE(meta_info_map.find(op_type), meta_info_map.end(), + paddle::platform::errors::NotFound( + "Can't find %s in Eager OpMetaInfoMap which should be " + "created by LoadOpMetaInfoAndRegisterOp, please make " + "sure you registered your op first and try again. 
", + op_type)); + VLOG(7) << "Run Kernel of Custom Op: " << op_type; + std::vector res_attrs = CastAttrsToTragetType( + ctx.Attrs(), paddle::framework::OpMetaInfoHelper::GetAttrs( + meta_info_map.at(op_type)[0])); + ctx.EmplaceBackAttrs(res_attrs); + const auto& vec_map = meta_info_map.at(op_type); + (*paddle::framework::OpMetaInfoHelper::GetKernelFn(vec_map[0]))(&ctx); + + VLOG(7) << "Get AutogradMeta for inputs and outputs for Custom Op"; + std::vector> ins_auto_grad_metas; + std::vector> outs_auto_grad_metas; + VLOG(7) << "We got slot num of ins is: " << ctx.InputRange().size(); + ins_auto_grad_metas.resize(ctx.InputRange().size()); + VLOG(7) << "We got slot num of outs is: " << ctx.OutputRange().size(); + outs_auto_grad_metas.resize(ctx.OutputRange().size()); + + for (size_t i = 0; i < ctx.InputRange().size(); i++) { + ins_auto_grad_metas[i] = + egr::EagerUtils::nullable_autograd_meta(ctx.InputsBetween( + ctx.InputRangeAt(i).first, ctx.InputRangeAt(i).second)); + } + for (size_t i = 0; i < ctx.OutputRange().size(); i++) { + outs_auto_grad_metas[i] = + egr::EagerUtils::unsafe_autograd_meta(ctx.OutputsBetweeen( + ctx.OutputRangeAt(i).first, ctx.OutputRangeAt(i).second)); + } + bool require_any_grad = false; + for (size_t i = 0; i < ins_auto_grad_metas.size(); i++) { + require_any_grad = + require_any_grad || egr::EagerUtils::ComputeRequireGrad( + trace_backward, &(ins_auto_grad_metas[i])); + } + if (require_any_grad) { + VLOG(6) << " Construct Grad for Custom Op: " << op_type; + ConstructFwdAndBwdMap(vec_map, op_type); + for (size_t i = 0; i < outs_auto_grad_metas.size(); i++) { + egr::EagerUtils::PassStopGradient(false, &(outs_auto_grad_metas[i])); + } + auto grad_node = std::make_shared( + outs_auto_grad_metas.size(), ins_auto_grad_metas.size(), op_type); + auto slot_map = + egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type); + // Prepare Grad outputs + size_t no_grad_cnt = 0; + for (size_t i = 0; i < ins_auto_grad_metas.size(); i++) { + const std::vector& in_tensors = + ctx.InputsBetween(ctx.InputRangeAt(i).first, + ctx.InputRangeAt(i).second); + + if (slot_map[0].find(i) != slot_map[0].end()) { + grad_node->SetGradOutMeta(in_tensors, slot_map[0][i]); + grad_node->AddEdges(&ins_auto_grad_metas[i], slot_map[0][i]); + } else { + grad_node->SetGradOutMeta(in_tensors, + ins_auto_grad_metas.size() - 1 - no_grad_cnt); + grad_node->AddEdges(&ins_auto_grad_metas[i], + ins_auto_grad_metas.size() - 1 - no_grad_cnt); + no_grad_cnt++; + } + } + // Prepare Grad inputs with grad of fwd outputs + for (size_t i = 0; i < outs_auto_grad_metas.size(); i++) { + const std::vector& out_tensors = + ctx.OutputsBetweeen(ctx.OutputRangeAt(i).first, + ctx.OutputRangeAt(i).second); + + egr::EagerUtils::SetOutRankWithSlot(&(outs_auto_grad_metas[i]), i); + egr::EagerUtils::SetHistory(&(outs_auto_grad_metas[i]), grad_node); + grad_node->SetGradInMeta(out_tensors, i); + egr::EagerUtils::CheckAndRetainGrad(out_tensors); + } + + // Prepare Grad inputs with fwd outputs + for (auto it = slot_map[2].begin(); it != slot_map[2].end(); it++) { + VLOG(7) << "Prepare fwd_outs: " << it->first + << " to grad_inputs: " << it->second; + grad_node->fwd_outs[it->second] = + egr::RunCustomOpNode::ConstructTensorWrapper( + ctx.OutputsBetweeen(ctx.OutputRangeAt(it->first).first, + ctx.OutputRangeAt(it->first).second)); + } + + // Prepare Grad inputs with fwd inputs + for (auto it = slot_map[3].begin(); it != slot_map[3].end(); it++) { + VLOG(7) << "Prepare fwd_ins: " << it->first + << " to grad_inputs: " << it->second; + 
grad_node->fwd_ins[it->second] = + egr::RunCustomOpNode::ConstructTensorWrapper( + ctx.InputsBetween(ctx.InputRangeAt(it->first).first, + ctx.InputRangeAt(it->first).second)); + } + + auto attrs_names = paddle::framework::OpMetaInfoHelper::GetAttrs( + meta_info_map.at(op_type)[1]); + std::vector attrs(attrs_names.size()); + // Prepare attrs for Grad node + for (auto it = slot_map[4].begin(); it != slot_map[4].end(); it++) { + VLOG(7) << "Prepare fwd attrs: " << it->first + << " to grad_attrs: " << it->second; + attrs[it->second] = res_attrs[it->first]; + } + grad_node->SetAttrs(attrs); + } + Py_INCREF(Py_None); + return Py_None; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + PyMethodDef variable_functions[] = { + // TODO(jiabin): Remove scale when we have final state tests {"scale", (PyCFunction)(void (*)(void))eager_api_scale, METH_VARARGS | METH_KEYWORDS, NULL}, {"_set_expected_place", @@ -179,6 +480,11 @@ PyMethodDef variable_functions[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"run_backward", (PyCFunction)(void (*)(void))eager_api_run_backward, METH_VARARGS | METH_KEYWORDS, NULL}, + {"run_partial_grad", + (PyCFunction)(void (*)(void))eager_api_run_partial_grad, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"_run_custom_op", (PyCFunction)(void (*)(void))eager_api_run_costum_op, + METH_VARARGS | METH_KEYWORDS, NULL}, {"tensor_copy", (PyCFunction)(void (*)(void))eager_api_tensor_copy, METH_VARARGS | METH_KEYWORDS, NULL}, {"read_next_tensor_list", diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index e5f22338dc61543a377d4a94307f834b774257d4..49745e5679d9af3c8cf07ba0cac679217a691052 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -36,6 +36,8 @@ limitations under the License. 
*/ #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/core/sparse_csr_tensor.h" namespace paddle { namespace pybind { @@ -214,8 +216,8 @@ static PyObject* tensor_method__is_initialized(TensorObject* self, static PyObject* tensor_method__copy_to(TensorObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY - bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 0), 0); - auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 1), 1); + auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 0), 0); + bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 1), 1); auto cp_tensor = self->tensor.copy_to(phi::TransToPhiBackend(place), blocking); egr::EagerUtils::autograd_meta(&cp_tensor)->SetStopGradient(true); @@ -226,6 +228,19 @@ static PyObject* tensor_method__copy_to(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* tensor_method_cpu(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + auto cp_tensor = + self->tensor.copy_to(phi::TransToPhiBackend(phi::CPUPlace()), true); + egr::EagerUtils::autograd_meta(&cp_tensor)->SetStopGradient(true); + egr::EagerUtils::autograd_meta(&cp_tensor) + ->SetPersistable( + egr::EagerUtils::autograd_meta(&(self->tensor))->Persistable()); + return ToPyObject(cp_tensor); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + static PyObject* tensor_method_reconstruct_from_(TensorObject* self, PyObject* args, PyObject* kwargs) { @@ -264,7 +279,7 @@ static PyObject* tensor_method_copy_(TensorObject* self, PyObject* args, egr::EagerUtils::autograd_meta(&(src_tensor))->Persistable()); } - self->tensor.copy_(src_tensor, blocking); + self->tensor.copy_(src_tensor, self->tensor.inner_place(), blocking); VLOG(6) << "Finish Copy Tensor " << src_tensor.name() << " to " << self->tensor.name(); @@ -314,23 +329,25 @@ static PyObject* tensor_clear_gradient(TensorObject* self, PyObject* args, grad = meta->MutableGrad(); } - if (grad->is_selected_rows()) { - auto selected_rows = - std::dynamic_pointer_cast(grad->impl()); - if (selected_rows->mutable_value()->IsInitialized()) { - selected_rows->mutable_rows()->clear(); - selected_rows->mutable_value()->clear(); - } - } else if (grad->is_dense_tensor()) { - if (grad->initialized()) { - if (set_to_zero) { - grad->set_impl(paddle::experimental::zeros_like(*grad).impl()); - } else { - VLOG(4) << "Gradient of " << self->tensor.name() - << " is initialized, will be released."; - auto dense_tensor = - std::dynamic_pointer_cast(grad->impl()); - dense_tensor->MoveMemoryHolder(); + if (grad->impl()) { + if (grad->is_selected_rows()) { + auto selected_rows = + std::dynamic_pointer_cast(grad->impl()); + if (selected_rows->mutable_value()->IsInitialized()) { + selected_rows->mutable_rows()->clear(); + selected_rows->mutable_value()->clear(); + } + } else if (grad->is_dense_tensor()) { + if (grad->initialized()) { + if (set_to_zero) { + grad->set_impl(paddle::experimental::zeros_like(*grad).impl()); + } else { + VLOG(4) << "Gradient of " << self->tensor.name() + << " is initialized, will be released."; + auto dense_tensor = + std::dynamic_pointer_cast(grad->impl()); + dense_tensor->MoveMemoryHolder(); + } } } } @@ -688,6 +705,122 @@ static PyObject* tensor_register_reduce_hook(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* set_grad_type(TensorObject* self, PyObject* args, + PyObject* kwargs) { + 
EAGER_TRY + auto var_type = pybind::CastPyArg2ProtoType(PyTuple_GET_ITEM(args, 0), 0); + auto grad_tensor = + egr::EagerUtils::unsafe_autograd_meta(self->tensor)->Grad(); + if (var_type == framework::proto::VarType::LOD_TENSOR) { + grad_tensor.set_impl(std::make_shared()); + } else if (var_type == framework::proto::VarType::SELECTED_ROWS) { + grad_tensor.set_impl(std::make_shared()); + } + return Py_None; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* tensor_method_get_non_zero_indices(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + PADDLE_ENFORCE(self->tensor.is_sparse_coo_tensor(), + paddle::platform::errors::Fatal( + "this method is only effective for SparseCooTensor")); + auto sparse_coo_tensor = + std::dynamic_pointer_cast(self->tensor.impl()); + paddle::experimental::Tensor tensor(std::make_shared( + sparse_coo_tensor->non_zero_indices())); + return ToPyObject(tensor); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* tensor_method_get_non_zero_elements(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + PADDLE_ENFORCE( + self->tensor.is_sparse_coo_tensor() || + self->tensor.is_sparse_csr_tensor(), + paddle::platform::errors::Fatal("this method is only effective for " + "SparseCooTensor or SparseCsrTensor")); + if (self->tensor.is_sparse_coo_tensor()) { + auto sparse_coo_tensor = + std::dynamic_pointer_cast(self->tensor.impl()); + paddle::experimental::Tensor tensor(std::make_shared( + sparse_coo_tensor->non_zero_elements())); + return ToPyObject(tensor); + } else { + auto sparse_csr_tensor = + std::dynamic_pointer_cast(self->tensor.impl()); + paddle::experimental::Tensor tensor(std::make_shared( + sparse_csr_tensor->non_zero_elements())); + return ToPyObject(tensor); + } + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* tensor_method_get_non_zero_crows(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + PADDLE_ENFORCE(self->tensor.is_sparse_csr_tensor(), + paddle::platform::errors::Fatal( + "this method is only effective for SparseCsrTensor")); + auto sparse_csr_tensor = + std::dynamic_pointer_cast(self->tensor.impl()); + paddle::experimental::Tensor tensor( + std::make_shared(sparse_csr_tensor->non_zero_crows())); + return ToPyObject(tensor); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* tensor_method_get_non_zero_cols(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + PADDLE_ENFORCE(self->tensor.is_sparse_csr_tensor(), + paddle::platform::errors::Fatal( + "this method is only effective for SparseCsrTensor")); + auto sparse_csr_tensor = + std::dynamic_pointer_cast(self->tensor.impl()); + paddle::experimental::Tensor tensor( + std::make_shared(sparse_csr_tensor->non_zero_cols())); + return ToPyObject(tensor); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* tensor_method_is_sparse(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + return ToPyObject(self->tensor.is_sparse_coo_tensor() || + self->tensor.is_sparse_csr_tensor()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* tensor_method_is_sparse_coo(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + return ToPyObject(self->tensor.is_sparse_coo_tensor()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* tensor_method_is_sparse_csr(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + return ToPyObject(self->tensor.is_sparse_csr_tensor()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* 
tensor__inplace_version(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + uint32_t inplace_version = self->tensor.current_inplace_version(); + + return ToPyObject(inplace_version); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + PyMethodDef variable_methods[] = { {"numpy", (PyCFunction)(void (*)(void))tensor_method_numpy, METH_VARARGS | METH_KEYWORDS, NULL}, @@ -734,6 +867,30 @@ PyMethodDef variable_methods[] = { {"_register_backward_hook", (PyCFunction)(void (*)(void))tensor_register_reduce_hook, METH_VARARGS | METH_KEYWORDS, NULL}, + {"_set_grad_type", (PyCFunction)(void (*)(void))set_grad_type, + METH_VARARGS | METH_KEYWORDS, NULL}, + /***the method of sparse tensor****/ + {"non_zero_indices", + (PyCFunction)(void (*)(void))tensor_method_get_non_zero_indices, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"non_zero_elements", + (PyCFunction)(void (*)(void))tensor_method_get_non_zero_elements, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"non_zero_crows", + (PyCFunction)(void (*)(void))tensor_method_get_non_zero_crows, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"non_zero_cols", + (PyCFunction)(void (*)(void))tensor_method_get_non_zero_cols, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"is_sparse", (PyCFunction)(void (*)(void))tensor_method_is_sparse, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"is_sparse_coo", (PyCFunction)(void (*)(void))tensor_method_is_sparse_coo, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"is_sparse_csr", (PyCFunction)(void (*)(void))tensor_method_is_sparse_csr, + METH_VARARGS | METH_KEYWORDS, NULL}, + /***the method of sparse tensor****/ + {"_inplace_version", (PyCFunction)(void (*)(void))tensor__inplace_version, + METH_VARARGS | METH_KEYWORDS, NULL}, {NULL, NULL, 0, NULL}}; } // namespace pybind diff --git a/paddle/fluid/pybind/eager_op_function_generator.cc b/paddle/fluid/pybind/eager_op_function_generator.cc index 102cdbb91ab066c4a6d499688bca30c1c3d185ad..685e20aef2591492340d228f0a48d7a426ddb889 100644 --- a/paddle/fluid/pybind/eager_op_function_generator.cc +++ b/paddle/fluid/pybind/eager_op_function_generator.cc @@ -162,17 +162,22 @@ static inline std::string TempName(const std::string& name) { std::string GenerateOpFunctionsBody( const paddle::framework::proto::OpProto* op_proto, std::string func_name, - bool use_inplace_strategy = false, std::map inplace_map = {}) { auto& op_type = op_proto->type(); std::string input_args = ""; - std::string call_api_str = "auto out = " + op_type + "_dygraph_function("; + std::string call_api_str = ""; std::string ins_initializer_with_null = ""; std::string py_arg = ""; int arg_idx = 0; int input_args_num = 0; std::string ins_cast_str = ""; std::string view_strategy_str = ""; + if (!inplace_map.empty()) { + // change call_api_str for inplace op + call_api_str = "auto out = " + op_type + "__dygraph_function("; + } else { + call_api_str = "auto out = " + op_type + "_dygraph_function("; + } for (auto& input : op_proto->inputs()) { auto& in_name = input.name(); // skip those dispensable inputs, like ResidualData in conv2d @@ -288,8 +293,31 @@ std::string GenerateOpFunctionsBody( HANDLE_VIEW_BETWEEN_INPUT_AND_OUTPUT, viwe_input_name, viwe_output_name, viwe_input_name, viwe_output_name); } - - return_str = "return ToPyObject(out);"; + if (!inplace_map.empty()) { + // For inplace op, Use the input PyObject directly. + for (auto& inplace_pair : inplace_map) { + // Find index of inplace tensor, and directly use input PyObject. 
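The practical effect of "directly use input PyObject", as the comment above puts it, is that the Python-level result of a generated in-place `op_` function is the very same object as the tensor that was passed in. A minimal sketch of that contract, assuming `reshape_` is among the ops for which an in-place variant is generated (the concrete op chosen here is an assumption, not taken from this patch):

 .. code-block:: python

    import paddle

    x = paddle.ones([2, 3])
    out = paddle.reshape_(x, [3, 2])   # in-place variant generated as "<op>_"
    # The generated binding returns the caller's PyObject for the in-place
    # output, so no new Python object is created.
    assert out is x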
+ std::string inplace_arg_name = inplace_pair.second; + std::string inplace_return_name = inplace_pair.first; + const char* RETURN_INPLACE_TENSOR_TEMPLATE = + "ssize_t arg_id = GetIdxFromCoreOpsInfoMap(core_ops_args_info, " + "\"%s\", \"%s\");\n" + " ssize_t return_id = " + "GetIdxFromCoreOpsInfoMap(core_ops_returns_info, \"%s\", \"%s\");\n" + " return ToPyObject(out, return_id, args, arg_id);"; + return_str = paddle::string::Sprintf(RETURN_INPLACE_TENSOR_TEMPLATE, + op_type, inplace_arg_name, op_type, + inplace_return_name); + // only support one inplace_var in temporary. + PADDLE_ENFORCE_EQ( + inplace_map.size(), 1, + paddle::platform::errors::InvalidArgument( + "size of inplace_map must be 1, but got %d", inplace_map.size())); + break; + } + } else { + return_str = "return ToPyObject(out);"; + } std::string function_args = ""; if (input_args == "") { @@ -383,7 +411,8 @@ GenerateOpFunctions() { continue; } std::string func_name = "eager_api_" + op_type; - std::string op_function_str = GenerateOpFunctionsBody(op_proto, func_name); + std::string op_function_str = + GenerateOpFunctionsBody(op_proto, func_name, {}); // generate pybind item auto bind_function_str = paddle::string::Sprintf( @@ -391,6 +420,40 @@ GenerateOpFunctions() { op_function_list.emplace_back(std::move(op_function_str)); bind_function_list.emplace_back(std::move(bind_function_str)); + + // NOTE(pangyoki): Inplace Strategy. + // In this case, output will reuse input varbase. + // Dygraph mode needs to be aligned with the in-place strategy in static + // mode, and the mapping relationships between output and input that have + // been defined in static mode should be used in dygraph mode. + // Find which ops need to use Inplace strategy in static mode, and get the + // mapping relationship between Inplace output and input. + auto& infer_inplace = + paddle::framework::OpInfoMap::Instance().Get(op_type).infer_inplace_; + std::map inplace_map; + // `sum` op has duplicate input. Don't consider adding inplace strategy + // for `sum` in temporary. + if (op_type != "sum" && infer_inplace) { + // Inplace OP: op_type_. + // The inplace OP needs a new implementation method. 
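The map handed to `GenerateOpFunctionsBody` is simply the reverse of what `infer_inplace` reports: static-graph inference yields input-to-output pairs, while the generator keys on the output name and registers a sibling `op_type_` entry point for it. A rough Python-level picture of that bookkeeping (the op and variable names are illustrative assumptions):

 .. code-block:: python

    # infer_inplace(True) reports which input each output may reuse,
    # e.g. for a hypothetical op: input "X" is reused by output "Out".
    in_to_outs = {"X": "Out"}

    # The generator keeps the reverse view (output -> input) ...
    inplace_map = {out: inp for inp, out in in_to_outs.items()}   # {"Out": "X"}

    # ... and emits an extra function named "<op_type>_" for it.
    op_type = "my_op"
    inplace_op_type = op_type + "_"              # "my_op_"
    inplace_func_name = "eager_api_" + inplace_op_type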
+ auto in_to_outs = infer_inplace(true); + for (auto& inplace_pair : in_to_outs) { + inplace_map[inplace_pair.second] = inplace_pair.first; + } + + std::string inplace_op_type = op_type + "_"; + std::string inplace_func_name = "eager_api_" + inplace_op_type; + std::string inplace_op_function_str = + GenerateOpFunctionsBody(op_proto, inplace_func_name, inplace_map); + + // generate pybind item + auto inplace_bind_function_str = + paddle::string::Sprintf(PYBIND_ITEM_TEMPLATE, inplace_op_type, + inplace_func_name, inplace_op_type); + + op_function_list.emplace_back(std::move(inplace_op_function_str)); + bind_function_list.emplace_back(std::move(inplace_bind_function_str)); + } } if (append_custom_head_file) { op_function_list.emplace_back(CUSTOM_HANDWRITE_OP_FUNC_FILE); diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index 2e1390cb96155c4832a8ceace889e331039ed43f..ff8980d727e70a41223878f22f019353f8b71972 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -72,7 +72,7 @@ PyObject* tensor_properties_get_grad(TensorObject* self, void* closure) { EAGER_TRY VLOG(6) << "Get grad for tensor: " << self->tensor.name(); auto meta = egr::EagerUtils::nullable_autograd_meta(self->tensor); - if (meta) { + if (meta && meta->Grad().initialized()) { return ToPyObject(meta->Grad()); } else { Py_INCREF(Py_None); @@ -96,7 +96,7 @@ int tensor_properties_set_grad(TensorObject* self, PyObject* value, "Detected NULL grad" "Please check if you have manually cleared" "the grad inside autograd_meta")); - grad->copy_(src, true); + grad->copy_(src, self->tensor.inner_place(), true); return 0; EAGER_CATCH_AND_THROW_RETURN_ZERO } diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index f4e148cf8dceb5211c368fa00211b2c7b9f0a725..a23bb1230e128657e0bd416d7e1875997e6cf6e8 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -27,10 +27,10 @@ limitations under the License. 
*/ #include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/op_function_common.h" #include "paddle/fluid/pybind/tensor_py.h" +#include "paddle/phi/api/ext/op_meta_info.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" - namespace paddle { namespace pybind { @@ -46,6 +46,7 @@ extern PyTypeObject* g_npuplace_pytype; extern PyTypeObject* g_cudapinnedplace_pytype; extern PyTypeObject* g_framework_tensor_pytype; extern PyTypeObject* g_framework_lodtensorarray_pytype; +extern PyTypeObject* g_custom_op_kernel_ctx_pytype; int TensorDtype2NumpyDtype(phi::DataType dtype) { switch (dtype) { @@ -184,7 +185,7 @@ paddle::experimental::Tensor CastPyArg2Tensor(PyObject* obj, ssize_t arg_pos) { } else { PADDLE_THROW(platform::errors::InvalidArgument( "argument (position %d) must be " - "EagerVariable, but got %s", + "Tensor, but got %s", arg_pos + 1, reinterpret_cast(obj->ob_type)->tp_name)); } } @@ -319,7 +320,7 @@ framework::Tensor CastPyArg2FrameworkTensor(PyObject* obj, ssize_t arg_pos) { } else { PADDLE_THROW(platform::errors::InvalidArgument( "argument (position %d) must be " - "EagerVariable, but got %s", + "DenseTensor, but got %s", arg_pos + 1, reinterpret_cast(obj->ob_type)->tp_name)); } } @@ -391,6 +392,19 @@ paddle::framework::proto::VarType::Type CastPyArg2ProtoType(PyObject* obj, return dtype; } +paddle::CustomOpKernelContext CastPyArg2CustomOpKernelContext(PyObject* obj, + ssize_t arg_pos) { + if (PyObject_IsInstance( + obj, reinterpret_cast(g_custom_op_kernel_ctx_pytype))) { + return ::pybind11::handle(obj).cast(); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "one of(Place,CUDAPlace,CPUPlace,XPUPlace,NPUPlace,CUDAPinnedPlace), " + "but got %s", + arg_pos + 1, reinterpret_cast(obj->ob_type)->tp_name)); + } +} PyObject* ToPyObject(bool value) { if (value) { Py_INCREF(Py_True); @@ -403,6 +417,8 @@ PyObject* ToPyObject(bool value) { PyObject* ToPyObject(int value) { return PyLong_FromLong(value); } +PyObject* ToPyObject(uint32_t value) { return PyLong_FromUnsignedLong(value); } + PyObject* ToPyObject(int64_t value) { return PyLong_FromLongLong(value); } PyObject* ToPyObject(float value) { return PyLong_FromDouble(value); } @@ -428,6 +444,20 @@ PyObject* ToPyObject(const paddle::experimental::Tensor& value) { return obj; } +PyObject* ToPyObject(const paddle::experimental::Tensor& value, + ssize_t value_idx, PyObject* args, ssize_t arg_idx) { + // For inplace op, directly return the input PyObject of the inplace tensor. + // [Parameter] + // value: Useless parameter. + // value_idx: Useless parameter. + // args: Input PyObject. + // arg_idx: Index of inplace PyObject in input args. Used to find the input + // inplace PyObject. 
+ PyObject* obj = PyTuple_GET_ITEM(args, arg_idx); + Py_INCREF(obj); + return obj; +} + PyObject* ToPyObject(const std::vector& value) { PyObject* result = PyList_New((Py_ssize_t)value.size()); @@ -478,20 +508,26 @@ PyObject* ToPyObject(const std::vector& value) { return result; } -PyObject* ToPyObject(const std::vector& value) { +PyObject* ToPyObject(const std::vector& value, + bool return_py_none_if_not_initialize) { PyObject* result = PyList_New((Py_ssize_t)value.size()); for (size_t i = 0; i < value.size(); i++) { - PyObject* obj = p_tensor_type->tp_alloc(p_tensor_type, 0); - if (obj) { - auto v = reinterpret_cast(obj); - new (&(v->tensor)) paddle::experimental::Tensor(); - v->tensor = value[i]; + if (!value[i].initialized() && return_py_none_if_not_initialize) { + Py_INCREF(Py_None); + PyList_SET_ITEM(result, static_cast(i), Py_None); } else { - PADDLE_THROW(platform::errors::Fatal( - "tp_alloc return null, can not new a PyObject.")); + PyObject* obj = p_tensor_type->tp_alloc(p_tensor_type, 0); + if (obj) { + auto v = reinterpret_cast(obj); + new (&(v->tensor)) paddle::experimental::Tensor(); + v->tensor = value[i]; + } else { + PADDLE_THROW(platform::errors::Fatal( + "tp_alloc return null, can not new a PyObject.")); + } + PyList_SET_ITEM(result, static_cast(i), obj); } - PyList_SET_ITEM(result, static_cast(i), obj); } return result; @@ -928,6 +964,5 @@ paddle::experimental::DataType CastPyArg2DataType(PyObject* obj, framework::proto::VarType::Type type = CastPyArg2ProtoType(obj, arg_pos); return framework::TransToPhiDataType(type); } - } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index 966a920377b38f160a1d4789ef4b04b61d47f2c1..fba1485bcf44ea70db286225fbbe3c70caceb4bd 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -20,10 +20,10 @@ limitations under the License. 
*/ #include "pybind11/pybind11.h" #include "pybind11/stl.h" namespace paddle { +class CustomOpKernelContext; namespace framework { class Scope; } - namespace pybind { typedef struct { @@ -40,6 +40,8 @@ int CastPyArg2AttrInt(PyObject* obj, ssize_t arg_pos); int64_t CastPyArg2AttrLong(PyObject* obj, ssize_t arg_pos); float CastPyArg2AttrFloat(PyObject* obj, ssize_t arg_pos); std::string CastPyArg2AttrString(PyObject* obj, ssize_t arg_pos); +paddle::CustomOpKernelContext CastPyArg2CustomOpKernelContext(PyObject* obj, + ssize_t arg_pos); paddle::experimental::Tensor CastPyArg2Tensor(PyObject* obj, ssize_t arg_pos); std::shared_ptr CastPyArg2VarBase(PyObject* obj, ssize_t arg_pos); @@ -52,7 +54,9 @@ std::vector CastPyArg2VectorOfTensorBase(PyObject* obj, std::vector CastPyArg2VectorOfInt(PyObject* obj, size_t arg_pos); framework::proto::VarType::Type CastPyArg2ProtoType(PyObject* obj, ssize_t arg_pos); + PyObject* ToPyObject(int value); +PyObject* ToPyObject(uint32_t value); PyObject* ToPyObject(bool value); PyObject* ToPyObject(int64_t value); PyObject* ToPyObject(float value); @@ -60,12 +64,15 @@ PyObject* ToPyObject(double value); PyObject* ToPyObject(const char* value); PyObject* ToPyObject(const std::string& value); PyObject* ToPyObject(const paddle::experimental::Tensor& value); +PyObject* ToPyObject(const paddle::experimental::Tensor& value, + ssize_t value_idx, PyObject* args, ssize_t arg_idx); PyObject* ToPyObject(const std::vector& value); PyObject* ToPyObject(const std::vector& value); PyObject* ToPyObject(const std::vector& value); PyObject* ToPyObject(const std::vector& value); PyObject* ToPyObject(const std::vector& value); -PyObject* ToPyObject(const std::vector& value); +PyObject* ToPyObject(const std::vector& value, + bool return_py_none_if_not_initialize = false); PyObject* ToPyObject(const platform::Place& value); PyObject* ToPyObject(const framework::LoDTensor* value); PyObject* ToPyObject(const paddle::framework::proto::VarType::Type& dtype); @@ -80,6 +87,17 @@ struct TupleTensorResult { TupleTensorResult::Run(out, result); PyTuple_SET_ITEM(result, N - 1, ToPyObject(std::get(out))); } + + static void Run(const Tuple& out, PyObject* result, ssize_t value_idx, + PyObject* args, ssize_t arg_idx) { + TupleTensorResult::Run(out, result, value_idx, args, arg_idx); + if (N - 1 == value_idx) { + PyTuple_SET_ITEM(result, N - 1, ToPyObject(std::get(out), + value_idx, args, arg_idx)); + } else { + PyTuple_SET_ITEM(result, N - 1, ToPyObject(std::get(out))); + } + } }; template @@ -87,6 +105,16 @@ struct TupleTensorResult { static void Run(const Tuple& out, PyObject* result) { PyTuple_SET_ITEM(result, 0, ToPyObject(std::get<0>(out))); } + + static void Run(const Tuple& out, PyObject* result, ssize_t value_idx, + PyObject* args, ssize_t arg_idx) { + if (value_idx == 0) { + PyTuple_SET_ITEM(result, 0, + ToPyObject(std::get<0>(out), value_idx, args, arg_idx)); + } else { + PyTuple_SET_ITEM(result, 0, ToPyObject(std::get<0>(out))); + } + } }; template @@ -99,6 +127,26 @@ PyObject* ToPyObject(const std::tuple& out) { return result; } +template +PyObject* ToPyObject(const std::tuple& out, ssize_t value_idx, + PyObject* args, ssize_t arg_idx) { + // For inplace op, directly return the input PyObject of the inplace tensor. + // [Parameter] + // out: Outputs tuple after executing op. + // value_idx: Index of inplace tensor in outputs tuple. Used to find the + // output inplace tensor. + // args: Input PyObject. + // arg_idx: Index of inplace PyObject in input args. 
Used to find the input + // inplace PyObject. + auto len = sizeof...(Args); + PyObject* result = PyTuple_New(len); + + TupleTensorResult::Run(out, result, value_idx, + args, arg_idx); + + return result; +} + paddle::experimental::Scalar CastPyArg2Scalar(PyObject* obj, const std::string& op_type, ssize_t arg_pos); @@ -138,6 +186,7 @@ std::vector GetTensorPtrListFromArgs( ssize_t arg_idx, bool dispensable = false); // end of Slice related methods + std::vector GetScopePtrListFromArgs( const std::string& op_type, const std::string& arg_name, PyObject* args, ssize_t arg_idx, bool dispensable); diff --git a/paddle/fluid/pybind/exception.cc b/paddle/fluid/pybind/exception.cc index 362a3e44fab6254bef591bfd144e071821846271..4f25a6f1a5ca8d1a7926d148830934370e323e0f 100644 --- a/paddle/fluid/pybind/exception.cc +++ b/paddle/fluid/pybind/exception.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/pybind/exception.h" - +#include "paddle/phi/api/ext/exception.h" namespace paddle { namespace pybind { @@ -122,6 +122,8 @@ void ThrowExceptionToPython(std::exception_ptr p) { PyErr_SetString(EnforceNotMetException, e.what()); break; } + } catch (const paddle::PD_Exception& e) { + PyErr_SetString(PyExc_OSError, e.what()); } } } // namespace pybind diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc index 3145a9cf7655c053c269990e00982226eae49c7a..01dae420cc6ab84edc0b0df11b0b4cf6408a87f7 100644 --- a/paddle/fluid/pybind/fleet_py.cc +++ b/paddle/fluid/pybind/fleet_py.cc @@ -225,7 +225,7 @@ void BindGraphPyClient(py::module* m) { .def("stop_server", &GraphPyClient::stop_server) .def("get_node_feat", [](GraphPyClient& self, std::string node_type, - std::vector node_ids, + std::vector node_ids, std::vector feature_names) { auto feats = self.get_node_feat(node_type, node_ids, feature_names); @@ -239,7 +239,7 @@ void BindGraphPyClient(py::module* m) { }) .def("set_node_feat", [](GraphPyClient& self, std::string node_type, - std::vector node_ids, + std::vector node_ids, std::vector feature_names, std::vector> bytes_feats) { std::vector> feats(bytes_feats.size()); diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 9b373a58181f165b52fe809b817977a076b7d961..3a2c93309f34454ae0ce2d3419e3fce474f7c06b 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -52,11 +52,13 @@ limitations under the License. 
*/ #include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/memory/allocation/mmap_allocator.h" #include "paddle/fluid/operators/utils.h" +#include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/op_function.h" #include "paddle/fluid/pybind/pybind_boost_headers.h" #include "paddle/fluid/pybind/slice_utils.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/phi/core/compat/arg_map_context.h" +#include "paddle/phi/core/compat/type_defs.h" namespace paddle { namespace pybind { @@ -117,7 +119,11 @@ class PyVariableWrapperHook : public imperative::VariableWrapperHook { return var; } - return PyObjectCast>(res)->SharedVar(); + auto res_varbase = PyObjectCast>(res); + // Here the reference count of `res` is 2, so we decreases the reference + // count manually to avoid memory leaks + Py_DECREF(res); + return res_varbase->SharedVar(); } private: @@ -436,6 +442,28 @@ static imperative::NameVarBaseMap ConvertToNameVarBaseMap( return result; } +paddle::imperative::NameTensorMap ConvertToNameTensorMap( + const PyNameVarBaseMap &map) { + paddle::imperative::NameTensorMap result; + for (auto &pair : map) { + auto var_vec = CastPyArg2VectorOfTensor(pair.second.ptr(), 0); + if (!var_vec.empty()) { + // change vector -> vector> + std::vector> dst_var_vec; + for (auto &v : var_vec) { + dst_var_vec.emplace_back( + std::make_shared(std::move(v))); + } + result.emplace(pair.first, std::move(dst_var_vec)); + } + } + + PADDLE_ENFORCE_EQ( + PyErr_Occurred(), nullptr, + platform::errors::InvalidArgument(py::str(py::handle(PyErr_Occurred())))); + return result; +} + template static void VarBaseCopy(std::shared_ptr &src, // NOLINT imperative::VarBase &dst, // NOLINT @@ -2079,8 +2107,8 @@ void BindImperative(py::module *m_ptr) { const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, framework::AttributeMap attrs) { // TODO(xiongkun): move this function outside of tracer. 
- auto ins_map = ConvertToNameVarBaseMap(ins); - auto outs_map = ConvertToNameVarBaseMap(outs); + auto ins_map = ConvertToNameTensorMap(ins); + auto outs_map = ConvertToNameTensorMap(outs); { auto to_vector = [](paddle::SmallVector &vec) { return std::vector(vec.begin(), vec.end()); diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index b008308e27d9afaa9d8c47290489d50a762f2a41..c8f0acd0b8a853f541a6fb8cbafe73f27688c71a 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -551,6 +551,9 @@ void BindAnalysisConfig(py::module *m) { .def("params_file", &AnalysisConfig::params_file) .def("enable_use_gpu", &AnalysisConfig::EnableUseGpu, py::arg("memory_pool_init_size_mb"), py::arg("device_id") = 0) + .def("exp_enable_use_gpu_fp16", &AnalysisConfig::Exp_EnableUseGpuFp16, + py::arg("gpu_fp16_disabled_op_types") = + std::unordered_set({})) .def("enable_xpu", &AnalysisConfig::EnableXpu, py::arg("l3_workspace_size") = 16 * 1024 * 1024, py::arg("locked") = false, py::arg("autotune") = true, diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc index bb45c1c40603f953c70f0e63b6e762037312e8c3..ecbacd37d5666b85d5ddaef595d106e2400b055c 100644 --- a/paddle/fluid/pybind/ir.cc +++ b/paddle/fluid/pybind/ir.cc @@ -143,6 +143,7 @@ void BindNode(py::module *m) { .def("var", &Node::Var, return_value_policy::reference) .def("op", &Node::Op, return_value_policy::reference) .def("id", &Node::id) + .def("graph_id", &Node::GraphId) .def("original_desc_id", &Node::OriginalDescId) .def("is_op", &Node::IsOp) .def("is_var", &Node::IsVar) diff --git a/paddle/fluid/pybind/kernel_signature_generator.cc b/paddle/fluid/pybind/kernel_signature_generator.cc index 8d78adaf5a4735d87e2206df6c8b55875db68118..1520174fba288b4ecf683e79c36b6c0228237b2e 100644 --- a/paddle/fluid/pybind/kernel_signature_generator.cc +++ b/paddle/fluid/pybind/kernel_signature_generator.cc @@ -46,10 +46,19 @@ int main(int argc, char **argv) { auto &kernel_factory = phi::KernelFactory::Instance(); std::string kernel_signature_map_str{"{"}; for (const auto &op_kernel_pair : kernel_factory.kernels()) { - if (kernel_signature_map.Has(op_kernel_pair.first)) { + std::string op_name = op_kernel_pair.first; + const paddle::flat_hash_map &kernel_name_map = + phi::OpUtilsMap::Instance().base_kernel_name_map(); + for (auto &it : kernel_name_map) { + if (it.second == op_name) { + op_name = it.first; + break; + } + } + if (kernel_signature_map.Has(op_name)) { kernel_signature_map_str = kernel_signature_map_str + "\"" + op_kernel_pair.first + "\":{"; - auto &args = kernel_signature_map.Get(op_kernel_pair.first).args; + auto &args = kernel_signature_map.Get(op_name).args; kernel_signature_map_str += "\"inputs\":["; auto inputs_ = std::get<0>(args); diff --git a/paddle/fluid/pybind/op_function_common.cc b/paddle/fluid/pybind/op_function_common.cc index 09c3cea398b2aec4d7cf0953ffb0aed75de37601..1d483abd7746c104c3f1dcf318f45850e4fcb855 100644 --- a/paddle/fluid/pybind/op_function_common.cc +++ b/paddle/fluid/pybind/op_function_common.cc @@ -854,5 +854,30 @@ void InitOpsAttrTypeMap() { } } +ssize_t GetIdxFromCoreOpsInfoMap( + const std::unordered_map>& + core_ops_info_map, + const std::string& op_type, const std::string& name) { + // `core_ops_info_map` can be `core_ops_args_info` or `core_ops_returns_info`. + // `core_ops_args_info`: get index from core_ops_args_info[op_type] according + // to input name. 
+ // `core_ops_returns_info`: get index from core_ops_returns_info[op_type] + // according to return name. + if (!core_ops_info_map.count(op_type)) { + PADDLE_THROW(platform::errors::Fatal( + "Op %s is not found in core_ops_*_info map.", op_type)); + } else { + auto args_list = core_ops_info_map.at(op_type); + auto it = std::find(args_list.begin(), args_list.end(), name); + if (it == args_list.end()) { + PADDLE_THROW(platform::errors::Fatal("%s is not found in op %s's args.", + name, op_type)); + } else { + return std::distance(args_list.begin(), it); + } + } + return -1; +} + } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/op_function_common.h b/paddle/fluid/pybind/op_function_common.h index 7ead9852667252d189b1fcdecc6b4ac7b86d785f..33d0e242a027d250904a21ca36a39b6a639178e1 100644 --- a/paddle/fluid/pybind/op_function_common.h +++ b/paddle/fluid/pybind/op_function_common.h @@ -146,5 +146,10 @@ unsigned long GetUnsignedLongFromArgs( // NOLINT void InitOpsAttrTypeMap(); +ssize_t GetIdxFromCoreOpsInfoMap( + const std::unordered_map>& + core_ops_info_map, + const std::string& op_type, const std::string& name); + } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/op_function_generator.h b/paddle/fluid/pybind/op_function_generator.h index 9e86e3df8a6884ec1b75b8525ad858ff8f2e233c..d8750c1d6c115a6de8a493cac4ccadbd47bc10fd 100644 --- a/paddle/fluid/pybind/op_function_generator.h +++ b/paddle/fluid/pybind/op_function_generator.h @@ -88,6 +88,7 @@ std::map> op_ins_map = { {"nce", {"Input", "Label", "Weight", "Bias", "SampleWeight", "CustomDistProbs", "CustomDistAlias", "CustomDistAliasProbs"}}, + {"check_finite_and_unscale", {"X", "Scale", "FloatStatus"}}, }; // NOTE(zhiqiu): Like op_ins_map. diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 1c5b30fe087f3636a6a10579651d2c6a77a42343..ed42d0792eafbc8661883a7e8d5b396fac14686f 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -64,6 +64,9 @@ limitations under the License. */ #include "paddle/fluid/imperative/amp_auto_cast.h" #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/memory/allocation/cuda_ipc_allocator.h" +#endif #include "paddle/fluid/memory/allocation/mmap_allocator.h" #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/common_infer_shape_functions.h" @@ -111,6 +114,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/metrics_py.h" #include "paddle/fluid/pybind/ps_gpu_wrapper_py.h" #include "paddle/fluid/pybind/pybind_boost_headers.h" +#include "paddle/phi/backends/device_manager.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/pybind/nccl_wrapper_py.h" @@ -161,6 +165,9 @@ limitations under the License. 
*/ #include "paddle/fluid/pybind/fleet_py.h" #endif +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/phi/api/ext/op_meta_info.h" #include "pybind11/stl.h" DECLARE_bool(use_mkldnn); @@ -184,6 +191,7 @@ PyTypeObject *g_cudapinnedplace_pytype = nullptr; PyTypeObject *g_mluplace_pytype = nullptr; PyTypeObject *g_framework_tensor_pytype = nullptr; PyTypeObject *g_framework_lodtensorarray_pytype = nullptr; +PyTypeObject *g_custom_op_kernel_ctx_pytype = nullptr; bool IsCompiledWithCUDA() { #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) @@ -729,6 +737,18 @@ PYBIND11_MODULE(core_noavx, m) { lib[string]: the libarary, could be 'phi', 'fluid' and 'all'. )DOC"); + // NOTE(Aganlengzi): KernelFactory static instance is initialized BEFORE + // plugins are loaded for custom kernels, but de-initialized AFTER they are + // unloaded. We need manually clear symbols(may contain plugins' symbols) + // stored in this static instance to avoid illegal memory access. + m.def("clear_kernel_factory", + []() { phi::KernelFactory::Instance().kernels().clear(); }); + m.def("clear_device_manager", []() { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + phi::DeviceManager::Clear(); +#endif + }); + // NOTE(zjl): ctest would load environment variables at the beginning even // though we have not `import paddle.fluid as fluid`. So we add this API // to enable eager deletion mode in unittest. @@ -747,6 +767,57 @@ PYBIND11_MODULE(core_noavx, m) { m.def("_promote_types_if_complex_exists", &paddle::framework::PromoteTypesIfComplexExists); + py::class_ custom_op_kernel_ctx( + m, "CustomOpKernelContext", R"DOC()DOC"); + g_custom_op_kernel_ctx_pytype = + reinterpret_cast(custom_op_kernel_ctx.ptr()); + custom_op_kernel_ctx.def(py::init<>()) + .def("add_inputs", + [](paddle::CustomOpKernelContext &self, const py::handle &input) { + PyObject *obj = input.ptr(); + if (PyList_Check(obj) || PyTuple_Check(obj)) { + self.EmplaceBackInputs( + std::move(CastPyArg2VectorOfTensor(obj, 1))); + } else { + self.EmplaceBackInput(std::move(CastPyArg2Tensor(obj, 1))); + } + }) + .def("add_outputs", + [](paddle::CustomOpKernelContext &self, py::handle &outputs) { + PyObject *obj = outputs.ptr(); + if (PyList_Check(obj) || PyTuple_Check(obj)) { + self.EmplaceBackOutputs( + std::move(CastPyArg2VectorOfTensor(obj, 1))); + } else { + self.EmplaceBackOutput(std::move(CastPyArg2Tensor(obj, 1))); + } + }) + .def("add_attr", [](paddle::CustomOpKernelContext &self, + bool attr) { self.EmplaceBackAttr(attr); }) + .def("add_attr", [](paddle::CustomOpKernelContext &self, + int attr) { self.EmplaceBackAttr(attr); }) + .def("add_attr", [](paddle::CustomOpKernelContext &self, + float attr) { self.EmplaceBackAttr(attr); }) + .def("add_attr", [](paddle::CustomOpKernelContext &self, + int64_t attr) { self.EmplaceBackAttr(attr); }) + .def("add_attr", + [](paddle::CustomOpKernelContext &self, const std::string &attr) { + self.EmplaceBackAttr(attr); + }) + .def("add_attr", + [](paddle::CustomOpKernelContext &self, + const std::vector &attr) { self.EmplaceBackAttr(attr); }) + .def("add_attr", + [](paddle::CustomOpKernelContext &self, + const std::vector &attr) { self.EmplaceBackAttr(attr); }) + .def("add_attr", + [](paddle::CustomOpKernelContext &self, + const std::vector &attr) { self.EmplaceBackAttr(attr); }) + .def("add_attr", [](paddle::CustomOpKernelContext &self, + const std::vector &attr) { + self.EmplaceBackAttr(attr); + }); + py::class_ framework_tensor(m, "Tensor", py::buffer_protocol()); 
g_framework_tensor_pytype = @@ -1180,6 +1251,287 @@ PYBIND11_MODULE(core_noavx, m) { }); #else }) +#ifdef PADDLE_WITH_CUDA + .def("_share_buffer_with", + [](framework::Tensor &self, const framework::Tensor src, + py::tuple t) { + auto *cuda_ipc_allocation = + dynamic_cast( + src.Holder().get()); + + PADDLE_ENFORCE_NOT_NULL( + cuda_ipc_allocation, + platform::errors::PreconditionNotMet( + "Tensor is not Cuda IPC shared tensor. " + "Now only Tensor shared by cuda ipc could use this " + "api.")); + + size_t size = t[0].cast(); + auto dtype = + static_cast(t[1].cast()); + auto dims = phi::make_ddim(t[2].cast>()); + auto lod_info = t[3].cast(); + auto device_id = t[4].cast(); + + auto shared_reader_holder = + std::make_shared( + cuda_ipc_allocation->ptr(), + cuda_ipc_allocation->base_ptr(), size, + platform::CUDAPlace(device_id)); + + self.ResetHolderWithType(shared_reader_holder, dtype); + self.Resize(dims); + self.set_lod(lod_info); + + VLOG(6) << "Reconstructed tensor with buffer shared!"; + }, + R"DOC( + Deserialize GPU Tensor for existed shared Cuda IPC tensor. + + Params: + tensor: Shared Cuda IPC tensor. + tuple: contrains data size, data type, + tensor dims, lod information, device index. + + )DOC") + .def("_share_cuda", + [](framework::Tensor self) { + if (!self.IsInitialized() || self.numel() == 0) + throw std::runtime_error( + "Tensor not initialized or numel is 0. could not pass " + "to shared memory. "); + + auto *holder = dynamic_cast( + self.Holder().get()); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(holder->place()), true, + platform::errors::InvalidArgument( + "Tensor is not on GPU. share_cuda only support GPU " + "Tensor, share_filename is for CPU tensor.")); + + void *base_ptr = holder->base_ptr(); + ptrdiff_t offset_bytes = reinterpret_cast(holder->ptr()) - + reinterpret_cast(base_ptr); + + cudaIpcMemHandle_t handle; + PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcGetMemHandle(&handle, base_ptr)); + + auto _handle = py::bytes(reinterpret_cast(&handle), + (py::ssize_t)CUDA_IPC_HANDLE_SIZE); + + // TODO(ZHUI): use cuda event, to avoid sync. + const auto &device_id = paddle::platform::GetCurrentDeviceId(); + auto stream = + paddle::platform::stream::get_current_stream(device_id); + stream->Synchronize(); + + int type_idx = static_cast(self.type()); + size_t data_size = + self.numel() * + framework::SizeOfType( + framework::TransToProtoVarType(self.type())); + + return py::make_tuple(_handle, (py::size_t)offset_bytes, data_size, + type_idx, vectorize(self.dims()), self.lod(), + device_id); + }, + R"DOC( + Serialize GPU Tensor by cudaIpcMemHandle. + + Returns: + tuple: contrains handle, data size, data type, + tensor dims, lod information, device index. + + Examples: + .. code-block:: python + + import paddle + tensor = paddle.ones([3,3]) + metainfo = tensor.value().get_tensor()._share_cuda() + + )DOC") + .def("_new_shared_cuda", + [](py::tuple t) { + if (t.size() != 7) + throw std::runtime_error( + "Invalid Tensor meta info for shared cuda tensor!"); + + // 1. Create a new C++ instance + framework::Tensor tensor; + + // 2. Rebuild Allocation from handle + const std::string &handle = t[0].cast(); + ptrdiff_t offset_bytes = (ptrdiff_t)t[1].cast(); + auto device_id = t[6].cast(); + auto base_ptr = memory::allocation::GetIpcBasePtr(handle); + size_t size = t[2].cast(); + void *dev = base_ptr.get(); + dev = reinterpret_cast(dev) + offset_bytes; + + auto shared_reader_holder = + std::make_shared( + dev, size, device_id, std::move(base_ptr)); + + // 3. 
Rebuild Tensor + tensor.ResetHolderWithType( + shared_reader_holder, + static_cast(t[3].cast())); + tensor.Resize(phi::make_ddim(t[4].cast>())); + tensor.set_lod(t[5].cast()); + + return tensor; + }, + R"DOC( + Deserialize GPU lod tensor from cudaIpcMemHandle. + + Params: + tuple: contrains handle, data size, data type, + tensor dims, lod information, device index. + + Examples: + .. code-block:: python + + import paddle + tensor = paddle.ones([3,3]) + metainfo = tensor.value().get_tensor()._share_cuda() + tensor_from_shared = paddle.to_tensor(paddle.fluid.core.LoDTensor._new_shared_cuda(metainfo)) + + )DOC") +#endif + .def("_share_filename", + [](framework::Tensor &self) { + if (!self.IsInitialized() || self.numel() == 0) + throw std::runtime_error( + "Tensor not initialized or numel is 0. could not pass to " + "shared memory. "); + + auto holder = self.Holder(); + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(holder->place()) || + platform::is_cuda_pinned_place(holder->place()), + true, platform::errors::InvalidArgument( + "Tensor is not on CPU. share_filename only " + "support CPU Tensor.")); + + auto *mmap_allocation = dynamic_cast< + memory::allocation::RefcountedMemoryMapAllocation *>( + holder.get()); + // If the tensor is not shared, allocate memory map allocation. + if (mmap_allocation == nullptr) { + void *data_ptr = self.data(); + size_t data_size = + self.numel() * + framework::SizeOfType( + framework::TransToProtoVarType(self.type())); + + int flags = memory::allocation::MAPPED_SHAREDMEM | + memory::allocation::MAPPED_EXCLUSIVE; + std::string handle = memory::allocation::GetIPCName(); + auto shared_holder = + memory::allocation::AllocateRefcountedMemoryMapAllocation( + handle, flags, data_size); + + // copy data & reset holder + if (platform::is_cuda_pinned_place(holder->place())) { +#ifdef PADDLE_WITH_CUDA + memory::Copy(platform::CPUPlace(), shared_holder->ptr(), + platform::CUDAPinnedPlace(), data_ptr, data_size); +#endif + } else { + memory::Copy(platform::CPUPlace(), shared_holder->ptr(), + platform::CPUPlace(), data_ptr, data_size); + } + self.ResetHolder(shared_holder); + mmap_allocation = shared_holder.get(); + } + int type_idx = static_cast(self.type()); + + return py::make_tuple(mmap_allocation->ipc_name(), + mmap_allocation->size(), type_idx, + vectorize(self.dims()), self.lod()); + }, + R"DOC( + Serialize CPU lod tensor in shared memory to tuple. + If the tensor is not in shared memory, we will copy it first. + + Returns: + tuple: contrains ipc name, data size, data type, + tensor dims and lod imformation. + + Examples: + .. code-block:: python + + import paddle + tensor = paddle.ones([3,3]) + metainfo = tensor.value().get_tensor()._share_filename() + + )DOC") + .def("_new_shared_filename", + [](py::tuple t) { // __setstate__ + if (t.size() != 5) + throw std::runtime_error("Invalid Tensor meta info state!"); + + framework::Tensor tensor; + + // 2. Rebuild Allocation + const std::string &ipc_name = t[0].cast(); + size_t size = t[1].cast(); + int flags = memory::allocation::MAPPED_SHAREDMEM | + memory::allocation::MAPPED_NOCREATE; + + auto shared_holder = + memory::allocation::AllocateRefcountedMemoryMapAllocation( + ipc_name, flags, size); + + // 3. Rebuild Tensor + tensor.ResetHolderWithType( + shared_holder, + static_cast(t[2].cast())); + tensor.Resize(phi::make_ddim(t[3].cast>())); + tensor.set_lod(t[4].cast()); + + return tensor; + }, + R"DOC( + Deserialize CPU lod tensor from shared memory. 
+ + Params: + tuple: contrains ipc file name, data size, data type, + tensor dims and lod information. + + Examples: + .. code-block:: python + + import paddle + tensor = paddle.ones([3,3]) + metainfo = tensor.value().get_tensor()._share_filename() + tensor_from_shared = paddle.to_tensor(paddle.fluid.core.LoDTensor._new_shared_filename(metainfo)) + + )DOC") + .def("_shared_incref", + [](framework::Tensor &self) { + auto *mmap_allocation = dynamic_cast< + memory::allocation::RefcountedMemoryMapAllocation *>( + self.Holder().get()); + if (mmap_allocation) { + mmap_allocation->incref(); + } + }, + R"DOC( + Increase reference count of share_filename tensor. + )DOC") + .def("_shared_decref", + [](framework::Tensor &self) { + auto *mmap_allocation = dynamic_cast< + memory::allocation::RefcountedMemoryMapAllocation *>( + self.Holder().get()); + if (mmap_allocation) { + mmap_allocation->decref(); + } + }, + R"DOC( + Decrease reference count of share_filename tensor. + )DOC") .def(py::pickle( [](const framework::Tensor &t) { // __getstate__ auto holder = t.Holder(); @@ -2536,10 +2888,11 @@ All parameter, weight, gradient are variables in Paddle. m.def("init_gflags", framework::InitGflags); m.def("init_glog", framework::InitGLOG); - m.def("load_op_meta_info_and_register_op", - framework::LoadOpMetaInfoAndRegisterOp); + m.def("load_op_meta_info_and_register_op", [](const std::string dso_name) { + egr::Controller::Instance().MergeOpMetaInfoMap( + framework::LoadOpMetaInfoAndRegisterOp(dso_name)); + }); m.def("init_devices", []() { framework::InitDevices(); }); - m.def("is_compiled_with_cuda", IsCompiledWithCUDA); m.def("is_compiled_with_ascend", IsCompiledWithAscend); m.def("is_compiled_with_rocm", IsCompiledWithROCM); diff --git a/paddle/fluid/pybind/slice_utils.h b/paddle/fluid/pybind/slice_utils.h index a037fa13eb53b94fd8d82413dad55d7f34b0006d..add332abd30eaaad1772a0b8e326ea0ae6c27e8b 100644 --- a/paddle/fluid/pybind/slice_utils.h +++ b/paddle/fluid/pybind/slice_utils.h @@ -188,16 +188,14 @@ static void ParseIndexingSlice( int start = static_cast(PyLong_AsLong(slice_item)); auto s_t = start; start = start < 0 ? 
start + dim_len : start; - if (start >= dim_len || start < 0) { - std::string str_error_message = - "The starting index " + std::to_string(s_t) + - " of slice is out of bounds in tensor " + std::to_string(dim) + - "-th axis, it shound be in the range of [" + - std::to_string(-dim_len) + ", " + std::to_string(dim_len) + ")"; - // py::index_error is corresponding to IndexError in Python - // Used to indicate out of bounds access in __getitem__, __setitem__ - throw py::index_error(str_error_message); - } + + PADDLE_ENFORCE( + 0 <= start && start < dim_len, + platform::errors::OutOfRange("The starting index %d of slice is out " + "of bounds in tensor %d-th axis, it " + "shound be in the range of [%d, %d).", + s_t, dim, -dim_len, dim_len)); + slice_axes->push_back(dim); slice_starts->push_back(start); slice_ends->push_back(start + 1); diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index c593c7df3e0ec708beecfd6c5051637d65a7f79d..6849fcb039410f95d829b9bb793a856f1485bd6c 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -585,14 +585,20 @@ inline void _getSliceinfo(const framework::Tensor &self, py::object obj, auto &step = *pstep; auto &slicelength = *pslicelength; const framework::DDim &srcDDim = self.dims(); - if (dim < 0 || dim >= srcDDim.size()) { - throw py::index_error(); - } + PADDLE_ENFORCE( + 0 <= dim && dim < srcDDim.size(), + platform::errors::OutOfRange("The dim %d of slice is out of bounds, it " + "shound be in the range of [0, %d).", + dim, srcDDim.size())); + if (py::isinstance(obj)) { size_t lstart, lstop, lstep, lslicelength; py::slice s = static_cast(obj); if (!s.compute(srcDDim[dim], &lstart, &lstop, &lstep, &lslicelength)) { - throw py::index_error(); + PADDLE_THROW(platform::errors::OutOfRange( + "Slice on dim: %d is error, please check the validity of tensor " + "dims or slice item.", + dim)); } start = static_cast(lstart); stop = static_cast(lstop); @@ -600,15 +606,19 @@ inline void _getSliceinfo(const framework::Tensor &self, py::object obj, slicelength = static_cast(lslicelength); } else if (py::isinstance(obj)) { start = static_cast(static_cast(obj)); - if (std::abs(start) >= srcDDim[dim]) { - throw py::index_error(); - } + PADDLE_ENFORCE( + std::abs(start) < srcDDim[dim], + platform::errors::OutOfRange("The start %d of slice is out of bounds, " + "it shound be in the range of (%d, %d).", + start, -srcDDim[dim], srcDDim[dim])); start = (start >= 0) ? start : srcDDim[dim] - start; stop = start + 1; step = 1; slicelength = 1; } else { - throw py::index_error(); + PADDLE_THROW( + platform::errors::OutOfRange("Index object error, the index object for " + "slice only supports slice(::) and int.")); } } diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt index ed29b5b44c7791d356ec1283a0027cacf1fd5e7a..e777a8e3ab4e6a59662ce7b4eb9a31a7409d6f56 100644 --- a/paddle/infrt/CMakeLists.txt +++ b/paddle/infrt/CMakeLists.txt @@ -3,12 +3,22 @@ if (NOT WITH_INFRT) endif() option(INFRT_WITH_PHI "Compile INFRT with PHI" ON) +option(INFRT_WITH_GPU "Compile INFRT with GPU" OFF) +option(INFRT_WITH_TRT "Compile INFRT with TensorRT" OFF) #TODO(xiaowei) remove fluid include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform) if (INFRT_WITH_PHI) - add_definitions("-DINFRT_WITH_PHI") + add_definitions("-DINFRT_WITH_PHI") + + # TODO(wilber): Now Infrt gpu/trt depends on phi's components, Modify compile dependency options later. 
+ if (INFRT_WITH_GPU) + add_definitions("-DINFRT_WITH_GPU") + if (INFRT_WITH_TRT) + add_definitions("-DINFRT_WITH_TRT") + endif() + endif() endif() # compile flags @@ -90,10 +100,8 @@ add_subdirectory(tests) set(infrt_mlir_incs basic_kernels_inc test_kernels_inc - infrt_base_inc tensor_shape_inc dense_tensor_inc - pd_ops_inc pd_extra_ops_inc trt_ops_inc ) @@ -107,6 +115,9 @@ if (INFRT_WITH_PHI) endif() cc_library(infrt SHARED SRCS ${infrt_src} DEPS glog boost ${mlir_libs} ${phi_libs} paddle_framework_proto infrt_naive) +if (INFRT_WITH_TRT) + target_link_libraries(infrt infrt_trt) +endif() cc_library(infrt_static SRCS ${infrt_src} DEPS glog boost ${mlir_libs} ${phi_libs} paddle_framework_proto) add_dependencies(infrt ${infrt_mlir_incs} mlir-headers) diff --git a/paddle/infrt/api/infrt_api.cc b/paddle/infrt/api/infrt_api.cc index e0488117783d5657aa97c301d9d12ce1c77017e7..0500a8123044cd05695c5167b1afaa48a6027b57 100644 --- a/paddle/infrt/api/infrt_api.cc +++ b/paddle/infrt/api/infrt_api.cc @@ -24,7 +24,7 @@ #include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/dense_tensor.h" -#include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/mlir_loader.h" #include "paddle/infrt/host_context/core_runtime.h" #include "paddle/infrt/host_context/kernel_registry.h" @@ -144,7 +144,7 @@ class PredictExecutor : public MlirToRuntimeTranslator { // process results auto& last_op = predict_func.front().back(); - if (last_op.getName().getStringRef() == "Infrt.return") { + if (last_op.getName().getStringRef() == "infrt.return") { for (size_t i = 0; i < last_op.getNumOperands(); ++i) { auto* value = AddValue(mlir::Value(last_op.getOperand(i))); results_.push_back(ValueRef(value)); diff --git a/paddle/infrt/backends/host/phi_allocator.h b/paddle/infrt/backends/host/phi_allocator.h index c8f97e04a1b8376efbac749fffa70d77c7b95e72..6e3bef9299162d493825f49e3962c75f2845e2d0 100644 --- a/paddle/infrt/backends/host/phi_allocator.h +++ b/paddle/infrt/backends/host/phi_allocator.h @@ -13,6 +13,10 @@ limitations under the License. */ #include "paddle/phi/core/allocator.h" +#ifdef INFRT_WITH_GPU +#include +#endif + namespace infrt { namespace backends { @@ -29,5 +33,22 @@ class CpuPhiAllocator : public phi::Allocator { } }; +#ifdef INFRT_WITH_GPU +// TODO(wilber): Just for demo test. we need a more efficient gpu allocator. +class GpuPhiAllocator : public phi::Allocator { + public: + static void deleter(phi::Allocation* ptr) { cudaFree(ptr->ptr()); } + + AllocationPtr Allocate(size_t bytes_size) { + void* ptr; + cudaMalloc(&ptr, bytes_size); + return AllocationPtr( + new phi::Allocation( + ptr, bytes_size, phi::Place(phi::AllocationType::GPU)), + deleter); + } +}; +#endif + } // namespace backends } // namespace infrt diff --git a/paddle/infrt/backends/host/phi_context.h b/paddle/infrt/backends/host/phi_context.h index 5713fdbbaf82b2ea2190d2ee1b1dc5d944f2c262..bcd63dbb39fe8c52499138423bc9b86fa5de9d57 100644 --- a/paddle/infrt/backends/host/phi_context.h +++ b/paddle/infrt/backends/host/phi_context.h @@ -13,6 +13,7 @@ limitations under the License. 
*/ #include "paddle/infrt/backends/host/phi_allocator.h" #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" namespace infrt { namespace backends { @@ -31,5 +32,16 @@ class CpuPhiContext : public phi::CPUContext { std::unique_ptr alloc_{std::make_unique()}; }; +class GpuPhiContext : public phi::GPUContext { + public: + using Base = phi::GPUContext; + using phi::GPUContext::SetStream; + using phi::GPUContext::SetEigenDevice; + using phi::GPUContext::SetBlasHandle; + using phi::GPUContext::SetDnnHandle; + using phi::GPUContext::SetSolverHandle; + using phi::GPUContext::SetSparseHandle; +}; + } // namespace backends } // namespace infrt diff --git a/paddle/infrt/backends/tensorrt/test_trt_engine.cc b/paddle/infrt/backends/tensorrt/test_trt_engine.cc index 12cf14060e27c1d58e3fd9b14cc12b3c1f7f8907..89dd3b0dc7abf48102b48f16fb974b3c902fe049 100644 --- a/paddle/infrt/backends/tensorrt/test_trt_engine.cc +++ b/paddle/infrt/backends/tensorrt/test_trt_engine.cc @@ -37,9 +37,9 @@ namespace infrt { namespace backends { namespace tensorrt { -const char* model_input = "model_input"; -const char* model_output = "model_output1"; -const char* model_output2 = "model_output2"; +const char* model_input = "input_0"; +const char* model_output = "output_0"; +const char* model_output2 = "output_1"; TrtUniquePtr ConstructNetwork( nvinfer1::IBuilder* builder, nvinfer1::Dims dims, bool is_static_shape) { @@ -82,9 +82,176 @@ TrtUniquePtr ConstructNetwork( return network; } +TrtUniquePtr ConstructFCNetwork( + nvinfer1::IBuilder* builder, nvinfer1::Dims dims, bool is_static_shape) { + TrtUniquePtr network; + if (is_static_shape) { + network.reset(builder->createNetworkV2(0U)); + } else { + auto networkFlags = + 1U << static_cast( + nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); + network.reset(builder->createNetworkV2(networkFlags)); + } + + ITensor* data = + network->addInput(model_input, nvinfer1::DataType::kFLOAT, dims); + CHECK_NOTNULL(data); + nvinfer1::Weights kernel_weights; + kernel_weights.type = nvinfer1::DataType::kFLOAT; + kernel_weights.count = 7840; + std::vector weight_data(kernel_weights.count); + for (size_t i = 0; i < weight_data.size(); ++i) { + weight_data[i] = i % 255 * 0.02f; + } + kernel_weights.values = weight_data.data(); + auto* layer = network->addFullyConnected( + *data, 10, kernel_weights, nvinfer1::Weights{}); + CHECK_NOTNULL(layer); + auto* out = layer->getOutput(0); + out->setName(model_output); + network->markOutput(*out); + return network; +} + +TrtUniquePtr ConstructConvNetwork( + nvinfer1::IBuilder* builder, nvinfer1::Dims dims, bool is_static_shape) { + TrtUniquePtr network; + if (is_static_shape) { + network.reset(builder->createNetworkV2(0U)); + } else { + auto networkFlags = + 1U << static_cast( + nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); + network.reset(builder->createNetworkV2(networkFlags)); + } + + ITensor* data = + network->addInput(model_input, nvinfer1::DataType::kFLOAT, dims); + CHECK_NOTNULL(data); + nvinfer1::Weights kernel_weights, bias_weights; + kernel_weights.type = nvinfer1::DataType::kFLOAT; + bias_weights.type = nvinfer1::DataType::kFLOAT; + kernel_weights.count = 81; + bias_weights.count = 3; + std::vector weight_data(kernel_weights.count); + for (size_t i = 0; i < weight_data.size(); ++i) { + weight_data[i] = i * 0.02f; + } + std::vector bias_data(bias_weights.count); + for (size_t i = 0; i < bias_data.size(); ++i) { + bias_data[i] = i * 0.5f; + } + kernel_weights.values = 
weight_data.data(); + bias_weights.values = bias_data.data(); + nvinfer1::Dims ksize; + ksize.nbDims = 2; + ksize.d[0] = 3; + ksize.d[1] = 3; + auto* layer = + network->addConvolutionNd(*data, 3, ksize, kernel_weights, bias_weights); + CHECK_NOTNULL(layer); + auto* out = layer->getOutput(0); + out->setName(model_output); + network->markOutput(*out); + return network; +} + // sigmoid(x) = 1 / (1 + exp(-x)) inline float sigmoid(float x) { return 1.f / (1.f + exp(-1 * x)); } +TEST(trt, run_fc_static) { + TrtEngine engine(0); + auto net = ConstructFCNetwork( + engine.GetTrtBuilder(), nvinfer1::Dims3{1, 28, 28}, true); + BuildOptions build_options; + build_options.max_batch = 4; + build_options.workspace = 1024; + engine.Build(std::move(net), build_options); + + InferenceOptions inference_options; + inference_options.batch = 1; + + phi::GPUPlace place; + phi::GPUContext context; + context.PartialInitWithoutAllocator(); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, context.stream()) + .get()); + context.PartialInitWithAllocator(); + + phi::DenseTensorMeta meta( + phi::DataType::FLOAT32, + phi::make_ddim({inference_options.batch, 1, 28, 28})); + phi::DenseTensor input; + input.set_meta(meta); + context.Alloc(&input, input.numel() * sizeof(float)); + std::vector host_data(inference_options.batch * 1 * 28 * 28, 0); + for (size_t i = 0; i < host_data.size(); ++i) { + host_data[i] = i % 100 * 0.016f; + } + paddle::memory::Copy(place, + input.data(), + phi::CPUPlace(), + host_data.data(), + sizeof(float) * host_data.size(), + context.stream()); + + std::unordered_map inputs; + inputs.emplace(std::make_pair(model_input, &input)); + engine.PrepareOutputHandle("output_0"); + engine.SetUpInference(inference_options, inputs); + engine.GetEngineInfo(); + engine.Run(context); + cudaStreamSynchronize(context.stream()); +} + +TEST(trt, run_conv_static) { + TrtEngine engine(0); + auto net = ConstructConvNetwork( + engine.GetTrtBuilder(), nvinfer1::Dims3{3, 28, 28}, true); + BuildOptions build_options; + build_options.max_batch = 4; + build_options.workspace = 1024; + engine.Build(std::move(net), build_options); + + InferenceOptions inference_options; + inference_options.batch = 1; + + phi::GPUPlace place; + phi::GPUContext context; + context.PartialInitWithoutAllocator(); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, context.stream()) + .get()); + context.PartialInitWithAllocator(); + + phi::DenseTensorMeta meta( + phi::DataType::FLOAT32, + phi::make_ddim({inference_options.batch, 3, 28, 28})); + phi::DenseTensor input; + input.set_meta(meta); + context.Alloc(&input, input.numel() * sizeof(float)); + std::vector host_data(inference_options.batch * 3 * 28 * 28, 0); + for (size_t i = 0; i < host_data.size(); ++i) { + host_data[i] = i % 100 * 0.016f; + } + paddle::memory::Copy(place, + input.data(), + phi::CPUPlace(), + host_data.data(), + sizeof(float) * host_data.size(), + context.stream()); + + std::unordered_map inputs; + inputs.emplace(std::make_pair(model_input, &input)); + engine.PrepareOutputHandle("output_0"); + engine.SetUpInference(inference_options, inputs); + engine.GetEngineInfo(); + engine.Run(context); + cudaStreamSynchronize(context.stream()); +} + TEST(trt, run_static) { TrtEngine static_trt_engine(0); auto net = ConstructNetwork( @@ -122,27 +289,26 @@ TEST(trt, run_static) { std::unordered_map inputs; inputs.emplace(std::make_pair(model_input, &input)); - phi::DenseTensor output, 
output2; - std::unordered_map outputs; - outputs.emplace(std::make_pair(model_output, &output)); - outputs.emplace(std::make_pair(model_output2, &output2)); - - static_trt_engine.SetUpInference(inference_options, inputs, &outputs); + static_trt_engine.PrepareOutputHandle("output_0"); + static_trt_engine.PrepareOutputHandle("output_1"); + static_trt_engine.SetUpInference(inference_options, inputs); static_trt_engine.GetEngineInfo(); static_trt_engine.Run(context); + phi::DenseTensor* output0 = static_trt_engine.GetOutput("output_0"); + phi::DenseTensor* output1 = static_trt_engine.GetOutput("output_1"); std::vector output_data1(inference_options.batch * 1 * 28 * 28, 0); std::vector output_data2(inference_options.batch * 2 * 28 * 28, 0); paddle::memory::Copy(phi::CPUPlace(), output_data1.data(), place, - output.data(), + output0->data(), sizeof(float) * output_data1.size(), context.stream()); paddle::memory::Copy(phi::CPUPlace(), output_data2.data(), place, - output2.data(), + output1->data(), sizeof(float) * output_data2.size(), context.stream()); cudaStreamSynchronize(context.stream()); @@ -208,27 +374,27 @@ TEST(trt, run_dynamic) { context.stream()); std::unordered_map inputs; - std::unordered_map outputs; inputs.emplace(std::make_pair(model_input, &input)); - outputs.emplace(std::make_pair(model_output, &output)); - outputs.emplace(std::make_pair(model_output2, &output2)); - - engine.SetUpInference(inference_options, inputs, &outputs); + engine.PrepareOutputHandle("output_0"); + engine.PrepareOutputHandle("output_1"); + engine.SetUpInference(inference_options, inputs); engine.GetEngineInfo(); engine.Run(context); + phi::DenseTensor* output0 = engine.GetOutput("output_0"); + phi::DenseTensor* output1 = engine.GetOutput("output_1"); std::vector output_data1(inference_options.batch * 1 * 16 * 16, 0); std::vector output_data2(inference_options.batch * 2 * 16 * 16, 0); paddle::memory::Copy(phi::CPUPlace(), output_data1.data(), place, - output.data(), + output0->data(), sizeof(float) * output_data1.size(), context.stream()); paddle::memory::Copy(phi::CPUPlace(), output_data2.data(), place, - output2.data(), + output1->data(), sizeof(float) * output_data2.size(), context.stream()); cudaStreamSynchronize(context.stream()); diff --git a/paddle/infrt/backends/tensorrt/trt_engine.cc b/paddle/infrt/backends/tensorrt/trt_engine.cc index 232653e8c41f71fd9bb32c9eac302b047d122b66..43d356b6d6983afdca220029d34d9d5cd27da009 100644 --- a/paddle/infrt/backends/tensorrt/trt_engine.cc +++ b/paddle/infrt/backends/tensorrt/trt_engine.cc @@ -21,6 +21,7 @@ #include "paddle/phi/backends/dynload/tensorrt.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/dense_tensor.h" namespace infrt { namespace backends { @@ -235,10 +236,20 @@ bool TrtEngine::SetupNetworkAndConfig(const BuildOptions& build, return true; } +void TrtEngine::PrepareOutputHandle(const std::string& out_name) { + phi::DenseTensor t; + outputs_.emplace(out_name, t); +} + +phi::DenseTensor* TrtEngine::GetOutput(const std::string& name) { + return &outputs_[name]; +} + +size_t TrtEngine::GetOutputNum() const { return outputs_.size(); } + bool TrtEngine::SetUpInference( const InferenceOptions& inference, - const std::unordered_map& inputs, - std::unordered_map* outputs) { + const std::unordered_map& inputs) { // TODO(wilber): now only create one exec_context FreshDeviceId(); CHECK(engine_ != nullptr); @@ -252,10 +263,10 @@ bool TrtEngine::SetUpInference( bindings_.front()->AddBinding( bind_index, 
it.first, true, it.second, nvinfer1::DataType::kFLOAT); } - for (auto& it : *outputs) { + for (auto& it : outputs_) { const int bind_index = engine_->getBindingIndex(it.first.c_str()); bindings_.front()->AddBinding( - bind_index, it.first, false, it.second, nvinfer1::DataType::kFLOAT); + bind_index, it.first, false, &it.second, nvinfer1::DataType::kFLOAT); } return true; @@ -290,11 +301,13 @@ void TrtEngine::StaticRun(const phi::GPUContext& ctx) { const int bind_index = engine_->getBindingIndex(bind.name.c_str()); std::vector ddim; auto dims = engine_->getBindingDimensions(bind_index); + CHECK_NE(runtime_batch, -1) << "runtime_batch should not be -1."; ddim.push_back(runtime_batch); for (int i = 0; i < dims.nbDims; ++i) { ddim.push_back(dims.d[i]); } bind.buffer->Resize(phi::make_ddim(ddim)); + // TODO(wilber): now only support float output. ctx.Alloc(bind.buffer, sizeof(float) * bind.buffer->numel()); buffers[bind_index] = static_cast(bind.buffer->data()); } diff --git a/paddle/infrt/backends/tensorrt/trt_engine.h b/paddle/infrt/backends/tensorrt/trt_engine.h index 3c8243e3c3838e30eb70877f8c82d623c103eaff..a26474f8cbb357d42cd6d951829bbdc24a256640 100644 --- a/paddle/infrt/backends/tensorrt/trt_engine.h +++ b/paddle/infrt/backends/tensorrt/trt_engine.h @@ -81,11 +81,17 @@ class TrtEngine { // TODO(wilber): How to support multiple execution contexts? bool SetUpInference( const InferenceOptions& inference, - const std::unordered_map& inputs, - std::unordered_map* outputs); + const std::unordered_map& inputs); void GetEngineInfo(); + void PrepareOutputHandle(const std::string& out_name); + + // TODO(wilber): The output tensor names are: output_0, output_1, ... + phi::DenseTensor* GetOutput(const std::string&); + + size_t GetOutputNum() const; + private: void FreshDeviceId(); @@ -112,6 +118,7 @@ class TrtEngine { std::vector> bindings_; int device_id_{0}; bool is_dynamic_shape_{false}; + std::unordered_map outputs_; }; } // namespace tensorrt diff --git a/paddle/infrt/dialect/CMakeLists.txt b/paddle/infrt/dialect/CMakeLists.txt index e35989da2085b21f4dbfaadea05793fc9dcb8753..cf3906c32e559d9fa33d0583be9adb1b2591e78b 100644 --- a/paddle/infrt/dialect/CMakeLists.txt +++ b/paddle/infrt/dialect/CMakeLists.txt @@ -2,27 +2,15 @@ core_gather_headers() gather_srcs(infrt_src SRCS dialect.cc - basic_kernels.cc - test_kernels.cc - infrt_base.cc - init_infrt_dialects.cc + init_dialects.cc tensor_shape.cc dense_tensor.cc mlir_loader.cc diagnostic_utils.cc - pd_types.cc - pd_ops.cc ) -mlir_tablegen_on(basic_kernels) -mlir_tablegen_on(test_kernels) -mlir_tablegen_on(infrt_base DIALECT Infrt) mlir_tablegen_on(tensor_shape DIALECT ts) mlir_tablegen_on(dense_tensor DIALECT dt) -mlir_tablegen_on(pd_op_base DIALECT pd) -mlir_tablegen_on(pd_ops) -mlir_tablegen_on(pd_extra_ops) -mlir_add_rewriter(rewrite) # TODO(Superjomn) add a cmake function cc_executable to ecapsulate the following code add_executable(infrtopt opt.cc) @@ -30,10 +18,10 @@ target_link_libraries(infrtopt infrt) add_executable(print-ir print_ir.cc) target_link_libraries(print-ir infrt ${mlir_libs}) -add_dependencies(print-ir pd_ops_inc) cc_test_tiny(test_infrt_mlir_loader SRCS mlir_loader_test.cc DEPS infrt ${MLIR_IR_LIBS}) add_subdirectory(infrt) +add_subdirectory(pd) add_subdirectory(tensorrt) if (INFRT_WITH_PHI) diff --git a/paddle/infrt/dialect/dense_tensor.h b/paddle/infrt/dialect/dense_tensor.h index 27febffe8156379c63a0b6b3fb048f7441255f0e..7fbd1e8a4efe1e9dc1d022beb7673ee8a59c7e36 100644 --- a/paddle/infrt/dialect/dense_tensor.h +++ 
b/paddle/infrt/dialect/dense_tensor.h @@ -19,7 +19,7 @@ #include -#include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/dense_tensor_dialect.hpp.inc" diff --git a/paddle/infrt/dialect/dense_tensor.td b/paddle/infrt/dialect/dense_tensor.td index f5db90648eec9933eadf897a8090260bdbfe575b..59df4e9697370e9d8db4bbc0a5d69e8ef03950a5 100644 --- a/paddle/infrt/dialect/dense_tensor.td +++ b/paddle/infrt/dialect/dense_tensor.td @@ -2,7 +2,7 @@ #else #define DT_OPS -include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" include "paddle/infrt/dialect/tensor_shape_base.td" include "mlir/Interfaces/SideEffectInterfaces.td" @@ -130,7 +130,7 @@ def TensorMapGetTensorOp : DT_Op<"tensor_map_get_tensor", [NoSideEffect]> { } def TensorMapGetSizeOp : DT_Op<"tensor_map_get_size", [NoSideEffect]> { - let summary = "ddt.tensor_map_get_size operation"; + let summary = "dt.tensor_map_get_size operation"; let description = [{ An operation that get the size of a TensorMap. @@ -141,6 +141,32 @@ def TensorMapGetSizeOp : DT_Op<"tensor_map_get_size", [NoSideEffect]> { let assemblyFormat = "`(` $map `)` attr-dict `->` type($size)"; } +def Infrt_TensorListGetTensorOp : DT_Op<"tensor_list_get_tensor", [NoSideEffect]> { + let summary = "dt.tensor_list_get_tensor operation"; + + let description = [{ + An operation that can get a tensor from a TensorList. + }]; + + let arguments = (ins + DenseTensorList:$l, + I32Attr:$id + ); + let results = (outs DenseTensor:$output); + let verifier = ?; +} + +def TensorListGetSizeOp : DT_Op<"tensor_list_get_size", [NoSideEffect]> { + let summary = "dt.tensor_list_get_size operation"; + + let description = [{ + An operation that get the size of a TensorList. 
+ }]; + + let arguments = (ins DenseTensorList:$map); + let results = (outs I32:$size); +} + def GetTensorShapeOp : DT_Op<"get_tensor_shape", [NoSideEffect]> { let summary = "dt.get_tensor_shape operation"; diff --git a/paddle/infrt/dialect/infrt/CMakeLists.txt b/paddle/infrt/dialect/infrt/CMakeLists.txt index 08ce2d4707bfdc8498610793437675ae8238475e..5f65336453fbdf82f30948aeea8dc52b0367159b 100644 --- a/paddle/infrt/dialect/infrt/CMakeLists.txt +++ b/paddle/infrt/dialect/infrt/CMakeLists.txt @@ -1,17 +1,3 @@ -core_gather_headers() - -gather_srcs(infrt_src SRCS - common_type.cc - infrt_dialect.cc - ) - - -add_mlir_dialect(infrt_ops infrt) - -set(LLVM_TARGET_DEFINITIONS infrt_ops.td) -mlir_tablegen(infrt_opsAttributes.h.inc -gen-attrdef-decls -dialect=infrt) -mlir_tablegen(infrt_opsAttributes.cpp.inc -gen-attrdef-defs -dialect=infrt) -add_public_tablegen_target(MLIRinfrt_opsAttributesIncGen) -add_dependencies(mlir-headers MLIRinfrt_opsAttributesIncGen) - +add_subdirectory(common) +add_subdirectory(ir) add_subdirectory(pass) diff --git a/paddle/infrt/dialect/infrt/common/CMakeLists.txt b/paddle/infrt/dialect/infrt/common/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..f693c82b5060ef35eecbc1ef9ad5053d6b93e4ad --- /dev/null +++ b/paddle/infrt/dialect/infrt/common/CMakeLists.txt @@ -0,0 +1,6 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + types.cc + utils.cc + ) diff --git a/paddle/infrt/dialect/infrt/common_type.cc b/paddle/infrt/dialect/infrt/common/types.cc similarity index 87% rename from paddle/infrt/dialect/infrt/common_type.cc rename to paddle/infrt/dialect/infrt/common/types.cc index 00684c505268c09e97d262a3526c946d1bc3095c..c10679b01342f03b35e816bf290f71790f541ee2 100644 --- a/paddle/infrt/dialect/infrt/common_type.cc +++ b/paddle/infrt/dialect/infrt/common/types.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
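The two TensorList ops added to dense_tensor.td above carry no custom assembly format yet, so they round-trip in MLIR's generic op form. A hedged sketch of what that looks like; the !infrt.tensor_list spelling follows the type printer added later in this patch, and the dense-tensor parameters are chosen only for illustration:

    %tensor = "dt.tensor_list_get_tensor"(%list) {id = 0 : i32}
        : (!infrt.tensor_list) -> !infrt.dense_tensor<CPU, FP32, NCHW>
    %size = "dt.tensor_list_get_size"(%list) : (!infrt.tensor_list) -> i32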
-#include "paddle/infrt/dialect/infrt/common_type.h" +#include "paddle/infrt/dialect/infrt/common/types.h" namespace infrt { @@ -30,6 +30,8 @@ llvm::Optional GetLayoutType(llvm::StringRef key) { return LayoutType::NCHW; else if (key.equals_insensitive("NHWC")) return LayoutType::NHWC; + else if (key.equals_insensitive("ANY")) + return LayoutType::ANY; else return llvm::None; } @@ -39,6 +41,8 @@ llvm::Optional GetPrecisionType(llvm::StringRef key) { return PrecisionType::FLOAT32; else if (key.equals_insensitive("FP16")) return PrecisionType::FLOAT16; + else if (key.equals_insensitive("UNK")) + return PrecisionType::UNK; else return llvm::None; } @@ -67,6 +71,9 @@ llvm::StringRef GetString(LayoutType type) { case (LayoutType::NHWC): str = "NHWC"; break; + case (LayoutType::ANY): + str = "ANY"; + break; default: str = "Unsupported"; } @@ -82,6 +89,9 @@ llvm::StringRef GetString(PrecisionType type) { case (PrecisionType::FLOAT16): str = "FP16"; break; + case (PrecisionType::UNK): + str = "UNK"; + break; default: str = "Unsupported"; } diff --git a/paddle/infrt/dialect/infrt/common_type.h b/paddle/infrt/dialect/infrt/common/types.h similarity index 100% rename from paddle/infrt/dialect/infrt/common_type.h rename to paddle/infrt/dialect/infrt/common/types.h diff --git a/paddle/infrt/dialect/infrt/common/utils.cc b/paddle/infrt/dialect/infrt/common/utils.cc new file mode 100644 index 0000000000000000000000000000000000000000..0ffb23c490f8f52044d35d20508f42f3f9a89413 --- /dev/null +++ b/paddle/infrt/dialect/infrt/common/utils.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/infrt/common/utils.h" + +mlir::SmallVector infrt::cvtValueToValueRange( + const mlir::Value &operand) { + return mlir::SmallVector(1, operand); +} + +mlir::SmallVector infrt::concatTwoValueRange( + mlir::ValueRange operand_0, mlir::ValueRange operand_1) { + mlir::SmallVector operands; + operands.append(operand_0.begin(), operand_0.end()); + operands.append(operand_1.begin(), operand_1.end()); + return operands; +} diff --git a/paddle/fluid/operators/searchsorted_op.cu b/paddle/infrt/dialect/infrt/common/utils.h similarity index 57% rename from paddle/fluid/operators/searchsorted_op.cu rename to paddle/infrt/dialect/infrt/common/utils.h index 4633ab43efba121cf4c55a877d90b974690952ec..886407b56649a296046d570826cf2b1b0e8aade8 100644 --- a/paddle/fluid/operators/searchsorted_op.cu +++ b/paddle/infrt/dialect/infrt/common/utils.h @@ -12,12 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/searchsorted_op.h" -namespace ops = paddle::operators; -namespace plat = paddle::platform; +#pragma once -REGISTER_OP_CUDA_KERNEL( - searchsorted, ops::SearchSortedKernel, - ops::SearchSortedKernel, - ops::SearchSortedKernel, - ops::SearchSortedKernel); +#include +#include +#include +#include +#include +#include + +namespace infrt { + +mlir::SmallVector cvtValueToValueRange( + const mlir::Value &operand); + +mlir::SmallVector concatTwoValueRange( + mlir::ValueRange operand_0, mlir::ValueRange operand_1); +} // namespace infrt diff --git a/paddle/infrt/dialect/infrt/ir/CMakeLists.txt b/paddle/infrt/dialect/infrt/ir/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..7c009bdb267e6ea1dd5a5fb392f64dddb7a05f06 --- /dev/null +++ b/paddle/infrt/dialect/infrt/ir/CMakeLists.txt @@ -0,0 +1,18 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + infrt_dialect.cc + basic_kernels.cc + test_kernels.cc + ) + +add_mlir_dialect(infrt_ops infrt) + +set(LLVM_TARGET_DEFINITIONS infrt_ops.td) +mlir_tablegen(infrt_opsAttributes.h.inc -gen-attrdef-decls -dialect=infrt) +mlir_tablegen(infrt_opsAttributes.cpp.inc -gen-attrdef-defs -dialect=infrt) +add_public_tablegen_target(MLIRinfrt_opsAttributesIncGen) +add_dependencies(mlir-headers MLIRinfrt_opsAttributesIncGen) + +mlir_tablegen_on(basic_kernels) +mlir_tablegen_on(test_kernels) diff --git a/paddle/infrt/dialect/basic_kernels.cc b/paddle/infrt/dialect/infrt/ir/basic_kernels.cc similarity index 63% rename from paddle/infrt/dialect/basic_kernels.cc rename to paddle/infrt/dialect/infrt/ir/basic_kernels.cc index c1aa75fb24650b99ea8371c0ecbe7e572df2f0ce..ba83f3e36c94a173accad9fb6e746eaec0ec8e6c 100644 --- a/paddle/infrt/dialect/basic_kernels.cc +++ b/paddle/infrt/dialect/infrt/ir/basic_kernels.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/infrt/dialect/basic_kernels.h" +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" #include #include @@ -30,23 +30,6 @@ namespace infrt { namespace dialect { using namespace mlir; // NOLINT -static ParseResult parseCallOp(OpAsmParser &parser, // NOLINT - OperationState &result) { // NOLINT - SymbolRefAttr callee_attr; - FunctionType callee_type; - SmallVector operands; - auto callee_loc = parser.getNameLoc(); - if (parser.parseAttribute(callee_attr, "callee", result.attributes) || - parser.parseOperandList(operands, OpAsmParser::Delimiter::Paren) || - parser.parseOptionalAttrDict(result.attributes) || - parser.parseColonType(callee_type) || - parser.addTypesToList(callee_type.getResults(), result.types) || - parser.resolveOperands( - operands, callee_type.getInputs(), callee_loc, result.operands)) - return failure(); - return success(); -} - static ParseResult parseConstantOp(Type attrType, OpAsmParser &parser, // NOLINT OperationState &result) { // NOLINT @@ -79,24 +62,6 @@ static ParseResult parseConstantI64Op(OpAsmParser &parser, // NOLINT IntegerType::get(result.getContext(), 64), parser, result); } -static ParseResult parseReturnOp(OpAsmParser &parser, // NOLINT - OperationState &result) { // NOLINT - SmallVector opInfo; - SmallVector types; - llvm::SMLoc loc = parser.getCurrentLocation(); - return failure(parser.parseOperandList(opInfo) || - (!opInfo.empty() && parser.parseColonTypeList(types)) || - parser.resolveOperands(opInfo, types, loc, result.operands)); -} - -static void print(OpAsmPrinter &p, CallOp op) { // NOLINT - p << op->getAttr("callee") << "("; - p.printOperands(op.getOperands()); - p << ")"; - p.printOptionalAttrDict(op->getAttrs(), {"callee"}); - p << " : "; -} - static void printConstant(OpAsmPrinter &p, mlir::Operation *op) { // NOLINT p << " "; p.printOptionalAttrDict(op->getAttrs(), /*elidedAttrs=*/{"value"}); @@ -127,37 +92,13 @@ static void print(OpAsmPrinter &p, ConstantI64Op op) { // NOLINT printConstant(p, op); } -static void print(OpAsmPrinter &p, ReturnOp op) { // NOLINT - if (op.getNumOperands() > 0) { - p << ' '; - p.printOperands(op.getOperands()); - p << " : "; - llvm::interleaveComma(op.getOperands(), p); - } -} - -static LogicalResult verify(CallOp op) { return success(); } - static LogicalResult verify(ConstantF32Op op) { return success(); } static LogicalResult verify(ConstantI32Op op) { return success(); } static LogicalResult verify(ConstantF64Op op) { return success(); } static LogicalResult verify(ConstantI64Op op) { return success(); } -static LogicalResult verify(ReturnOp op) { - auto function = dyn_cast(op->getParentOp()); - - if (!function) return success(); - - auto results = function.getType().getResults(); - if (op.getNumOperands() != results.size()) - return op.emitOpError("has ") - << op.getNumOperands() - << " operands, but enclosing function returns " << results.size(); - - return success(); -} } // namespace dialect } // namespace infrt #define GET_OP_CLASSES -#include "paddle/infrt/dialect/basic_kernels.cpp.inc" +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.cpp.inc" diff --git a/paddle/infrt/dialect/basic_kernels.h b/paddle/infrt/dialect/infrt/ir/basic_kernels.h similarity index 92% rename from paddle/infrt/dialect/basic_kernels.h rename to paddle/infrt/dialect/infrt/ir/basic_kernels.h index b82abcd52d28f45b18824d9ea6f9e12c2ec1c574..a36f55691b716dda51120e8c4be7c956df9b9f25 100644 --- a/paddle/infrt/dialect/basic_kernels.h +++ b/paddle/infrt/dialect/infrt/ir/basic_kernels.h @@ -18,4 +18,4 @@ #include 
#define GET_OP_CLASSES -#include "paddle/infrt/dialect/basic_kernels.hpp.inc" +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.hpp.inc" diff --git a/paddle/infrt/dialect/basic_kernels.td b/paddle/infrt/dialect/infrt/ir/basic_kernels.td similarity index 69% rename from paddle/infrt/dialect/basic_kernels.td rename to paddle/infrt/dialect/infrt/ir/basic_kernels.td index 89d8cd65b85cd39c9eb50edca1aa1bfaf47073a4..60315b45dd0dfaee8437c1dd312691445fdede56 100644 --- a/paddle/infrt/dialect/basic_kernels.td +++ b/paddle/infrt/dialect/infrt/ir/basic_kernels.td @@ -4,10 +4,10 @@ #else #define BASIC_OPS -include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" include "mlir/Interfaces/SideEffectInterfaces.td" -class INFRT_Op traits = []> : Op { +class INFRT_Op traits = []> : Op { // Each registered op needs to provide all of a printer, parser and verifier. let printer = [{ return infrt::dialect::print(p, *this); }]; @@ -15,23 +15,6 @@ class INFRT_Op traits = []> : Op { - let summary = "call a host operation"; - let description = [{ - The "infrt.call" operation represents a direct call to a function. The operands and result types of the call must match the specified function type. - - %2 = infrt.call @add(%0, %1) : (f32, f32) -> f32 - }]; - - let arguments = (ins FlatSymbolRefAttr:$callee, Variadic:$operands); - let results = (outs Variadic); - - let extraClassDeclaration = [{ - mlir::StringRef getCallee() { return callee(); } - mlir::FunctionType getCalleeType(); - }]; -} - class ConstantOp : INFRT_Op<"constant." # suffix, [NoSideEffect]> { let summary = "constant value constructor in host"; @@ -45,22 +28,6 @@ def ConstantI64Op : ConstantOp<"i64", I64, I64Attr>; def ConstantF32Op : ConstantOp<"f32", F32, F32Attr>; def ConstantF64Op : ConstantOp<"f64", F64, F64Attr>; -def ReturnOp : INFRT_Op<"return", [Terminator]> { - let summary = "host executor return operation"; - let description = [{ - The "Infrt.return" operation represents a return operation within a function. - - func @foo() : (i32, f8) { - Infrt.return %0, %1 : i32, f8 - } - }]; - - let arguments = (ins Variadic:$operands); - - let builders = [OpBuilder<(ins), - [{ build($_builder, $_state, llvm::None); }]>]; -} - class AddOp : INFRT_Op<"add." # suffix, [NoSideEffect]> { let summary = "infrt.add operation"; let description = [{ @@ -112,7 +79,7 @@ def PrintF32Op : PrintOp<"f32", F32>; def PrintF64Op : PrintOp<"f64", F64>; def PrintStringOp : INFRT_Op<"print_string"> { - let summary = "Infrt.print_string"; + let summary = "infrt.print_string"; let description = [{ An operation that prints a string. }]; diff --git a/paddle/infrt/dialect/infrt/infrt_ops_base.td b/paddle/infrt/dialect/infrt/ir/infrt_base.td similarity index 81% rename from paddle/infrt/dialect/infrt/infrt_ops_base.td rename to paddle/infrt/dialect/infrt/ir/infrt_base.td index 3190c1c84b8c04ceb7e91d829865c65503f5d708..86cfc375330b19878528645a2e810efb797e153f 100644 --- a/paddle/infrt/dialect/infrt/infrt_ops_base.td +++ b/paddle/infrt/dialect/infrt/ir/infrt_base.td @@ -89,6 +89,13 @@ def DenseTensorMap : Infrt_Type<"DenseTensorMap"> { let parameters = (ins); } +// TODO(wilber): Add !infrt.vec type. +def DenseTensorList : Infrt_Type<"DenseTensorList"> { + let summary = "infrt dense tensor map"; + let description = [{dense_tensor map}]; + let parameters = (ins); +} + // Type Constrait for concrete DenseTensor type. 
class DenseTensor : Type, @@ -101,4 +108,21 @@ class Infrt_Attr traits = [], : AttrDef { let mnemonic = ?; } + +// tools function. used for pattern rewriter +class INFRT_createI32Attr : NativeCodeCall< + "$_builder.getI32IntegerAttr(" # value # ")">; + +class INFRT_createSI32Attr : NativeCodeCall< + "$_builder.getSI32IntegerAttr(" # value # ")">; + +class INFRT_createF32Attr : NativeCodeCall< + "$_builder.getF32FloatAttr(" # value # ")">; + +def INFRT_cvtValueToValueRange : NativeCodeCall< + "infrt::cvtValueToValueRange($0)">; + +def INFRT_concatTwoValueRange : NativeCodeCall< + "infrt::concatTwoValueRange($0, $1)">; + #endif // INFRT_OPS_BASE diff --git a/paddle/infrt/dialect/infrt/infrt_dialect.cc b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc similarity index 76% rename from paddle/infrt/dialect/infrt/infrt_dialect.cc rename to paddle/infrt/dialect/infrt/ir/infrt_dialect.cc index 400e4921c944491e0ce8cded38fec9435f4ad0bd..f8d8f514749f802299600acac60b12de70a8d3fe 100644 --- a/paddle/infrt/dialect/infrt/infrt_dialect.cc +++ b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc @@ -12,40 +12,52 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include #include #include #include #include "paddle/infrt/dialect/dense_tensor.h" -#include "paddle/infrt/dialect/infrt/infrt_opsDialect.cpp.inc" +#include "paddle/infrt/dialect/infrt/ir/infrt_opsDialect.cpp.inc" #define GET_TYPEDEF_CLASSES -#include "paddle/infrt/dialect/infrt/infrt_opsTypes.cpp.inc" +#include "paddle/infrt/dialect/infrt/ir/infrt_opsTypes.cpp.inc" #define GET_ATTRDEF_CLASSES -#include "paddle/infrt/dialect/infrt/infrt_opsAttributes.cpp.inc" +#include "paddle/infrt/dialect/infrt/ir/infrt_opsAttributes.cpp.inc" #define GET_OP_CLASSES -#include "paddle/infrt/dialect/infrt/infrt_ops.cpp.inc" +#include "paddle/infrt/dialect/infrt/ir/infrt_ops.cpp.inc" + +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" + +#include "paddle/infrt/dialect/infrt/ir/test_kernels.h" namespace infrt { void InfrtDialect::initialize() { addTypes< #define GET_TYPEDEF_LIST -#include "paddle/infrt/dialect/infrt/infrt_opsTypes.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/infrt/ir/infrt_opsTypes.cpp.inc" // NOLINT >(); addAttributes< #define GET_ATTRDEF_LIST -#include "paddle/infrt/dialect/infrt/infrt_opsAttributes.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/infrt/ir/infrt_opsAttributes.cpp.inc" // NOLINT >(); addOperations< #define GET_OP_LIST -#include "paddle/infrt/dialect/infrt/infrt_ops.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/infrt/ir/infrt_ops.cpp.inc" // NOLINT + >(); + addOperations< +#define GET_OP_LIST +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.cpp.inc" + >(); + addOperations< +#define GET_OP_LIST +#include "paddle/infrt/dialect/infrt/ir/test_kernels.cpp.inc" >(); } @@ -78,6 +90,9 @@ mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const { return LoDTensorType::get( parser.getContext(), shape, elementType, lod_level); } + if (keyword == "dense_tensor_map") { + return DenseTensorMapType::get(parser.getContext()); + } if (keyword == "dense_tensor") { // parse DenseTensor, for example: !i=Infrt.tensor llvm::StringRef target; @@ -122,13 +137,18 @@ mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const { return DenseTensorType::get( parser.getContext(), *targetType, *precisionType, *layoutType); } + + if (keyword == 
"tensor_list") { + return infrt::DenseTensorListType::get(parser.getContext()); + } + // Todo: parse other type return mlir::Type(); } void InfrtDialect::printType(::mlir::Type type, ::mlir::DialectAsmPrinter &os) const { - // print LoDTensorType, for example: !Infrt.lod_tensor<3x64x3x3xf32,5> + // print LoDTensorType, for example: !infrt.lod_tensor<3x64x3x3xf32,5> if (type.isa()) { auto lod_tensor_type = type.cast(); os << "lod_tensor<"; @@ -142,9 +162,13 @@ void InfrtDialect::printType(::mlir::Type type, << lod_tensor_type.getLod_level() << ">"; return; } + if (type.isa()) { + os << "dense_tensor_map"; + return; + } // print DenseTensorType, for example: !infrt.dense_tensor - if (type.isa()) { + if (type.isa()) { auto dense_tensor_type = type.cast(); os << "dense_tensor<" << dense_tensor_type.getTarget() << ", " << dense_tensor_type.getPrecision() << ", " @@ -152,6 +176,16 @@ void InfrtDialect::printType(::mlir::Type type, return; } + if (type.isa()) { + os << "tensor_list"; + return; + } + // print DenseTensorType, for example: !infrt.dense_tensor + if (type.isa()) { + os << "dense_tensor_map"; + return; + } + llvm_unreachable("unknown infrt type."); } diff --git a/paddle/infrt/dialect/infrt/infrt_dialect.h b/paddle/infrt/dialect/infrt/ir/infrt_dialect.h similarity index 77% rename from paddle/infrt/dialect/infrt/infrt_dialect.h rename to paddle/infrt/dialect/infrt/ir/infrt_dialect.h index ed5b36e556149dbc3026e732cf953c5562841921..3e6ea2a74c79d43015a62f166928e10adb48698a 100644 --- a/paddle/infrt/dialect/infrt/infrt_dialect.h +++ b/paddle/infrt/dialect/infrt/ir/infrt_dialect.h @@ -22,14 +22,14 @@ #include #include #include -#include "paddle/infrt/dialect/infrt/common_type.h" +#include "paddle/infrt/dialect/infrt/common/types.h" -#include "paddle/infrt/dialect/infrt/infrt_opsDialect.h.inc" +#include "paddle/infrt/dialect/infrt/ir/infrt_opsDialect.h.inc" #define GET_TYPEDEF_CLASSES -#include "paddle/infrt/dialect/infrt/infrt_opsTypes.h.inc" +#include "paddle/infrt/dialect/infrt/ir/infrt_opsTypes.h.inc" #define GET_ATTRDEF_CLASSES -#include "paddle/infrt/dialect/infrt/infrt_opsAttributes.h.inc" +#include "paddle/infrt/dialect/infrt/ir/infrt_opsAttributes.h.inc" #define GET_OP_CLASSES -#include "paddle/infrt/dialect/infrt/infrt_ops.h.inc" +#include "paddle/infrt/dialect/infrt/ir/infrt_ops.h.inc" diff --git a/paddle/infrt/dialect/infrt/infrt_ops.td b/paddle/infrt/dialect/infrt/ir/infrt_ops.td similarity index 57% rename from paddle/infrt/dialect/infrt/infrt_ops.td rename to paddle/infrt/dialect/infrt/ir/infrt_ops.td index 16ade66d47b8ee538a6e7c4f19bf571a25c3e416..82eba2a1746cce31e3fe99ae71c782bb88524930 100644 --- a/paddle/infrt/dialect/infrt/infrt_ops.td +++ b/paddle/infrt/dialect/infrt/ir/infrt_ops.td @@ -1,4 +1,4 @@ -include "paddle/infrt/dialect/infrt/infrt_ops_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" // Op definition class Infrt_Op traits = []> : Op { @@ -33,9 +33,29 @@ def Infrt_ReturnOp : Infrt_Op<"return", [Terminator]> { let assemblyFormat = "attr-dict ($operands^ `:` type($operands))?"; } -def Infrt_CvtTensorOp : Infrt_Op<"cvt_tensor", [NoSideEffect]> { - let summary = "convert tensor type op"; - let description = [{convert tensor type op!}]; +def Infrt_CallOp : Infrt_Op<"call"> { + let summary = "call a host operation"; + let description = [{ + The "infrt.call" operation represents a direct call to a function. The operands and result types of the call must match the specified function type. 
+ + %2 = infrt.call @add(%0, %1) : (f32, f32) -> f32 + }]; + + let arguments = (ins FlatSymbolRefAttr:$callee, Variadic:$operands); + let results = (outs Variadic); + + //let extraClassDeclaration = [{ + // mlir::StringRef getCallee() { return callee(); } + // mlir::FunctionType getCalleeType(); + // }]; + let assemblyFormat = [{ + $callee `(` $operands `)` attr-dict `:` functional-type($operands, results) + }]; +} + +def Infrt_TensorCastOp : Infrt_Op<"tensor_cast", [NoSideEffect]> { + let summary = "cast tensor type op"; + let description = [{cast tensor type op!}]; let arguments = (ins AnyType:$input); let results = (outs AnyType:$output); } diff --git a/paddle/infrt/dialect/test_kernels.cc b/paddle/infrt/dialect/infrt/ir/test_kernels.cc similarity index 96% rename from paddle/infrt/dialect/test_kernels.cc rename to paddle/infrt/dialect/infrt/ir/test_kernels.cc index f0c4723b49a7906cf5327771e26eb87e8b1248c0..5f7f83a9dfa8011b3043e20da7d9f21f3afe5cf6 100644 --- a/paddle/infrt/dialect/test_kernels.cc +++ b/paddle/infrt/dialect/infrt/ir/test_kernels.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/infrt/dialect/test_kernels.h" +#include "paddle/infrt/dialect/infrt/ir/test_kernels.h" #include #include @@ -147,7 +147,7 @@ static mlir::LogicalResult verify(BenchmarkOp op) { // Verify that the target benchmark region has exactly one return value. auto ®ion = op.region(); auto &last_op = region.front().back(); - if (last_op.getName().getStringRef() != "Infrt.return") { + if (last_op.getName().getStringRef() != "infrt.return") { return op.emitOpError("missing return statement"); } if (last_op.getNumOperands() != 1) { @@ -161,4 +161,4 @@ static mlir::LogicalResult verify(BenchmarkOp op) { } // namespace infrt #define GET_OP_CLASSES -#include "paddle/infrt/dialect/test_kernels.cpp.inc" +#include "paddle/infrt/dialect/infrt/ir/test_kernels.cpp.inc" diff --git a/paddle/infrt/dialect/test_kernels.h b/paddle/infrt/dialect/infrt/ir/test_kernels.h similarity index 92% rename from paddle/infrt/dialect/test_kernels.h rename to paddle/infrt/dialect/infrt/ir/test_kernels.h index 73c8a6fb387bca6ebc7ae393e4bba32ab94aa951..1fe5020b240046f71571e3a4c999b1eae07741a1 100644 --- a/paddle/infrt/dialect/test_kernels.h +++ b/paddle/infrt/dialect/infrt/ir/test_kernels.h @@ -17,4 +17,4 @@ #include #define GET_OP_CLASSES -#include "paddle/infrt/dialect/test_kernels.hpp.inc" +#include "paddle/infrt/dialect/infrt/ir/test_kernels.hpp.inc" diff --git a/paddle/infrt/dialect/test_kernels.td b/paddle/infrt/dialect/infrt/ir/test_kernels.td similarity index 93% rename from paddle/infrt/dialect/test_kernels.td rename to paddle/infrt/dialect/infrt/ir/test_kernels.td index 6e4bc26aa1496dcb4caed83f98fc42dab9e3cce0..0ce1f3f65e8f7f46cf32794b3191e66ae71e3eae 100644 --- a/paddle/infrt/dialect/test_kernels.td +++ b/paddle/infrt/dialect/infrt/ir/test_kernels.td @@ -4,12 +4,12 @@ #else #define TEST_OPS -include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" include "mlir/Interfaces/SideEffectInterfaces.td" // Base class for Test dialect ops. class Test_Op traits = []> : - Op { + Op { // Each registered op in the Test namespace needs to provide all of a printer, // parser and verifier. @@ -45,7 +45,7 @@ def BenchmarkOp : Test_Op<"benchmark"> { // The following code benchmarks the infrt.add.i32 kernel. %x = infrt.add.i32 %c, %c // The benchmarked function needs to return exactly one value. 
- Infrt.return %x : i32 + infrt.return %x : i32 } }]; diff --git a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td index ef702650b6f1bbd3615ca7a70880d3c2c04e254b..3d825a9c762f4833e577125d20423a5f1d41737f 100644 --- a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td +++ b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td @@ -2,20 +2,20 @@ #define INFRT_OP_FUSE include "mlir/Interfaces/SideEffectInterfaces.td" -include "paddle/infrt/dialect/infrt/infrt_ops.td" -include "paddle/infrt/dialect/pd_ops.td" +include "paddle/infrt/dialect/infrt/ir/infrt_ops.td" +include "paddle/infrt/dialect/pd/ir/pd_ops.td" -def FuseCvtTensorPattern : Pat< - (Infrt_CvtTensorOp (Infrt_CvtTensorOp $arg)), - (Infrt_CvtTensorOp $arg)>; +def FuseTensorCastPattern : Pat< + (Infrt_TensorCastOp (Infrt_TensorCastOp $arg)), + (Infrt_TensorCastOp $arg)>; -def FuseFeedCvtTensorPattern : Pat< - (Infrt_CvtTensorOp (PD_FeedOp $name)), +def FuseFeedTensorCastPattern : Pat< + (Infrt_TensorCastOp (PD_FeedOp $name)), (PD_FeedOp $name)>; def TypesAreIdentical : Constraint>; -def RedundantCvtTensorOptPattern : Pat< - (Infrt_CvtTensorOp:$res $arg), (replaceWithValue $arg), +def RedundantTensorCastOptPattern : Pat< + (Infrt_TensorCastOp:$res $arg), (replaceWithValue $arg), [(TypesAreIdentical $res, $arg)]>; diff --git a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc index cb16e054418b3b2c6ff843fdaf464d24a42249c2..eec0e0bc7c5ab624e9db7744c357b58ff5107eef 100644 --- a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc +++ b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc @@ -15,8 +15,8 @@ #include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h" #include -#include "paddle/infrt/dialect/infrt/infrt_dialect.h" -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" namespace { #include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse.cpp.inc" // NOLINT @@ -27,8 +27,12 @@ struct InfrtOpFusePass : public mlir::PassWrapper { public: ::llvm::StringRef getName() const override { return "infrtOpFusePass"; } + + llvm::StringRef getArgument() const override { return "infrt-op-fuse"; } + void runOnFunction() override; }; + // Implementation of the InfrtOpFusePass. void InfrtOpFusePass::runOnFunction() { ::mlir::RewritePatternSet patterns(&getContext()); @@ -39,14 +43,18 @@ void InfrtOpFusePass::runOnFunction() { if (nullptr == terminator_op) return; for (auto operand : terminator_op->getOperands()) { auto *op1 = operand.getDefiningOp(); - auto cvt_op = ::llvm::dyn_cast<::infrt::CvtTensorOp>(op1); + auto cvt_op = ::llvm::dyn_cast<::infrt::TensorCastOp>(op1); if (!cvt_op) continue; mlir::Value value = cvt_op.input(); operand.replaceAllUsesWith(value); cvt_op.erase(); } } + } // namespace + std::unique_ptr infrt::createInfrtOpFusePass() { return std::make_unique(); } + +mlir::PassRegistration infrt_op_fuse_pass; diff --git a/paddle/infrt/dialect/infrt_base.cc b/paddle/infrt/dialect/infrt_base.cc deleted file mode 100644 index e951762abb20c232232af66d6bf1f2e7568a763b..0000000000000000000000000000000000000000 --- a/paddle/infrt/dialect/infrt_base.cc +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/infrt/dialect/infrt_base.h" - -#include "paddle/infrt/dialect/basic_kernels.h" -#include "paddle/infrt/dialect/dense_tensor.h" -#include "paddle/infrt/dialect/test_kernels.h" - -namespace infrt { -namespace dialect { - -// ----INFRTDialect definition begin---- -void INFRTDialect::initialize() { - allowUnknownTypes(); - allowUnknownOperations(); - addOperations< -#define GET_OP_LIST -#include "paddle/infrt/dialect/basic_kernels.cpp.inc" - >(); - addOperations< -#define GET_OP_LIST -#include "paddle/infrt/dialect/test_kernels.cpp.inc" - >(); -} - -mlir::Type INFRTDialect::parseType(mlir::DialectAsmParser &parser) const { - llvm::StringRef keyword; - if (parser.parseKeyword(&keyword)) return mlir::Type(); - // parse TensorMapType, for example: !infrt.tensor_map - parser.emitError(parser.getCurrentLocation(), "unknown infrt type: ") - << keyword; - return mlir::Type(); -} - -void INFRTDialect::printType(mlir::Type type, - mlir::DialectAsmPrinter &printer) const { - // print TensorMapType, for example: !infrt.tensor_map - llvm_unreachable("unknown infrt type."); -} - -// ----INFRTDialect definition end---- - -} // namespace dialect -} // namespace infrt diff --git a/paddle/infrt/dialect/infrt_base.h b/paddle/infrt/dialect/infrt_base.h deleted file mode 100644 index 3ef73171dcdea4e0367837f4b3893405c29a1580..0000000000000000000000000000000000000000 --- a/paddle/infrt/dialect/infrt_base.h +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "paddle/infrt/dialect/infrt_base.hpp.inc" - -namespace infrt { -namespace dialect { - -class INFRTDialect : public mlir::Dialect { - explicit INFRTDialect(mlir::MLIRContext *context) - : mlir::Dialect( - getDialectNamespace(), context, mlir::TypeID::get()) { - initialize(); - } - - // parse types registered to the dialect. - mlir::Type parseType(mlir::DialectAsmParser &parser) const override; - // print types registered to the dialect. 
- void printType(mlir::Type type, - mlir::DialectAsmPrinter &printer) const override; - - void initialize(); - friend class mlir::MLIRContext; - - public: - static ::llvm::StringRef getDialectNamespace() { return "Infrt"; } -}; -} // namespace dialect - -template -static mlir::IntegerAttr createI32Attr(mlir::OpBuilder &b, // NOLINT - mlir::Location loc, - T constant) { - return b.getIntegerAttr(b.getI32Type(), constant); -} - -template -static mlir::IntegerAttr createSI32Attr(mlir::OpBuilder &b, // NOLINT - mlir::Location loc, - T constant) { - return b.getSI32IntegerAttr(constant); -} - -template -static mlir::FloatAttr createF32Attr(mlir::OpBuilder &b, // NOLINT - mlir::Location loc, - T constant) { - return b.getF32FloatAttr(constant); -} - -static mlir::SmallVector cvtValueToValueRange( - const mlir::Value &operand) { - return mlir::SmallVector(1, operand); -} - -static mlir::SmallVector concatTwoValueRange( - mlir::ValueRange operand_0, mlir::ValueRange operand_1) { - mlir::SmallVector operands; - operands.append(operand_0.begin(), operand_0.end()); - operands.append(operand_1.begin(), operand_1.end()); - return operands; -} -} // namespace infrt diff --git a/paddle/infrt/dialect/infrt_base.td b/paddle/infrt/dialect/infrt_base.td deleted file mode 100644 index 45e6b116f489709b1d854727870010c7545d92e7..0000000000000000000000000000000000000000 --- a/paddle/infrt/dialect/infrt_base.td +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef INFRT_BASE -#define INFRT_BASE - -include "mlir/IR/OpBase.td" -include "paddle/infrt/dialect/infrt/infrt_ops_base.td" - -def INFRT_Dialect : Dialect { - let name = "Infrt"; - - let description = [{ - The INFRT host dialect. - }]; - - let cppNamespace = "::infrt::dialect"; -} - -def BufferType : OpaqueType<"b", "buffer", "buffer">; - -class INFRT_createI32Attr : NativeCodeCall< - "infrt::createI32Attr($_builder, $_loc, " # value # ")">; - -class INFRT_createSI32Attr : NativeCodeCall< - "infrt::createSI32Attr($_builder, $_loc, " # value # ")">; - -class INFRT_createF32Attr : NativeCodeCall< - "infrt::createF32Attr($_builder, $_loc, " # value # ")">; - -def INFRT_cvtValueToValueRange : NativeCodeCall< - "infrt::cvtValueToValueRange($0)">; - -def INFRT_concatTwoValueRange : NativeCodeCall< - "infrt::concatTwoValueRange($0, $1)">; -#endif // INFRT_BASE diff --git a/paddle/infrt/dialect/init_infrt_dialects.cc b/paddle/infrt/dialect/init_dialects.cc similarity index 77% rename from paddle/infrt/dialect/init_infrt_dialects.cc rename to paddle/infrt/dialect/init_dialects.cc index 5eae01719361dd5bc21c139b54cbcf16f226b4cc..56c375c72d2bbb24e1a279c6f160e0ea7a98bd83 100644 --- a/paddle/infrt/dialect/init_infrt_dialects.cc +++ b/paddle/infrt/dialect/init_dialects.cc @@ -12,29 +12,31 @@ // See the License for the specific language governing permissions and // limitations under the License. 
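For reference, the renamed infrt-op-fuse patterns above target IR of roughly the following shape (generic op syntax; the dense-tensor parameters are illustrative). FuseTensorCastPattern collapses the chained casts into a single infrt.tensor_cast, and RedundantTensorCastOptPattern then erases the survivor because its source and result types match:

    %a = "infrt.tensor_cast"(%t) : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<CPU, FP32, ANY>
    %b = "infrt.tensor_cast"(%a) : (!infrt.dense_tensor<CPU, FP32, ANY>) -> !infrt.dense_tensor<CPU, FP32, NCHW>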
-#include "paddle/infrt/dialect/init_infrt_dialects.h" +#include "paddle/infrt/dialect/init_dialects.h" #include -#include "paddle/infrt/dialect/basic_kernels.h" #include "paddle/infrt/dialect/dense_tensor.h" -#include "paddle/infrt/dialect/infrt/infrt_dialect.h" -#include "paddle/infrt/dialect/infrt_base.h" -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" + +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" #include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" #include "paddle/infrt/dialect/phi/ir/phi_base.h" #include "paddle/infrt/dialect/phi/ir/phi_kernels.h" #include "paddle/infrt/dialect/tensor_shape.h" +#include "paddle/infrt/dialect/tensorrt/trt_ops.h" namespace infrt { void registerCinnDialects(mlir::DialectRegistry ®istry) { // NOLINT registry.insert #include "paddle/infrt/dialect/diagnostic_utils.h" -#include "paddle/infrt/dialect/init_infrt_dialects.h" +#include "paddle/infrt/dialect/init_dialects.h" namespace infrt { namespace dialect { diff --git a/paddle/infrt/dialect/mlir_loader_test.cc b/paddle/infrt/dialect/mlir_loader_test.cc index 2f721e49a63096d1c3168805d373cbc8809542da..8ccb07161d364e968ead568f20c4b98b18a7e04e 100644 --- a/paddle/infrt/dialect/mlir_loader_test.cc +++ b/paddle/infrt/dialect/mlir_loader_test.cc @@ -22,7 +22,7 @@ #include -#include "paddle/infrt/dialect/init_infrt_dialects.h" +#include "paddle/infrt/dialect/init_dialects.h" namespace infrt { namespace dialect { @@ -32,13 +32,13 @@ TEST(MlirLoader, basic) { auto source = R"ROC( func @main() -> f32 { - %v0 = Infrt.constant.f32 1.0 - %v1 = Infrt.constant.f32 2.0 - %value = "Infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 + %value = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 - "Infrt.print.f32"(%v0) : (f32) -> () + "infrt.print.f32"(%v0) : (f32) -> () - Infrt.return %value : f32 + infrt.return %value : f32 } )ROC"; diff --git a/paddle/infrt/dialect/opt.cc b/paddle/infrt/dialect/opt.cc index 5bcf5a23f4c532b1056ceaa54c80902b32e4061a..2006530958f0b5223edfcee87a5895e101f0e240 100644 --- a/paddle/infrt/dialect/opt.cc +++ b/paddle/infrt/dialect/opt.cc @@ -14,7 +14,7 @@ #include #include -#include "paddle/infrt/dialect/init_infrt_dialects.h" +#include "paddle/infrt/dialect/init_dialects.h" int main(int argc, char **argv) { mlir::DialectRegistry registry; diff --git a/paddle/infrt/dialect/pd/CMakeLists.txt b/paddle/infrt/dialect/pd/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f65336453fbdf82f30948aeea8dc52b0367159b --- /dev/null +++ b/paddle/infrt/dialect/pd/CMakeLists.txt @@ -0,0 +1,3 @@ +add_subdirectory(common) +add_subdirectory(ir) +add_subdirectory(pass) diff --git a/paddle/infrt/dialect/pd/common/CMakeLists.txt b/paddle/infrt/dialect/pd/common/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..ee1b0d4c30deb2e7fbf19aa91ec3dd3bdcd449af --- /dev/null +++ b/paddle/infrt/dialect/pd/common/CMakeLists.txt @@ -0,0 +1,4 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + ) diff --git a/paddle/infrt/dialect/pd/ir/CMakeLists.txt b/paddle/infrt/dialect/pd/ir/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..8aacfc97623c0dadc0ccb604440ce19427d860ba --- /dev/null +++ b/paddle/infrt/dialect/pd/ir/CMakeLists.txt @@ -0,0 +1,7 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + pd_ops.cc + ) +add_mlir_dialect(pd_ops pd) 
+mlir_tablegen_on(pd_extra_ops) diff --git a/paddle/infrt/dialect/pd_extra_ops.td b/paddle/infrt/dialect/pd/ir/pd_extra_ops.td similarity index 90% rename from paddle/infrt/dialect/pd_extra_ops.td rename to paddle/infrt/dialect/pd/ir/pd_extra_ops.td index c6d3f530455f76d0352ef5ac42297c30ce521da2..cf17db211cbe98c586423c7db050dfdc12576cff 100644 --- a/paddle/infrt/dialect/pd_extra_ops.td +++ b/paddle/infrt/dialect/pd/ir/pd_extra_ops.td @@ -4,7 +4,7 @@ include "mlir/Interfaces/InferTypeOpInterface.td" include "mlir/Interfaces/LoopLikeInterface.td" include "mlir/IR/OpBase.td" -include "paddle/infrt/dialect/pd_op_base.td" +include "paddle/infrt/dialect/pd/ir/pd_op_base.td" def PD_FusedFC : PD_Op<"FC", [NoSideEffect]> { let summary = "Computes the Fully Connected result of two tensors"; diff --git a/paddle/infrt/dialect/pd_op_base.td b/paddle/infrt/dialect/pd/ir/pd_op_base.td similarity index 83% rename from paddle/infrt/dialect/pd_op_base.td rename to paddle/infrt/dialect/pd/ir/pd_op_base.td index 26425e3945caa2f85547b7b8e8be7dbeaf10e630..e28854a848023c1161c8cda24edb705f536b5698 100644 --- a/paddle/infrt/dialect/pd_op_base.td +++ b/paddle/infrt/dialect/pd/ir/pd_op_base.td @@ -6,9 +6,9 @@ include "mlir/IR/OpBase.td" include "mlir/Interfaces/SideEffectInterfaces.td" -include "paddle/infrt/dialect/infrt/infrt_ops_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" -def PD_Dialect : Dialect { +def Paddle_Dialect : Dialect { let name = "pd"; let description = [{ @@ -16,16 +16,16 @@ def PD_Dialect : Dialect { This dialect contains the PaddlePaddle operators. }]; - - let cppNamespace = "mlir::pd"; + let hasConstantMaterializer = 1; + let cppNamespace = "infrt::pd"; } class PD_Op traits = []> : - Op; + Op; class PD_PaddleAttr : - Attr()">, + Attr()">, "PaddlePaddle " # description # " attribute">; @@ -33,12 +33,12 @@ class PD_PaddleAttr : // PaddlePaddle type definitions //===----------------------------------------------------------------------===// -def PD_PDDialectType : Type()">, "PaddlePaddle type">; +def PD_PDDialectType : Type()">, "PaddlePaddle type">; class PD_PaddleType : - Type()">, + Type()">, "Paddle " # description # " type">, - BuildableType<"getType()">; + BuildableType<"getType()">; //===----------------------------------------------------------------------===// // Integer types diff --git a/paddle/infrt/dialect/pd/ir/pd_ops.cc b/paddle/infrt/dialect/pd/ir/pd_ops.cc new file mode 100644 index 0000000000000000000000000000000000000000..b5ba48581ee62f4e77328ed9f91ad956632dbbb7 --- /dev/null +++ b/paddle/infrt/dialect/pd/ir/pd_ops.cc @@ -0,0 +1,76 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" + +#include +#include + +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" +#include "paddle/infrt/dialect/pd/ir/pd_opsDialect.cpp.inc" +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/pd/ir/pd_ops.cpp.inc" // NOLINT +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/pd/ir/pd_extra_ops.cpp.inc" // NOLINT + +namespace infrt { +namespace pd { +void PaddleDialect::initialize() { + addOperations< +#define GET_OP_LIST +#include "paddle/infrt/dialect/pd/ir/pd_ops.cpp.inc" // NOLINT + , +#define GET_OP_LIST +#include "paddle/infrt/dialect/pd/ir/pd_extra_ops.cpp.inc" // NOLINT + >(); +} + +mlir::Operation *PaddleDialect::materializeConstant(mlir::OpBuilder &builder, + mlir::Attribute value, + mlir::Type type, + mlir::Location loc) { + return builder.create(loc, value); +} + +void ConstantOp::build(mlir::OpBuilder &builder, + mlir::OperationState &state, + mlir::Attribute value) { + if (auto elem_attr = value.dyn_cast()) { + return ConstantOp::build(builder, state, elem_attr); + } else if (value.isa()) { + mlir::ShapedType type = + mlir::RankedTensorType::get(/*shape=*/{}, value.getType()); + state.addAttribute("value", mlir::DenseElementsAttr::get(type, value)); + state.addTypes(type); + return; + } + llvm_unreachable("unsupported attribute type for building pd.constant"); +} + +mlir::LogicalResult ConstantOp::inferReturnTypes( + mlir::MLIRContext *context, + mlir::Optional location, + mlir::ValueRange operands, + mlir::DictionaryAttr attributes, + mlir::RegionRange regions, + llvm::SmallVectorImpl &inferredReturnTypes) { + inferredReturnTypes.push_back(attributes.get("value").getType()); + return mlir::success(); +} +mlir::OpFoldResult ConstantOp::fold( + ::llvm::ArrayRef operands) { + return value(); +} +} // namespace pd +} // namespace infrt diff --git a/paddle/infrt/dialect/pd/ir/pd_ops.h b/paddle/infrt/dialect/pd/ir/pd_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..8383ff6ed8201c4f8948ebaa4effaac3d783cc52 --- /dev/null +++ b/paddle/infrt/dialect/pd/ir/pd_ops.h @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +//===----------------------------------------------------------------------===// +// Dialect +//===----------------------------------------------------------------------===// +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" +#include "paddle/infrt/dialect/pd/ir/pd_opsDialect.h.inc" +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/pd/ir/pd_ops.h.inc" +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/pd/ir/pd_extra_ops.hpp.inc" diff --git a/paddle/infrt/dialect/pd/pass/CMakeLists.txt b/paddle/infrt/dialect/pd/pass/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..827df597b76e2ec5b4cf639c984a425f9be8b6c9 --- /dev/null +++ b/paddle/infrt/dialect/pd/pass/CMakeLists.txt @@ -0,0 +1,8 @@ + +core_gather_headers() + +gather_srcs(infrt_src SRCS + pd_op_fuse_pass.cc + ) + +mlir_add_rewriter(pd_op_fuse) diff --git a/paddle/infrt/dialect/rewrite.td b/paddle/infrt/dialect/pd/pass/pd_op_fuse.td similarity index 95% rename from paddle/infrt/dialect/rewrite.td rename to paddle/infrt/dialect/pd/pass/pd_op_fuse.td index 5e228fed4d57eb283705c725797c42c5da133c3f..f5a8ea78d7d9da5cc70b50d31836b4f4933d5853 100644 --- a/paddle/infrt/dialect/rewrite.td +++ b/paddle/infrt/dialect/pd/pass/pd_op_fuse.td @@ -1,10 +1,10 @@ #ifndef INFRT_REWRITE #define INFRT_REWRITE -include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" include "mlir/Interfaces/SideEffectInterfaces.td" -include "paddle/infrt/dialect/pd_ops.td" -include "paddle/infrt/dialect/pd_extra_ops.td" +include "paddle/infrt/dialect/pd/ir/pd_ops.td" +include "paddle/infrt/dialect/pd/ir/pd_extra_ops.td" //===----------------------------------------------------------------------===// // This is to fuse the composition: 'Matmul o ElementwiseAdd' into 'PD_FusedFC'. diff --git a/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.cc b/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..8bdf957db27d8c2b20025931a76826628feddbdd --- /dev/null +++ b/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.h" // NOLINT + +#include +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" + +namespace { +#include "paddle/infrt/dialect/pd/pass/pd_op_fuse.cpp.inc" // NOLINT + +/* + * PdOpFusePass. + */ +struct PdOpFusePass + : public mlir::PassWrapper { + public: + ::llvm::StringRef getName() const override { return "PdOpFusePass"; } + + llvm::StringRef getArgument() const override { return "pd-op-fuse"; } + + void runOnFunction() override; +}; + +// Implementation of the PdOpFusePass. 
+void PdOpFusePass::runOnFunction() { + ::mlir::RewritePatternSet patterns(&getContext()); + populateWithGenerated(patterns); + (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)); +} + +} // namespace + +mlir::PassRegistration infrt_op_fuse_pass; diff --git a/paddle/infrt/dialect/pd_types.cc b/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.h similarity index 71% rename from paddle/infrt/dialect/pd_types.cc rename to paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.h index 94856e362d301978970279846907f41dfbc00b56..854545ab1a2638224e16a300bfccb1f953f81c77 100644 --- a/paddle/infrt/dialect/pd_types.cc +++ b/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.h @@ -1,4 +1,4 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,4 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/infrt/dialect/pd_types.h" +#pragma once +#include + +namespace infrt { +/* + * PdOpFusePass. + */ +std::unique_ptr CreatePdOpFusePass(); + +} // namespace infrt diff --git a/paddle/infrt/dialect/pd_ops.cc b/paddle/infrt/dialect/pd_ops.cc deleted file mode 100644 index 55ab174fcaf059d81f83e54e8f1e5864ef25b7e3..0000000000000000000000000000000000000000 --- a/paddle/infrt/dialect/pd_ops.cc +++ /dev/null @@ -1,179 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/infrt/dialect/pd_ops.h" - -#include -#include -#include "paddle/infrt/dialect/infrt_base.h" - -#define GET_OP_CLASSES -#include "paddle/infrt/dialect/pd_ops.cpp.inc" // NOLINT -#define GET_OP_CLASSES -#include "paddle/infrt/dialect/pd_extra_ops.cpp.inc" // NOLINT - -namespace mlir { -namespace pd { - -#include "paddle/infrt/dialect/rewrite.cpp.inc" // NOLINT - -PaddleDialect::PaddleDialect(MLIRContext *context) - : Dialect("pd", context, TypeID::get()) { - addOperations< -#define GET_OP_LIST -#include "paddle/infrt/dialect/pd_ops.cpp.inc" // NOLINT - , -#define GET_OP_LIST -#include "paddle/infrt/dialect/pd_extra_ops.cpp.inc" // NOLINT - >(); -} - -mlir::Operation *PaddleDialect::materializeConstant(mlir::OpBuilder &builder, - mlir::Attribute value, - mlir::Type type, - mlir::Location loc) { - return builder.create(loc, value); -} - -void ConstantOp::build(OpBuilder &builder, - OperationState &state, - Attribute value) { - if (auto elem_attr = value.dyn_cast()) { - return ConstantOp::build(builder, state, elem_attr); - } else if (value.isa()) { - ShapedType type = RankedTensorType::get(/*shape=*/{}, value.getType()); - state.addAttribute("value", DenseElementsAttr::get(type, value)); - state.addTypes(type); - return; - } - llvm_unreachable("unsupported attribute type for building pd.constant"); -} - -LogicalResult ConstantOp::inferReturnTypes( - MLIRContext *context, - Optional location, - ValueRange operands, - DictionaryAttr attributes, - RegionRange regions, - SmallVectorImpl &inferredReturnTypes) { - inferredReturnTypes.push_back(attributes.get("value").getType()); - return success(); -} -mlir::OpFoldResult ConstantOp::fold( - ::llvm::ArrayRef operands) { - return value(); -} -/* -LogicalResult ElementwiseAdd::inferReturnTypes( - MLIRContext *context, - Optional location, - ValueRange operands, - DictionaryAttr attributes, - RegionRange regions, - SmallVectorImpl &inferredReturnTypes) { - inferredReturnTypes.push_back(operands[0].getType()); - return success(); -} -*/ - -void Elementwise_addOp::getCanonicalizationPatterns( - mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { - results.insert(context); -} - -/* -mlir::OpFoldResult ElementwiseAdd::fold( - llvm::ArrayRef operands) { - if (getElementTypeOrSelf(getType()).isa()) { - if (!operands[0] || !operands[1]) return {}; - DenseElementsAttr lhs = operands[0].dyn_cast(); - DenseElementsAttr rhs = operands[1].dyn_cast(); - if (!lhs || !rhs) return {}; - ShapedType type = getType().template cast(); - if (!type.hasStaticShape()) return {}; - Type etype = type.getElementType(); - if (!etype.isa()) return {}; - SmallVector values; - values.reserve(lhs.getNumElements()); - for (const auto zip : - llvm::zip(lhs.getValues(), rhs.getValues())) { - values.push_back( - std::plus()(std::get<0>(zip), std::get<1>(zip))); - } - return DenseElementsAttr::get(type, values); - } - return {}; -} - -LogicalResult ElementwiseDiv::inferReturnTypes( - MLIRContext *context, - Optional location, - ValueRange operands, - DictionaryAttr attributes, - RegionRange regions, - SmallVectorImpl &inferredReturnTypes) { - inferredReturnTypes.push_back(operands[0].getType()); - return success(); -} - -LogicalResult ElementwiseMul::inferReturnTypes( - MLIRContext *context, - Optional location, - ValueRange operands, - DictionaryAttr attributes, - RegionRange regions, - SmallVectorImpl &inferredReturnTypes) { - inferredReturnTypes.push_back(operands[0].getType()); - return success(); -} - -LogicalResult 
ElementwiseSub::inferReturnTypes( - MLIRContext *context, - Optional location, - ValueRange operands, - DictionaryAttr attributes, - RegionRange regions, - SmallVectorImpl &inferredReturnTypes) { - inferredReturnTypes.push_back(operands[0].getType()); - return success(); -} - -LogicalResult MulOp::inferReturnTypes( - MLIRContext *context, - Optional location, - ValueRange operands, - DictionaryAttr attributes, - RegionRange regions, - SmallVectorImpl &inferredReturnTypes) { - inferredReturnTypes.push_back(operands[0].getType()); - return success(); -} - -void ReluOp::getCanonicalizationPatterns( - mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { - results.insert(context); -} - -void FusedRepeatedFCRelu::getCanonicalizationPatterns( - mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { - results.insert(context); -} - -void BatchNormOp::getCanonicalizationPatterns( - mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { - results.insert(context); -}*/ - -} // namespace pd -} // namespace mlir diff --git a/paddle/infrt/dialect/pd_ops.h b/paddle/infrt/dialect/pd_ops.h deleted file mode 100644 index 41dd2ddd94eb161735568170a9a8bdc2ec259cdf..0000000000000000000000000000000000000000 --- a/paddle/infrt/dialect/pd_ops.h +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "paddle/infrt/dialect/infrt/infrt_dialect.h" - -namespace mlir { -namespace pd { - -class PaddleDialect : public Dialect { - public: - explicit PaddleDialect(MLIRContext* context); - - static StringRef getDialectNamespace() { return "pd"; } - - /// A hook used to materialize constant values with the given type. - Operation* materializeConstant(OpBuilder& builder, - Attribute value, - Type type, - Location loc) override; - - Type parseType(DialectAsmParser& parser) const override { - return Dialect::parseType(parser); - } - void printType(Type type, DialectAsmPrinter& printer) const override { - Dialect::printType(type, printer); - } -}; - -} // namespace pd -} // namespace mlir - -#define GET_OP_CLASSES -#include "paddle/infrt/dialect/pd_ops.hpp.inc" -#define GET_OP_CLASSES -#include "paddle/infrt/dialect/pd_extra_ops.hpp.inc" diff --git a/paddle/infrt/dialect/pd_types.h b/paddle/infrt/dialect/pd_types.h deleted file mode 100644 index 0da888a9c076922fc21d5cce004dc839bd705762..0000000000000000000000000000000000000000 --- a/paddle/infrt/dialect/pd_types.h +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// This file defines the types used in PaddlePaddle MLIR dialect. -// We borrowed much ideas from tensorflow mlir dialect (tf_types.h in -// tensorflow). - -#pragma once - -#include -#include -#include -#include -#include - -namespace mlir { -namespace PD { - -class PaddleType : public Type { - public: - using Type::Type; - - static bool classof(Type type); -}; - -namespace detail { - -template -class PaddleTypeImpl : public Type::TypeBase { - public: - using Base = typename Type::TypeBase; - using PDBase = PaddleTypeImpl; - using Base::Base; -}; - -} // namespace detail - -#define HANDLE_PD_TYPE(pdtype, enumerant, name) \ - class pdtype##Type : public detail::PaddleTypeImpl { \ - public: \ - using PDBase::PDBase; \ - }; - -} // namespace PD -} // namespace mlir diff --git a/paddle/infrt/dialect/phi/CMakeLists.txt b/paddle/infrt/dialect/phi/CMakeLists.txt index 4e73a533d99a79168b3e68b88d917f48ec811444..67f6bb8a2d7bbfa604614e4909169c08ea18e1b3 100644 --- a/paddle/infrt/dialect/phi/CMakeLists.txt +++ b/paddle/infrt/dialect/phi/CMakeLists.txt @@ -5,9 +5,6 @@ endif() add_subdirectory(ir) add_subdirectory(pass) -add_executable(phi-ir-exec phi_ir_exec.cc) -target_link_libraries(phi-ir-exec infrt) - add_executable(phi-exec phi_exec.cc) target_link_libraries(phi-exec infrt) diff --git a/paddle/infrt/dialect/phi/data_type.cc b/paddle/infrt/dialect/phi/data_type.cc index 5da7ec8831258e52dd157ff444ffcd6e7930e1bb..bbc296ea748a39472bb7f57b04e9159b5fbd89f1 100644 --- a/paddle/infrt/dialect/phi/data_type.cc +++ b/paddle/infrt/dialect/phi/data_type.cc @@ -16,7 +16,7 @@ namespace infrt { -phi::Backend cvtTarget2Phi(TargetType target) { +phi::Backend ConvertTargetToPhi(TargetType target) { switch (target) { case TargetType::CPU: return phi::Backend::CPU; @@ -27,7 +27,7 @@ phi::Backend cvtTarget2Phi(TargetType target) { } } -TargetType cvtTargetFromPhi(phi::Backend backend) { +TargetType ConvertTargetFromPhi(phi::Backend backend) { switch (backend) { case phi::Backend::CPU: return TargetType::CPU; @@ -38,7 +38,7 @@ TargetType cvtTargetFromPhi(phi::Backend backend) { } } -phi::DataType cvtPrecision2Phi(PrecisionType precision) { +phi::DataType ConvertPrecisionToPhi(PrecisionType precision) { #define CONVERT_PRECISION_TO_PHI(Precision) \ case PrecisionType::Precision: \ return phi::DataType::Precision; @@ -61,7 +61,7 @@ phi::DataType cvtPrecision2Phi(PrecisionType precision) { #undef CONVERT_PRECISION_TO_PHI } -PrecisionType cvtPrecisionFromPhi(phi::DataType datatype) { +PrecisionType ConvertPrecisionFromPhi(phi::DataType datatype) { #define CONVERT_PRECISION_FROM_PHI(Precision) \ case phi::DataType::Precision: \ return PrecisionType::Precision; @@ -84,7 +84,7 @@ PrecisionType cvtPrecisionFromPhi(phi::DataType datatype) { #undef CONVERT_PRECISION_FROM_PHI } -phi::DataLayout cvtLayout2Phi(LayoutType layout) { +phi::DataLayout ConvertLayoutToPhi(LayoutType layout) { switch (layout) { case LayoutType::NCHW: return phi::DataLayout::NCHW; @@ -97,7 +97,7 @@ phi::DataLayout cvtLayout2Phi(LayoutType layout) { } } -LayoutType cvtLayoutFromPhi(phi::DataLayout layout) { +LayoutType 
ConvertLayoutFromPhi(phi::DataLayout layout) { switch (layout) { case phi::DataLayout::NCHW: return LayoutType::NCHW; @@ -110,16 +110,16 @@ LayoutType cvtLayoutFromPhi(phi::DataLayout layout) { } } -phi::KernelKey cvtPlace2Phi(const Place& place) { - return phi::KernelKey(cvtTarget2Phi(place.target), - cvtLayout2Phi(place.layout), - cvtPrecision2Phi(place.precision)); +phi::KernelKey ConvertPlaceToPhi(const Place& place) { + return phi::KernelKey(ConvertTargetToPhi(place.target), + ConvertLayoutToPhi(place.layout), + ConvertPrecisionToPhi(place.precision)); } -Place cvtPlaceFromPhi(phi::TensorArgDef tensor_arg) { - return Place(cvtTargetFromPhi(tensor_arg.backend), - cvtPrecisionFromPhi(tensor_arg.dtype), - cvtLayoutFromPhi(tensor_arg.layout)); +Place ConvertPlaceFromPhi(phi::TensorArgDef tensor_arg) { + return Place(ConvertTargetFromPhi(tensor_arg.backend), + ConvertPrecisionFromPhi(tensor_arg.dtype), + ConvertLayoutFromPhi(tensor_arg.layout)); } } // namespace infrt diff --git a/paddle/infrt/dialect/phi/data_type.h b/paddle/infrt/dialect/phi/data_type.h index b618ef3861303334b697382f11bfa4fdb4a35c7a..bd258cb1038792e52667b0ef39c65b16c6210eb3 100644 --- a/paddle/infrt/dialect/phi/data_type.h +++ b/paddle/infrt/dialect/phi/data_type.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/infrt/dialect/infrt/common_type.h" +#include "paddle/infrt/dialect/infrt/common/types.h" #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/layout.h" @@ -23,16 +23,16 @@ namespace infrt { -phi::Backend cvtTarget2Phi(TargetType target); -TargetType cvtTargetFromPhi(phi::Backend backend); +phi::Backend ConvertTargetToPhi(TargetType target); +TargetType ConvertTargetFromPhi(phi::Backend backend); -phi::DataType cvtPrecision2Phi(PrecisionType precision); -PrecisionType cvtPrecisionFromPhi(phi::DataType datatype); +phi::DataType ConvertPrecisionToPhi(PrecisionType precision); +PrecisionType ConvertPrecisionFromPhi(phi::DataType datatype); -phi::DataLayout cvtLayout2Phi(LayoutType layout); -LayoutType cvtLayoutFromPhi(phi::DataLayout layout); +phi::DataLayout ConvertLayoutToPhi(LayoutType layout); +LayoutType ConvertLayoutFromPhi(phi::DataLayout layout); -phi::KernelKey cvtPlace2Phi(const Place& place); -Place cvtPlaceFromPhi(phi::TensorArgDef tensor_arg); +phi::KernelKey ConvertPlaceToPhi(const Place& place); +Place ConvertPlaceFromPhi(phi::TensorArgDef tensor_arg); } // namespace infrt diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_base.td b/paddle/infrt/dialect/phi/ir/infrt_phi_base.td index 671646b9259ccfd2399862d71d6860db93608eb8..5d7338ec4292ed49112c3cce45a30816e686886d 100644 --- a/paddle/infrt/dialect/phi/ir/infrt_phi_base.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_base.td @@ -2,7 +2,7 @@ #define PHI_BASE include "mlir/IR/OpBase.td" -include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" include "mlir/Interfaces/InferTypeOpInterface.td" def PHI_Dialect : Dialect { diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td b/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td index ee23470fc754a56ef323c167613f7f32982eedd8..d2ff7acfba8b26f5c0ca1ec459d3b5e2f7fb3d93 100644 --- a/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td @@ -3,7 +3,7 @@ include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/IR/OpBase.td" -include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" include 
"paddle/infrt/dialect/phi/ir/infrt_phi_base.td" def PHI_CPUKernelDialect : Dialect { diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td index 21c4669b645fb6c7622fb01ae1c7bacaee0f5ca2..1fda2d9d8886008c6415b5a1cf36d53c1500707a 100644 --- a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td @@ -5,7 +5,7 @@ include "paddle/infrt/dialect/phi/ir/infrt_phi_base.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/IR/OpBase.td" -include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" def PHI_DenseTensorDialect : Dialect { let name = "phi_dt"; @@ -21,8 +21,8 @@ def PHI_DenseTensorDialect : Dialect { class PDT_Op traits = []> : Op {} -class CreateDenseTensorOp - : PDT_Op<"create_dense_tensor", [NoSideEffect]> { +class CreateDenseTensorOp + : PDT_Op<"create_dense_tensor." # target, [NoSideEffect]> { let arguments = (ins Context:$context, I64ArrayAttr:$dims, LayoutAttr:$layout, I64ArrayAttr:$lod, PrecisionAttr:$precision); let results = (outs DenseTensor:$output); @@ -51,9 +51,11 @@ class CreateContextOp let results = (outs Context:$output); } -def PDT_CreateDenseTensorOp : CreateDenseTensorOp; +def PDT_CreateCPUDenseTensorOp : CreateDenseTensorOp<"cpu">; +def PDT_CreateGPUDenseTensorOp : CreateDenseTensorOp<"gpu">; def PDT_FillDenseTensorOp_f32 : FillDenseTensorOp; def PDT_CreateCPUContextOp : CreateContextOp<"cpu">; +def PDT_CreateGPUContextOp : CreateContextOp<"gpu">; def PDT_PrintDenseTensor : PrintDenseTensorOp; def FakeKernelOp : PDT_Op<"fake_phi_kernel"> { diff --git a/paddle/infrt/dialect/phi/ir/phi_base.cc b/paddle/infrt/dialect/phi/ir/phi_base.cc index d8095d7f3f13fcfbf9b2ccab6db182850633d632..f91381fe729034b3e2d36068dce43d531bfedc1c 100644 --- a/paddle/infrt/dialect/phi/ir/phi_base.cc +++ b/paddle/infrt/dialect/phi/ir/phi_base.cc @@ -29,6 +29,7 @@ namespace infrt { namespace phi { void PHIDialect::initialize() { + LOG(INFO) << "PHI Dialect initalized"; addOperations< #define GET_OP_LIST #include "paddle/infrt/dialect/phi/ir/infrt_phi_base.cpp.inc" // NOLINT diff --git a/paddle/infrt/dialect/phi/ir/phi_base.h b/paddle/infrt/dialect/phi/ir/phi_base.h index 0ea1973a7331b8a34bf2a286cb55e19a4d09118b..64cd08cc05ed42fe8d53b8c5b8a5bc994bae8824 100644 --- a/paddle/infrt/dialect/phi/ir/phi_base.h +++ b/paddle/infrt/dialect/phi/ir/phi_base.h @@ -18,7 +18,7 @@ #include #include -#include "paddle/infrt/dialect/infrt/common_type.h" +#include "paddle/infrt/dialect/infrt/common/types.h" #include "paddle/infrt/dialect/phi/ir/infrt_phi_baseDialect.h.inc" diff --git a/paddle/infrt/dialect/phi/ir/phi_kernels.h b/paddle/infrt/dialect/phi/ir/phi_kernels.h index b84d1b2b7294baf789fe4e1f3911edede8172cf7..4f8b41852cc67e32c510c247e907092046731452 100644 --- a/paddle/infrt/dialect/phi/ir/phi_kernels.h +++ b/paddle/infrt/dialect/phi/ir/phi_kernels.h @@ -30,7 +30,7 @@ #include #include "paddle/infrt/dialect/dense_tensor.h" -#include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/phi/ir/phi_base.h" #include "paddle/infrt/dialect/phi/ir/phi_cpu_kernelsDialect.h.inc" diff --git a/paddle/infrt/dialect/phi/pass/CMakeLists.txt b/paddle/infrt/dialect/phi/pass/CMakeLists.txt index 5c55a6b0acaed7be9ee86b4662d895d08ca05bdc..dc60ecf63fe2eaeffc11ac932e594274c01f8580 100644 --- a/paddle/infrt/dialect/phi/pass/CMakeLists.txt +++ b/paddle/infrt/dialect/phi/pass/CMakeLists.txt @@ 
-2,6 +2,8 @@ core_gather_headers() gather_srcs(infrt_src SRCS proto_arg_map_context.cc - phi_op_cvt_pass.cc + phi_op_convert_pass.cc kernel_op_desc.cc - ) + ) + +cc_test(test_kernel_op_desc SRCS kernel_op_desc_test.cc DEPS infrt) diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc index d1763897b4a1320179134ede14fe404aee4a6a76..a26e8e2dca57081d9935883bbe0f01188abf1f1b 100644 --- a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc @@ -73,14 +73,14 @@ std::string getPhiLayoutSuffix(LayoutType layout) { } } -std::vector getCandidateKernels( +std::vector GetCandidateKernels( std::string name, const std::vector& valid_palces) { std::vector candidate_kernels; PhiKernelDesc phi_kernel_desc; phi::KernelKeyMap kernel_key_map = phi::KernelFactory::Instance().SelectKernelMap(name); for (Place place : valid_palces) { - phi::KernelKey kernel_key = cvtPlace2Phi(place); + phi::KernelKey kernel_key = ConvertPlaceToPhi(place); if (kernel_key_map.find(kernel_key) == kernel_key_map.end()) { kernel_key = phi::KernelKey(kernel_key.backend(), phi::DataLayout::ALL_LAYOUT, @@ -88,19 +88,20 @@ std::vector getCandidateKernels( if (kernel_key_map.find(kernel_key) == kernel_key_map.end()) continue; place.layout = LayoutType::ANY; } - phi_kernel_desc.kernelType = place; - phi_kernel_desc.inputsType.clear(); - phi_kernel_desc.outputsType.clear(); + phi_kernel_desc.kernel_type = place; + phi_kernel_desc.input_types.clear(); + phi_kernel_desc.output_types.clear(); phi::KernelArgsDef args_def = kernel_key_map.at(kernel_key).args_def(); const paddle::SmallVector& input_arg = args_def.input_defs(); const paddle::SmallVector& output_arg = args_def.output_defs(); for (auto tensor_arg : input_arg) { - phi_kernel_desc.inputsType.emplace_back(cvtPlaceFromPhi(tensor_arg)); + phi_kernel_desc.input_types.emplace_back(ConvertPlaceFromPhi(tensor_arg)); } for (auto tensor_arg : output_arg) { - phi_kernel_desc.outputsType.emplace_back(cvtPlaceFromPhi(tensor_arg)); + phi_kernel_desc.output_types.emplace_back( + ConvertPlaceFromPhi(tensor_arg)); } candidate_kernels.emplace_back(phi_kernel_desc); } diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.h b/paddle/infrt/dialect/phi/pass/kernel_op_desc.h index 34fd2f0f62dcd9b793f9157003bfd3772d0e1307..cdc8f7cbff553687bed63d165b18c4bc8efdb807 100644 --- a/paddle/infrt/dialect/phi/pass/kernel_op_desc.h +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.h @@ -16,21 +16,21 @@ #include #include -#include "paddle/infrt/dialect/infrt/common_type.h" +#include "paddle/infrt/dialect/infrt/common/types.h" namespace infrt { struct PhiKernelDesc { - std::vector inputsType; // kernel input place - std::vector outputsType; // kernel output place - Place kernelType; // kernel place + std::vector input_types; // kernel input place + std::vector output_types; // kernel output place + Place kernel_type; // kernel place }; std::string getPhiTargetPrefix(TargetType target); std::string getPhiPrecisionSuffix(PrecisionType precision); std::string getPhiLayoutSuffix(LayoutType layout); -std::vector getCandidateKernels( +std::vector GetCandidateKernels( std::string name, const std::vector& valid_palces); } // namespace infrt diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc b/paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..bd5f0799a60d5d3925e1e1265997820c37b438e6 --- /dev/null +++ 
b/paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h" +#include "paddle/phi/kernels/declarations.h" + +namespace infrt { + +TEST(phi, get_op_desc) { + std::vector places; + places.emplace_back( + TargetType::CPU, PrecisionType::FLOAT32, LayoutType::NCHW); + auto kernels = GetCandidateKernels("addmm", places); + ASSERT_GE(kernels.size(), 1UL); +} + +} // namespace infrt diff --git a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..13cba6eeabb669cf93deb9a37d87d2ddff66e5c0 --- /dev/null +++ b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc @@ -0,0 +1,268 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/infrt/common/string.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" +#include "paddle/infrt/dialect/phi/ir/phi_base.h" +#include "paddle/infrt/dialect/phi/ir/phi_kernels.h" +#include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h" +#include "paddle/infrt/dialect/phi/pass/proto_arg_map_context.h" +#include "paddle/phi/core/compat/op_utils.h" +#include "paddle/phi/core/kernel_factory.h" +#include "paddle/phi/ops/compat/signatures.h" + +namespace { +class PhiOpConvertPass + : public mlir::PassWrapper { + public: + ::llvm::StringRef getName() const override { return "PhiOpConvertPass"; } + void runOnFunction() override; + PhiOpConvertPass(); + explicit PhiOpConvertPass(const std::vector &valid_places) + : valid_places_(valid_places) {} + + PhiOpConvertPass(const PhiOpConvertPass &other) + : mlir::PassWrapper(*this), + valid_places_(other.valid_places_) {} + + ::llvm::StringRef getArgument() const override { return "phi-op-convert"; } + void getDependentDialects(mlir::DialectRegistry ®istry) const override; + + private: + void convertStage(); + void dispatchStage(); + + // Force a specified data format for all layout sensitive operations. 
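+  // The flag is registered below as "valid-targets"; its description suggests
+  // values of the form CPU-FP32-NCHW. Command-line parsing of the flag is not
+  // wired up yet: the default constructor falls back to a CPU-FP32-NCHW place
+  // and LOG(FATAL)s if the option is actually set (see PhiOpConvertPass()
+  // later in this file).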
+ Option valid_places_options_{ + *this, + "valid-targets", + llvm::cl::desc("Set the valid target, [CPU-FP32-NCHW]")}; + + std::vector valid_places_; +}; +// Implementation of the PhiOpConvertPass. +void PhiOpConvertPass::runOnFunction() { + convertStage(); + dispatchStage(); +} + +void PhiOpConvertPass::convertStage() { + mlir::Block &body = getFunction().front(); + std::vector worklist; + for (auto &op : body.without_terminator()) { + worklist.push_back(&op); + } + mlir::OpBuilder builder(&body, body.begin()); + while (!worklist.empty()) { + auto *op = worklist.back(); + worklist.pop_back(); + if (!op) continue; + + auto op_name = op->getName().getIdentifier().str(); + + // only convert op in pd dialect. + if (op_name.substr(0, 3) != "pd.") continue; + op_name = op_name.substr(3); + if (pd_dialect_inputs_info_map_.find(op_name) == + pd_dialect_inputs_info_map_.end() || + pd_dialect_outputs_info_map_.find(op_name) == + pd_dialect_outputs_info_map_.end()) { + LOG(WARNING) << "No op info found for " << op_name; + // Todo: print log + continue; + } + auto loc = getFunction().getLoc(); + builder.setInsertionPoint(op); + if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(op_name)) { + std::string kernel_name = phi::TransToPhiKernelName(op_name); + auto kernel_op = builder.create(loc, + op->getResultTypes(), + op->getOperands(), + kernel_name, + op->getAttrDictionary()); + op->replaceAllUsesWith(kernel_op.getResults()); + } else { + ::phi::KernelSignature kernel_sign = + ::phi::OpUtilsMap::Instance().GetArgumentMappingFn(op_name)( + infrt::ProtoArgumentMappingContext(op)); + // resort input&output according to kernel_sign + ::llvm::SmallVector inputs, ori_output; + ::llvm::SmallVector output_types; + for (const std::string &str : std::get<0>(kernel_sign.args)) { + if (pd_dialect_inputs_info_map_.at(op_name).count(str) == 0) { + LOG(ERROR) << "No input info for Op " << op_name << " and argument " + << str; + return; + } + uint8_t index = pd_dialect_inputs_info_map_.at(op_name).at(str); + inputs.push_back(op->getOperands()[index]); + } + + for (const std::string &str : std::get<2>(kernel_sign.args)) { + if (pd_dialect_outputs_info_map_.at(op_name).count(str) == 0) { + LOG(ERROR) << "No output info for Op " << op_name << " and argument " + << str; + return; + } + uint8_t index = pd_dialect_outputs_info_map_.at(op_name).at(str); + output_types.push_back(op->getResultTypes()[index]); + ori_output.push_back(op->getResult(index)); + } + auto kernel_op = builder.create( + loc, output_types, inputs, kernel_sign.name, op->getAttrDictionary()); + for (size_t index = 0; index < ori_output.size(); ++index) { + ori_output[index].replaceAllUsesWith(kernel_op.getResult(index)); + } + } + CHECK(op->use_empty()); + op->erase(); + } +} + +void PhiOpConvertPass::dispatchStage() { + std::vector worklist; + mlir::Block &block = getFunction().front(); + for (auto &op : block) { + infrt::KernelOp kernel_op = ::llvm::dyn_cast_or_null(&op); + if (nullptr != kernel_op) worklist.push_back(kernel_op); + } + + mlir::OpBuilder builder(&block, block.begin()); + std::map phi_context; + for (infrt::KernelOp kernel_op : worklist) { + std::string kernel_name = kernel_op.name().str(); + std::vector candidates = + GetCandidateKernels(kernel_name, valid_places_); + if (candidates.empty()) { + LOG(FATAL) << "No candidate kernels for op:" << kernel_name; + continue; + } + builder.setInsertionPoint(kernel_op); + + // Todo: Implimentation the concrete pass pick strategy + const infrt::PhiKernelDesc &phi_kernel_desc = 
candidates.front(); + + kernel_name = + infrt::getPhiTargetPrefix(phi_kernel_desc.kernel_type.target) + + kernel_name + + infrt::getPhiPrecisionSuffix(phi_kernel_desc.kernel_type.precision) + + infrt::getPhiLayoutSuffix(phi_kernel_desc.kernel_type.layout); + + mlir::OperationName operation_name(kernel_name, kernel_op.getContext()); + mlir::OperationState operation_state(kernel_op.getLoc(), operation_name); + + if (phi_context.find(phi_kernel_desc.kernel_type.target) == + phi_context.end()) { + switch (phi_kernel_desc.kernel_type.target) { + case infrt::TargetType::CPU: { + auto context_value = + builder + .create( + kernel_op.getLoc(), + infrt::phi::ContextType::get(kernel_op.getContext(), + infrt::TargetType::CPU)) + .output(); + phi_context[infrt::TargetType::CPU] = context_value; + } break; + case infrt::TargetType::GPU: + case infrt::TargetType::UNK: + default: + LOG(FATAL) << "Unsupported TargetType"; + break; + } + } + operation_state.addOperands( + phi_context.at(phi_kernel_desc.kernel_type.target)); + + for (size_t index = 0; index < phi_kernel_desc.input_types.size(); + ++index) { + mlir::Value input = kernel_op.getOperand(index); + auto cvt_tensor_type_op = builder.create( + kernel_op.getLoc(), + infrt::DenseTensorType::get( + kernel_op.getContext(), + phi_kernel_desc.input_types[index].target, + phi_kernel_desc.input_types[index].precision, + phi_kernel_desc.input_types[index].layout), + input); + operation_state.addOperands(cvt_tensor_type_op.output()); + } + + for (size_t index = 0; index < phi_kernel_desc.output_types.size(); + ++index) { + operation_state.addTypes(infrt::DenseTensorType::get( + kernel_op.getContext(), + phi_kernel_desc.output_types[index].target, + phi_kernel_desc.output_types[index].precision, + phi_kernel_desc.output_types[index].layout)); + } + operation_state.addAttributes(kernel_op.attrsAttr().getValue()); + mlir::Operation *phi_operation = builder.createOperation(operation_state); + for (size_t index = 0; index < phi_kernel_desc.output_types.size(); + ++index) { + mlir::Value input = phi_operation->getResult(index); + auto cvt_tensor_type_op = builder.create( + kernel_op.getLoc(), kernel_op.getResultTypes()[index], input); + kernel_op.getResult(index).replaceAllUsesWith( + cvt_tensor_type_op.output()); + } + kernel_op.erase(); + } +} + +PhiOpConvertPass::PhiOpConvertPass() { + if (!valid_places_options_.hasValue()) { + valid_places_.emplace_back(infrt::TargetType::CPU, + infrt::PrecisionType::FLOAT32, + infrt::LayoutType::NCHW); + return; + } + + LOG(FATAL) << "To be done for specifying places in command line"; +} + +void PhiOpConvertPass::getDependentDialects( + mlir::DialectRegistry ®istry) const { + registry.insert(); + registry.insert(); + registry.insert(); + registry.insert(); + registry.insert(); +} + +} // namespace + +mlir::PassRegistration phi_op_convert; + +std::unique_ptr infrt::createPhiOpCvtPass( + std::vector valid_places) { + return std::make_unique(valid_places); +} + +std::unique_ptr infrt::createPhiOpCvtPass() { + return std::make_unique(); +} diff --git a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..5a2c0ee96ed0de120f8667d8f2fb91314c02e9ac --- /dev/null +++ b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h @@ -0,0 +1,28 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/infrt/dialect/infrt/common/types.h" + +namespace infrt { +/* + * phiOpCvtPass. + * Convert the general operators from pd Dialect to phi dialect. + */ +std::unique_ptr createPhiOpCvtPass(std::vector valid_places); + +std::unique_ptr createPhiOpCvtPass(); + +} // namespace infrt diff --git a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc b/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc deleted file mode 100644 index fb00a3de3fc0c82dce2489c0f412c64118e3101e..0000000000000000000000000000000000000000 --- a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc +++ /dev/null @@ -1,188 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/infrt/dialect/infrt/infrt_dialect.h" -#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" -#include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h" -#include "paddle/infrt/dialect/phi/pass/proto_arg_map_context.h" -#include "paddle/phi/core/compat/op_utils.h" -#include "paddle/phi/ops/compat/signatures.h" -namespace infrt { -// Implementation of the phiOpCvtPass. -void phiOpCvtPass::runOnFunction() { - convertStage(); - diapatchStage(); -} -void phiOpCvtPass::convertStage() { - mlir::Block &body = getFunction().front(); - std::vector worklist; - for (auto &op : body.without_terminator()) { - worklist.push_back(&op); - } - mlir::OpBuilder builder(&body, body.begin()); - while (!worklist.empty()) { - auto *op = worklist.back(); - worklist.pop_back(); - if (op == nullptr) continue; - - std::string op_name = op->getName().getIdentifier().str(); - - // only convert op in pd dialect. 
- if (op_name.substr(0, 3) != "pd.") continue; - op_name = op_name.substr(3); - if (pd_dialect_inputs_info_map_.find(op_name) == - pd_dialect_inputs_info_map_.end() || - pd_dialect_outputs_info_map_.find(op_name) == - pd_dialect_outputs_info_map_.end()) { - // Todo: print log - continue; - } - - ::phi::KernelSignature kernel_sign = - ::phi::OpUtilsMap::Instance().GetArgumentMappingFn(op_name)( - ProtoArgumentMappingContext(op)); - // resort input&output according to kernel_sign - ::llvm::SmallVector inputs, ori_output; - ::llvm::SmallVector output_types; - for (const std::string &str : std::get<0>(kernel_sign.args)) { - if (pd_dialect_inputs_info_map_.at(op_name).count(str) == 0) { - // Todo: print error log - return; - } - uint8_t index = pd_dialect_inputs_info_map_.at(op_name).at(str); - inputs.push_back(op->getOperands()[index]); - } - - for (const std::string &str : std::get<2>(kernel_sign.args)) { - if (pd_dialect_outputs_info_map_.at(op_name).count(str) == 0) { - // Todo: print error log - return; - } - uint8_t index = pd_dialect_outputs_info_map_.at(op_name).at(str); - output_types.push_back(op->getResultTypes()[index]); - ori_output.push_back(op->getResult(index)); - } - - auto loc = getFunction().getLoc(); - builder.setInsertionPoint(op); - auto kernel_op = builder.create( - loc, output_types, inputs, kernel_sign.name, op->getAttrDictionary()); - for (size_t index = 0; index < ori_output.size(); ++index) { - ori_output[index].replaceAllUsesWith(kernel_op.getResult(index)); - } - if (!op->use_empty()) { - // Todo: print error log - return; - } - op->erase(); - } -} -void phiOpCvtPass::diapatchStage() { - std::vector worklist; - mlir::Block &block = getFunction().front(); - for (auto &op : block) { - infrt::KernelOp kernel_op = ::llvm::dyn_cast_or_null(&op); - if (nullptr != kernel_op) worklist.push_back(kernel_op); - } - - mlir::OpBuilder builder(&block, block.begin()); - std::map phi_context; - for (infrt::KernelOp kernel_op : worklist) { - std::string kernel_name = kernel_op.name().str(); - std::vector candidates = - getCandidateKernels(kernel_name, valid_places_); - if (candidates.empty()) { - LOG(FATAL) << "No candidate kernels for op:" << kernel_name; - continue; - } - builder.setInsertionPoint(kernel_op); - - // Todo: Implimentation the concrete pass pick strategy - const PhiKernelDesc &phi_kernel_desc = candidates.front(); - - kernel_name = getPhiTargetPrefix(phi_kernel_desc.kernelType.target) + - kernel_name + - getPhiPrecisionSuffix(phi_kernel_desc.kernelType.precision) + - getPhiLayoutSuffix(phi_kernel_desc.kernelType.layout); - - mlir::OperationName operation_name(kernel_name, kernel_op.getContext()); - mlir::OperationState operation_state(kernel_op.getLoc(), operation_name); - - if (phi_context.find(phi_kernel_desc.kernelType.target) == - phi_context.end()) { - switch (phi_kernel_desc.kernelType.target) { - case TargetType::CPU: { - auto context_value = - builder - .create( - kernel_op.getLoc(), - phi::ContextType::get(kernel_op.getContext(), - TargetType::CPU)) - .output(); - phi_context[TargetType::CPU] = context_value; - } break; - case TargetType::GPU: - case TargetType::UNK: - default: - LOG(FATAL) << "Unsupported TargetType"; - break; - } - } - operation_state.addOperands( - phi_context.at(phi_kernel_desc.kernelType.target)); - for (size_t index = 0; index < phi_kernel_desc.inputsType.size(); ++index) { - mlir::Value input = kernel_op.getOperand(index); - auto cvt_tensor_type_op = builder.create( - kernel_op.getLoc(), - DenseTensorType::get(kernel_op.getContext(), 
- phi_kernel_desc.inputsType[index].target, - phi_kernel_desc.inputsType[index].precision, - phi_kernel_desc.inputsType[index].layout), - input); - operation_state.addOperands(cvt_tensor_type_op.output()); - } - for (size_t index = 0; index < phi_kernel_desc.outputsType.size(); - ++index) { - operation_state.addTypes( - DenseTensorType::get(kernel_op.getContext(), - phi_kernel_desc.outputsType[index].target, - phi_kernel_desc.outputsType[index].precision, - phi_kernel_desc.outputsType[index].layout)); - } - operation_state.addAttributes(kernel_op.attrsAttr().getValue()); - mlir::Operation *phi_operation = builder.createOperation(operation_state); - for (size_t index = 0; index < phi_kernel_desc.outputsType.size(); - ++index) { - mlir::Value input = phi_operation->getResult(index); - auto cvt_tensor_type_op = builder.create( - kernel_op.getLoc(), kernel_op.getResultTypes()[index], input); - kernel_op.getResult(index).replaceAllUsesWith( - cvt_tensor_type_op.output()); - } - kernel_op.erase(); - } -} -} // namespace infrt diff --git a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h b/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h deleted file mode 100644 index 051fee9b61a24772ff2295280fa1b0a1588d7bae..0000000000000000000000000000000000000000 --- a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include "paddle/infrt/dialect/infrt/common_type.h" - -namespace infrt { -/* - * phiOpCvtPass. - * - * Convert the general operators in pd Dialect to a infrt.kernelOp. - * - * source func: - * - * func @main() -> tensor { - * %a = "pd.feed"()... - * %c = "pd.conv2d"(%a) ... - * %d = "pd.conv3d"(%c) ... - * %f = "pd.conv2d"(%a) ... - * "pd.fetch" (%d, %f) - * } - * - * destination func: - * func @main() -> tensor { - * %a = "pd.feed"()... - * %c = "infrt.kernel"(%a){name = "conv2d"} ... - * %d = "infrt.kernel"(%c){name = "conv3d"}... - * %f = "infrt.kernel"(%a){name = "conv2d"}... 
- * "pd.fetch" (%d, %f) - * } - */ -class phiOpCvtPass - : public mlir::PassWrapper { - public: - ::llvm::StringRef getName() const override { return "phiOpCvtPass"; } - void runOnFunction() override; - explicit phiOpCvtPass(std::vector valid_places = std::vector()) - : valid_places_(valid_places) {} - - private: - void convertStage(); - void diapatchStage(); - std::vector valid_places_; -}; -} // namespace infrt diff --git a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc index 64b184359700ee2625e3c61d21617619a50771e3..1cd5b5a85511fe20e8029185caf4c93d95979b72 100644 --- a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc +++ b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc @@ -60,6 +60,10 @@ bool ProtoArgumentMappingContext::IsSelectedRowsInput( const std::string& name) const { return false; } +bool ProtoArgumentMappingContext::IsDenseTensorVectorInput( + const std::string& name) const { + return false; +} bool ProtoArgumentMappingContext::IsDenseTensorOutput( const std::string& name) const { diff --git a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h index e4e9b5c3ff8a15dbe00dc1bd57fdce1a087437d8..5cf2ef979076d697f1991ad33cd38c36dda16cab 100644 --- a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h +++ b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h @@ -16,7 +16,7 @@ limitations under the License. */ #include #include -#include "paddle/infrt/dialect/pd_ops_info.h" +#include "paddle/infrt/dialect/pd/common/pd_ops_info.h" #include "paddle/phi/core/compat/arg_map_context.h" namespace infrt { @@ -42,6 +42,7 @@ class ProtoArgumentMappingContext : public ::phi::ArgumentMappingContext { bool IsDenseTensorInput(const std::string& name) const override; bool IsSelectedRowsInput(const std::string& name) const override; + bool IsDenseTensorVectorInput(const std::string& name) const override; bool IsDenseTensorOutput(const std::string& name) const override; bool IsSelectedRowsOutput(const std::string& name) const override; diff --git a/paddle/infrt/dialect/phi/phi_ir_exec.cc b/paddle/infrt/dialect/phi/phi_ir_exec.cc index 559fb90a64a7868c9c150e12e881d73df7a4aaf2..0beb5bff29f6df73be75a18611a5207bb1e3aad7 100644 --- a/paddle/infrt/dialect/phi/phi_ir_exec.cc +++ b/paddle/infrt/dialect/phi/phi_ir_exec.cc @@ -18,7 +18,7 @@ #include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h" #include "paddle/infrt/dialect/mlir_loader.h" -#include "paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h" +#include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h" int main(int argc, char** argv) { static llvm::cl::opt input_file( @@ -38,7 +38,7 @@ int main(int argc, char** argv) { std::vector valid_places = {{infrt::TargetType::CPU, infrt::PrecisionType::FLOAT32, infrt::LayoutType::NCHW}}; - phi_pass_manager.addPass(std::make_unique(valid_places)); + phi_pass_manager.addPass(infrt::createPhiOpCvtPass(valid_places)); phi_pass_manager.addPass(infrt::createInfrtOpFusePass()); if (mlir::failed(pm.run(*module))) { std::cout << "\npass failed!\n" << std::endl; diff --git a/paddle/infrt/dialect/print_ir.cc b/paddle/infrt/dialect/print_ir.cc index a37df265955e70cdf735f251bc8853c7ad4fe831..b118a5f7a9caf42f4aa63dd0222e7a2647addac5 100644 --- a/paddle/infrt/dialect/print_ir.cc +++ b/paddle/infrt/dialect/print_ir.cc @@ -31,7 +31,7 @@ #include #include "paddle/infrt/common/global.h" -#include "paddle/infrt/dialect/init_infrt_dialects.h" 
+#include "paddle/infrt/dialect/init_dialects.h" namespace cl = llvm::cl; diff --git a/paddle/infrt/dialect/tensor_shape.td b/paddle/infrt/dialect/tensor_shape.td index d3714c8ed14d3f1aea50ec4c55a9c4c2fb85e958..2be21d6aa772020519e3d909c9bdf7232f7ff985 100644 --- a/paddle/infrt/dialect/tensor_shape.td +++ b/paddle/infrt/dialect/tensor_shape.td @@ -2,7 +2,7 @@ #else #define INFRT_OPS -include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" include "paddle/infrt/dialect/tensor_shape_base.td" include "mlir/Interfaces/SideEffectInterfaces.td" diff --git a/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td b/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td index 68ca1559acee03580eea0842bfbac3593d418c02..6467c1285f85e0c8bfca7b873ce64a09a52074ff 100644 --- a/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td +++ b/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td @@ -2,8 +2,8 @@ #define PD_LOWER_TO_TRT include "mlir/Interfaces/SideEffectInterfaces.td" -include "paddle/infrt/dialect/infrt_base.td" -include "paddle/infrt/dialect/pd_ops.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" +include "paddle/infrt/dialect/pd/ir/pd_ops.td" include "paddle/infrt/dialect/tensorrt/trt_ops.td" def PD2TRT_Matmul_Lower : Pat< diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc index ad6b136463a71dcc2fcd9ce2b4e2da6f68e88dd2..0878163a955af236c6a40f60850e9e5cad67b2aa 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc @@ -17,11 +17,12 @@ #include #include #include -#include #include #include #include +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" + namespace infrt { namespace trt { namespace { @@ -54,8 +55,8 @@ bool reverseDfs(std::vector source, // merge the first&second graph op to a new graph op. void mergeTwoAdjacentGraphOp(mlir::OpBuilder &builder, // NOLINT - mlir::pd::GraphOp first, - mlir::pd::GraphOp second) { + infrt::pd::GraphOp first, + infrt::pd::GraphOp second) { // comput inputs and outputs ::llvm::SmallVector inputs(first.getOperands()), outputs; for (mlir::Value input : second.getOperands()) { @@ -84,7 +85,7 @@ void mergeTwoAdjacentGraphOp(mlir::OpBuilder &builder, // NOLINT // create the new graph op builder.setInsertionPoint(first); auto loc = first.getLoc(); - auto graph_op = builder.create(loc, return_types, inputs); + auto graph_op = builder.create(loc, return_types, inputs); mlir::Block *block = new mlir::Block; auto copy_range = second.getBody()->without_terminator(); block->getOperations().splice(block->begin(), @@ -149,13 +150,13 @@ void TRTGraphFusePass::runOnFunction() { do { changed = false; for (auto &op : body) { - mlir::pd::GraphOp graph_op = - ::llvm::dyn_cast_or_null(&op); + infrt::pd::GraphOp graph_op = + ::llvm::dyn_cast_or_null(&op); if (nullptr == graph_op) continue; for (auto user_op : op.getUsers()) { - mlir::pd::GraphOp user_graph_op = - ::llvm::dyn_cast_or_null(user_op); + infrt::pd::GraphOp user_graph_op = + ::llvm::dyn_cast_or_null(user_op); if (nullptr == user_graph_op) continue; // get all dst input nodes except src. 
std::vector source_nodes; diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h index 803e53e3244f92134928e1105a8248e9f49e5432..18afba19e06189294078bcfc1a0b2bb341eb7126 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h @@ -14,7 +14,6 @@ #pragma once #include -#include "paddle/infrt/dialect/infrt_base.h" namespace infrt { namespace trt { @@ -28,17 +27,17 @@ namespace trt { * func @main(%a : tensor) -> tensor { * %c = "pd.graph"(%a) { * %m = "pd.conv2d"(%a)... - * "infrt.return" (%m) + * infrt.return %m... * } ... * %d = "pd.graph"(%c) { * %m = "pd.conv3d"(%c)... - * "infrt.return" (%m) + * infrt.return %m... * } ... * %f = "pd.graph"(%a) { * %m = "pd.conv2d"(%a)... - * "infrt.return" (%m) + * infrt.return %m... * } ... - * "infrt.return" (%d, %f).. + * infrt.return %d, %f :... * } * * destination func: @@ -47,9 +46,9 @@ namespace trt { * %m = "pd.conv2d"(%a)... * %n = "pd.conv3d"(%m)... * %s = "pd.conv2d"(%a)... - * "infrt.return" (%n, %s) + * infrt.return %n, %s:... * } ... - * "infrt.return" (%d, %f) + * infrt.return %d, %f:... * } */ class TRTGraphFusePass diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc index e3a7b455024c65d40ccbafb28fba9e9b0ead0369..ade61bfc370f550cf85267b3088d697bf1bea997 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc @@ -15,24 +15,24 @@ #include "paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h" #include -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" namespace infrt { namespace trt { // Implementation of the trtGraphSplitPass。 void TRTGraphSplitPass::runOnFunction() { - std::vector worklist; + std::vector worklist; mlir::Block& block = getFunction().front(); for (auto& op : block) { - mlir::pd::GraphOp graph_op = - ::llvm::dyn_cast_or_null(&op); + infrt::pd::GraphOp graph_op = + ::llvm::dyn_cast_or_null(&op); if (nullptr != graph_op && graph_op.getBody()->getOperations().size() <= min_subgraph_size_) { worklist.push_back(graph_op); } } while (!worklist.empty()) { - mlir::pd::GraphOp graph_op = worklist.back(); + infrt::pd::GraphOp graph_op = worklist.back(); worklist.pop_back(); mlir::Block* body = graph_op.getBody(); auto return_op = body->getTerminator(); diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h index 1c44a13cf9dfb65a1747a596dc1012e7f54d792e..a5dd4f14b2946fe232b7b725f6ace7caf74ff4d4 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h @@ -14,7 +14,6 @@ #pragma once #include -#include "paddle/infrt/dialect/infrt_base.h" namespace infrt { namespace trt { @@ -31,9 +30,9 @@ namespace trt { * %m = "pd.conv2d"(%a)... * %n = "pd.conv3d"(%m)... * %s = "pd.conv2d"(%a)... - * "infrt.return" (%n, %s)... + * infrt.return %n, %s : ... * } ... - * "infrt.return" (%d, %f)... + * infrt.return %d, %f : ... * } * * destination func: @@ -41,7 +40,7 @@ namespace trt { * %c = "pd.conv2d"(%a) ... * %d = "pd.conv3d"(%c) ... * %f = "pd.conv2d"(%a) ... - * "infrt.return" (%d, %f)... + * infrt.return %d, %f:... 
* } */ class TRTGraphSplitPass diff --git a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc index 1be5f4dbc39d7699b6d8a36cfb3e164694e908c1..19c6b13e971ec779ed178413ca08b42b23dc71d1 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc @@ -14,8 +14,7 @@ #include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h" #include #include -#include "paddle/infrt/dialect/infrt_base.h" -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" #include "paddle/infrt/dialect/tensorrt/trt_dialect_types.h" namespace infrt { @@ -24,11 +23,11 @@ namespace trt { #include "paddle/infrt/dialect/tensorrt/pd_lower_to_trt.cpp.inc" // NOLINT struct PD2TRT_GraphLower : public ::mlir::RewritePattern { - PD2TRT_GraphLower(::mlir::MLIRContext *context) + explicit PD2TRT_GraphLower(::mlir::MLIRContext *context) : ::mlir::RewritePattern("pd.graph", 1, context, {"trt.create_engine"}) {} ::mlir::LogicalResult matchAndRewrite( ::mlir::Operation *op, ::mlir::PatternRewriter &rewriter) const override { - auto casted_op = ::llvm::dyn_cast(op); + auto casted_op = ::llvm::dyn_cast(op); ::mlir::Operation::operand_range inputs = casted_op.inputs(); auto ods_loc = rewriter.getFusedLoc(op->getLoc()); CreateEngineOp create_engine_op; diff --git a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h index 7550d8c84e19504fc0f41067c1194703a55410ba..ede64f8bcd556a73b779fc3b772bf3fa8f74eaf9 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h @@ -15,7 +15,7 @@ #pragma once #include "mlir/IR/Dialect.h" #include "mlir/Pass/Pass.h" -#include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/tensorrt/trt_ops.h" namespace infrt { @@ -29,9 +29,9 @@ namespace trt { * %m = "pd.conv2d"(%a)... * %n = "pd.conv3d"(%m)... * %s = "pd.conv2d"(%a)... - * "infrt.return" (%n, %s)... + * infrt.return %n, %s:... * } ... - * "infrt.return" (%d, %f)... + * infrt.return %d, %f:... * } * * destination ir: @@ -40,10 +40,10 @@ namespace trt { * %m = "trt.Convolution"(%a)... * %n = "trt.Convolution"(%m)... * %s = "trt.Convolution"(%a)... - * "infrt.return" (%n, %s)... + * infrt.return %n, %s :... * }){run_once = true} ... * %d, %f = "trt.execute"(%engine, %a)... - * "infrt.return" (%d, %f)... + * infrt.return %d, %f :... 
* } */ struct TRTOpConverterPass diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc index 13b7f1aee55d2a2d30822a878bbd50d385411f43..ef9ccc82678f4bf2e2b518bf346d25393b9e480c 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc @@ -15,9 +15,9 @@ #include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h" #include -#include "paddle/infrt/dialect/basic_kernels.h" -#include "paddle/infrt/dialect/infrt/infrt_dialect.h" -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" namespace infrt { namespace trt { @@ -35,13 +35,13 @@ void TRTOpTellerPass::runOnFunction() { auto *op = worklist.back(); worklist.pop_back(); if (op == nullptr) continue; - if (::llvm::dyn_cast_or_null(op)) continue; - if (::llvm::dyn_cast_or_null(op)) continue; - if (::llvm::dyn_cast_or_null(op)) continue; + if (::llvm::dyn_cast_or_null(op)) continue; + if (::llvm::dyn_cast_or_null(op)) continue; + if (::llvm::dyn_cast_or_null(op)) continue; if (::llvm::dyn_cast_or_null<::infrt::ReturnOp>(op)) continue; builder.setInsertionPoint(op); auto loc = getFunction().getLoc(); - auto graph_op = builder.create( + auto graph_op = builder.create( loc, op->getResultTypes(), op->getOperands()); ::llvm::SmallVector tblgen_repl_values; diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h index b9e461c8633d906fd46e9f7d6799e8a157915048..1cb08dc0a2161eeb5720191bada52f9b54e94893 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h @@ -14,7 +14,6 @@ #pragma once #include -#include "paddle/infrt/dialect/infrt_base.h" namespace infrt { namespace trt { @@ -29,24 +28,24 @@ namespace trt { * %c = "pd.conv2d"(%a) ... * %d = "pd.conv3d"(%c) ... * %f = "pd.conv2d"(%a) ... - * "infrt.return"(%d, %f) ... + * infrt.return %d, %f: ... * } * * destination func: * func @main(%a : tensor) -> tensor { * %c = "pd.graph"(%a) { * %m = "pd.conv2d"(%a)... - * "infrt.return" (%m) + * infrt.return %m:... * } ... * %d = "pd.graph"(%c) { * %m = "pd.conv3d"(%c)... - * "infrt.return" (%m) + * infrt.return %m:... * } ... * %f = "pd.graph"(%a) { * %m = "pd.conv2d"(%a)... - * "infrt.return" (%m) + * infrt.return %m:... * } ... - * "infrt.return" (%d, %f) + * infrt.return %d, %f:... * } * TODO(winter-wang): Supplementary how to judge the operators can be supported * by tensorrt. 
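The op teller pass above wraps each remaining compute op in its own single-op pd.graph region, as its header comment illustrates. A minimal sketch of that wrapping step, with the create<> template argument written out by hand (an assumption, since the hunks above elide it) and the region/terminator bookkeeping abbreviated to a comment:

    // Sketch: wrap one compute op into a pd.graph region.
    builder.setInsertionPoint(op);
    auto graph_op = builder.create<infrt::pd::GraphOp>(
        loc, op->getResultTypes(), op->getOperands());
    // The op is then moved into graph_op's body, the body is terminated with
    // an infrt.return of the op's results, and external uses are redirected:
    op->replaceAllUsesWith(graph_op.getResults());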
diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.cc b/paddle/infrt/dialect/tensorrt/trt_ops.cc index d5222976625a2adece9a87c8952dba10137ae9ba..415a78a6967ab6fd4e2a38380d09a5d5c64b1c2f 100644 --- a/paddle/infrt/dialect/tensorrt/trt_ops.cc +++ b/paddle/infrt/dialect/tensorrt/trt_ops.cc @@ -21,6 +21,10 @@ #include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/tensorrt/trt_dialect_types.h" +#include "paddle/infrt/dialect/dense_tensor.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" +#include "paddle/infrt/dialect/phi/ir/phi_base.h" + namespace infrt { namespace trt { diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.h b/paddle/infrt/dialect/tensorrt/trt_ops.h index 44444232915bad7d25b0ecedfa8e8427f4567e49..76768037dbdb3072976d9f6cf0cdfb4f7956bdd4 100644 --- a/paddle/infrt/dialect/tensorrt/trt_ops.h +++ b/paddle/infrt/dialect/tensorrt/trt_ops.h @@ -28,9 +28,9 @@ #include #include #include -#include "paddle/infrt/dialect/basic_kernels.h" -#include "paddle/infrt/dialect/infrt/infrt_dialect.h" -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" namespace infrt { namespace trt { diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.td b/paddle/infrt/dialect/tensorrt/trt_ops.td index 132a1d7805bdb85af8716e384ec29357a6ff68ad..803a11ed5b7e5ce46211a85471536c0300d42630 100755 --- a/paddle/infrt/dialect/tensorrt/trt_ops.td +++ b/paddle/infrt/dialect/tensorrt/trt_ops.td @@ -7,6 +7,8 @@ include "mlir/Interfaces/CallInterfaces.td" include "mlir/IR/OpBase.td" include "paddle/infrt/dialect/tensorrt/trt_op_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" +include "paddle/infrt/dialect/phi/ir/infrt_phi_base.td" def TRT_CreateEngineOp : TRT_Op<"create_engine", [SingleBlockImplicitTerminator<"::infrt::ReturnOp">]> { let summary = "trt CreateEngine Op"; @@ -14,8 +16,8 @@ def TRT_CreateEngineOp : TRT_Op<"create_engine", [SingleBlockImplicitTerminator< Describe a tensorrt subgraph. }]; let regions = (region SizedRegion<1>:$body); - let arguments = (ins Variadic:$inputs, DefaultValuedAttr:$run_once); - let results = (outs TRT_EngineType:$output); + let arguments = (ins Variadic:$inputs, DefaultValuedAttr:$run_once); + let results = (outs TRT_EngineType:$engine); } def TRT_ExecuteOp : TRT_Op<"execute", [NoSideEffect]> { @@ -23,8 +25,25 @@ def TRT_ExecuteOp : TRT_Op<"execute", [NoSideEffect]> { let description = [{ Describe a tensorrt runtime. }]; - let arguments = (ins TRT_EngineType:$engine, Variadic:$inputs); - let results = (outs Variadic:$output); + let arguments = (ins TRT_EngineType:$engine, Variadic:$inputs); + let results = (outs Variadic:$output); +} + +def TRT_EngineComputeOp : TRT_Op<"compute", [NoSideEffect]> { + let summary = "trt compute engine"; + let description = [{ + execute engine + }]; + let arguments = (ins TRT_EngineType:$engine, Context:$context); + let results = (outs DenseTensorList:$outputs); +} + +def TRT_InspectEngineOp : TRT_Op<"inspect_engine", [NoSideEffect]> { + let summary = "trt inspect engine"; + let description = [{ + Show engine + }]; + let arguments = (ins TRT_EngineType:$engine); } def TRT_ActivationOp : TRT_Op<"Activation", [NoSideEffect]> { @@ -34,11 +53,44 @@ def TRT_ActivationOp : TRT_Op<"Activation", [NoSideEffect]> { TensorRT IActivationLayer. 
}]; - let arguments = (ins TRT_Tensor:$input, SI32Attr:$activation_type, + let arguments = (ins DenseTensor:$input, SI32Attr:$activation_type, DefaultValuedAttr:$alpha, DefaultValuedAttr:$beta); - let results = (outs TRT_Tensor:$output); + let results = (outs DenseTensor:$output); +} + +def TRT_FullyConnectedOp : TRT_Op<"FullyConnected", [NoSideEffect]> { + let summary = "TensorRT IFullyConnectedLayer"; + let description = [{ + TensorRT IFullyConnectedLayer + }]; + let arguments = (ins + DenseTensor:$input_tensor, + DenseTensor:$kernel_weights, + DenseTensor:$bias_weights, + SI32Attr:$out_channel_num + ); + let results = (outs + DenseTensor:$output_tensor + ); +} + +def TRT_ConvolutionOp : TRT_Op<"Convolution", [NoSideEffect]> { + let summary = "TensorRT IConvolutionLayer"; + let description = [{ + TensorRT IConvolutionLayer + }]; + let arguments = (ins + DenseTensor:$input_tensor, + DenseTensor:$kernel_weights, + DenseTensor:$bias_weights, + SI32Attr:$out_channel_num, + I32ArrayAttr:$kernel_size + ); + let results = (outs + DenseTensor:$output_tensor + ); } def TRT_ElementWiseOp : TRT_Op<"ElementWise", [NoSideEffect]> { @@ -48,9 +100,9 @@ def TRT_ElementWiseOp : TRT_Op<"ElementWise", [NoSideEffect]> { TensorRT IElementWiseLayer. }]; - let arguments = (ins TRT_Tensor:$input1, TRT_Tensor:$input2, SI32Attr:$elementwise_operation); + let arguments = (ins DenseTensor:$input1, DenseTensor:$input2, SI32Attr:$elementwise_operation); - let results = (outs TRT_Tensor:$output); + let results = (outs DenseTensor:$output); } def TRT_MatrixMultiplyOp : TRT_Op<"MatrixMultiply", [NoSideEffect]> { @@ -60,10 +112,10 @@ def TRT_MatrixMultiplyOp : TRT_Op<"MatrixMultiply", [NoSideEffect]> { TensorRT IMatrixMultiplyLayer. }]; - let arguments = (ins TRT_Tensor:$input1, BoolAttr:$transpose1, - TRT_Tensor:$input2, BoolAttr:$transpose2); + let arguments = (ins DenseTensor:$input1, BoolAttr:$transpose1, + DenseTensor:$input2, BoolAttr:$transpose2); - let results = (outs TRT_Tensor:$output); + let results = (outs DenseTensor:$output); } #endif // TRT_OPS diff --git a/paddle/infrt/external_kernels/basic.mlir b/paddle/infrt/external_kernels/basic.mlir index 1a7ea854c9ce469ee5719743287b4ee1b5de9286..843b12ced21a982b18b5a63f7bbef1d4d24eea16 100644 --- a/paddle/infrt/external_kernels/basic.mlir +++ b/paddle/infrt/external_kernels/basic.mlir @@ -1,7 +1,7 @@ // CHECK: basic func @basic() -> f32 { - %v0 = Infrt.constant.f32 1.0 - %v1 = Infrt.constant.f32 2.0 + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 %v2 = "external.add.f32"(%v0, %v1) : (f32, f32) -> f32 // CHECK: 1 @@ -17,5 +17,5 @@ func @basic() -> f32 { // CHECK: 6 "external.print.f32"(%v3) : (f32) -> () - Infrt.return %v3 : f32 + infrt.return %v3 : f32 } diff --git a/paddle/infrt/external_kernels/fc.mlir b/paddle/infrt/external_kernels/fc.mlir index b0cabddc3ebc4a9ede73d506ac58acaa140f03d5..26b2d24cace70455d4a0e21dddf23c9bd628ae81 100644 --- a/paddle/infrt/external_kernels/fc.mlir +++ b/paddle/infrt/external_kernels/fc.mlir @@ -1,43 +1,43 @@ // CHECK-LABEL: @fc -func @fc(%input : !Infrt.tensor, - %w : !Infrt.tensor, - %bias : !Infrt.tensor) -> !Infrt.tensor +func @fc(%input : !infrt.dense_tensor, + %w : !infrt.dense_tensor, + %bias : !infrt.dense_tensor) -> !infrt.dense_tensor { - %out = dt.create_uninit_tensor.f32 [30, 50] -> !Infrt.tensor - // dt.fill_tensor_with_constant.f32 (%out : !Infrt.tensor) {value=0.0:f32} + %out = dt.create_uninit_tensor.f32 [30, 50] -> !infrt.dense_tensor + // dt.fill_tensor_with_constant.f32 (%out : !infrt.dense_tensor) 
{value=0.0:f32} // fc1 - "external.matmul"(%input, %w, %out) {}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () - "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () - "external.sigmoid"(%out, %out) {}: (!Infrt.tensor, !Infrt.tensor) -> () + "external.matmul"(%input, %w, %out) {}: (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> () + "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> () + "external.sigmoid"(%out, %out) {}: (!infrt.dense_tensor, !infrt.dense_tensor) -> () // fc2 - "external.matmul"(%out, %w, %out) {}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () - "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () - "external.sigmoid"(%out, %out) {}: (!Infrt.tensor, !Infrt.tensor) -> () + "external.matmul"(%out, %w, %out) {}: (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> () + "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> () + "external.sigmoid"(%out, %out) {}: (!infrt.dense_tensor, !infrt.dense_tensor) -> () - Infrt.return %out : !Infrt.tensor + infrt.return %out : !infrt.dense_tensor } // CHECK-LABEL: @benchmark func @benchmark() { - %input = dt.create_uninit_tensor.f32 [30, 50] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%input : !Infrt.tensor) {value=1.0:f32} + %input = dt.create_uninit_tensor.f32 [30, 50] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%input : !infrt.dense_tensor) {value=1.0:f32} - %w = dt.create_uninit_tensor.f32 [50, 50] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%w : !Infrt.tensor) {value=2.0:f32} + %w = dt.create_uninit_tensor.f32 [50, 50] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%w : !infrt.dense_tensor) {value=2.0:f32} - %bias = dt.create_uninit_tensor.f32 [30, 50] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%bias : !Infrt.tensor) {value=3.0:f32} + %bias = dt.create_uninit_tensor.f32 [30, 50] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%bias : !infrt.dense_tensor) {value=3.0:f32} - Infrt.benchmark "add.f32"( - %input:!Infrt.tensor, - %w:!Infrt.tensor, - %bias:!Infrt.tensor) + infrt.benchmark "add.f32"( + %input:!infrt.dense_tensor, + %w:!infrt.dense_tensor, + %bias:!infrt.dense_tensor) duration_secs = 100, max_count = 300000, num_warmup_runs = 3 { - %res = Infrt.call @fc(%input, %w, %bias) : (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> (!Infrt.tensor) - Infrt.return %res : !Infrt.tensor + %res = infrt.call @fc(%input, %w, %bias) : (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> (!infrt.dense_tensor) + infrt.return %res : !infrt.dense_tensor } - Infrt.return + infrt.return } diff --git a/paddle/infrt/external_kernels/paddle.mlir b/paddle/infrt/external_kernels/paddle.mlir index d55d9904b5bc4e43388abacf9e4b62bf06db458b..97781e5c8c5e544bba53b561f2adcae16263886f 100644 --- a/paddle/infrt/external_kernels/paddle.mlir +++ b/paddle/infrt/external_kernels/paddle.mlir @@ -1,50 +1,50 @@ // CHECK: paddle_func func @paddle_func() -> () { - %input = dt.create_uninit_tensor.f32 [3, 5] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%input : !Infrt.tensor) {value=1.0:f32} + %input = dt.create_uninit_tensor.f32 [3, 5] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%input : !infrt.dense_tensor) {value=1.0:f32} - %w = dt.create_uninit_tensor.f32 [5, 4] 
-> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%w : !Infrt.tensor) {value=2.0:f32} + %w = dt.create_uninit_tensor.f32 [5, 4] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%w : !infrt.dense_tensor) {value=2.0:f32} - %bias = dt.create_uninit_tensor.f32 [4] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%bias : !Infrt.tensor) {value=3.0:f32} + %bias = dt.create_uninit_tensor.f32 [4] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%bias : !infrt.dense_tensor) {value=3.0:f32} - %out = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%out : !Infrt.tensor) {value=0.0:f32} + %out = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%out : !infrt.dense_tensor) {value=0.0:f32} - "external.fc2"(%input, %w, %bias, %out) {in_num_col_dims=3:i32, test_attr=5:i32}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () + "external.fc2"(%input, %w, %bias, %out) {in_num_col_dims=3:i32, test_attr=5:i32}: (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> () // CHECK-LABEL: tensor: shape=shape[3,5], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - dt.print_tensor (%input : !Infrt.tensor) + dt.print_tensor (%input : !infrt.dense_tensor) // CHECK-LABEL: tensor: shape=shape[5,4], values=[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2] - dt.print_tensor (%w : !Infrt.tensor) - dt.print_tensor (%bias : !Infrt.tensor) - dt.print_tensor (%out : !Infrt.tensor) + dt.print_tensor (%w : !infrt.dense_tensor) + dt.print_tensor (%bias : !infrt.dense_tensor) + dt.print_tensor (%out : !infrt.dense_tensor) // test external.matmul - %out1 = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%out1 : !Infrt.tensor) {value=0.0:f32} - "external.matmul"(%input, %w, %out1) {}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () - dt.print_tensor (%out1 : !Infrt.tensor) + %out1 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%out1 : !infrt.dense_tensor) {value=0.0:f32} + "external.matmul"(%input, %w, %out1) {}: (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> () + dt.print_tensor (%out1 : !infrt.dense_tensor) // test external.elementwise_add - %out2 = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%out2 : !Infrt.tensor) {value=0.0:f32} - %bias1 = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%bias1 : !Infrt.tensor) {value=3.0:f32} - "external.elementwise_add"(%out1, %bias1, %out2) {axis=-1}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () - dt.print_tensor (%out2 : !Infrt.tensor) + %out2 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%out2 : !infrt.dense_tensor) {value=0.0:f32} + %bias1 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%bias1 : !infrt.dense_tensor) {value=3.0:f32} + "external.elementwise_add"(%out1, %bias1, %out2) {axis=-1}: (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> () + dt.print_tensor (%out2 : !infrt.dense_tensor) // test external.relu - %out3 = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%out3 : !Infrt.tensor) {value=0.0:f32} - "external.relu"(%out1, %out3) {}: (!Infrt.tensor, !Infrt.tensor) -> () - dt.print_tensor (%out3 : !Infrt.tensor) + %out3 = dt.create_uninit_tensor.f32 [3, 4] -> 
!infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%out3 : !infrt.dense_tensor) {value=0.0:f32} + "external.relu"(%out1, %out3) {}: (!infrt.dense_tensor, !infrt.dense_tensor) -> () + dt.print_tensor (%out3 : !infrt.dense_tensor) // test external.sigmoid - %out4 = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%out4 : !Infrt.tensor) {value=0.0:f32} - "external.sigmoid"(%out1, %out4) {}: (!Infrt.tensor, !Infrt.tensor) -> () - dt.print_tensor (%out4 : !Infrt.tensor) + %out4 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%out4 : !infrt.dense_tensor) {value=0.0:f32} + "external.sigmoid"(%out1, %out4) {}: (!infrt.dense_tensor, !infrt.dense_tensor) -> () + dt.print_tensor (%out4 : !infrt.dense_tensor) - Infrt.return + infrt.return } diff --git a/paddle/infrt/host_context/kernel_registry.cc b/paddle/infrt/host_context/kernel_registry.cc index f343dfc71b040e77308b30c2963fb4014221e29c..4209b2a9648d8be0a9a3897c27c7a35113cba424 100644 --- a/paddle/infrt/host_context/kernel_registry.cc +++ b/paddle/infrt/host_context/kernel_registry.cc @@ -23,8 +23,9 @@ namespace infrt { namespace host_context { struct KernelRegistry::Impl { - std::unordered_map data; - std::unordered_map> attr_names; + std::unordered_map>> + data; }; KernelRegistry::KernelRegistry() : impl_(std::make_unique()) {} @@ -33,20 +34,29 @@ void KernelRegistry::AddKernel(const std::string &key, KernelImplementation fn) { CHECK(!impl_->data.count(key)) << "kernel [" << key << "] is registered twice"; - impl_->data.emplace(key, fn); + impl_->data.emplace( + key, std::make_pair(std::move(fn), std::vector{})); } -void KernelRegistry::AddKernelAttrNameList( - const std::string &key, const std::vector &names) { - CHECK(!impl_->attr_names.count(key)) - << "kernel [" << key << "] is registered twice in attribute names"; - impl_->attr_names.emplace( - key, llvm::SmallVector(names.begin(), names.end())); +const std::vector &KernelRegistry::GetAttrNameList( + const std::string &key) const { + CHECK(impl_->data.count(key)); + return impl_->data[key].second; +} + +void KernelRegistry::AddKernelWithAttrs( + const std::string &key, + KernelImplementation fn, + std::vector &&attr_order) { + CHECK(!impl_->data.count(key)) << "kernel [" << key + << "] is registered twice"; + impl_->data.emplace(key, + std::make_pair(std::move(fn), std::move(attr_order))); } KernelImplementation KernelRegistry::GetKernel(const std::string &key) const { auto it = impl_->data.find(key); - return it != impl_->data.end() ? it->second : KernelImplementation{}; + return it != impl_->data.end() ? 
it->second.first : KernelImplementation{}; } std::vector KernelRegistry::GetKernelList() const { diff --git a/paddle/infrt/host_context/kernel_registry.h b/paddle/infrt/host_context/kernel_registry.h index a813f690efb0b3d36b7575d0889652f0868a2d85..a146b2b3c4c1e1090b5ac1843466b93a31b0bb0b 100644 --- a/paddle/infrt/host_context/kernel_registry.h +++ b/paddle/infrt/host_context/kernel_registry.h @@ -34,10 +34,14 @@ class KernelRegistry { KernelRegistry(); void AddKernel(const std::string &key, KernelImplementation fn); - void AddKernelAttrNameList(const std::string &key, - const std::vector &names); + void AddKernelWithAttrs(const std::string &key, + KernelImplementation fn, + std::vector &&attrs_order); KernelImplementation GetKernel(const std::string &key) const; + const std::vector &GetAttrNameList( + const std::string &key) const; + std::vector GetKernelList() const; size_t size() const; diff --git a/paddle/infrt/host_context/mlir_exec.cc b/paddle/infrt/host_context/mlir_exec.cc index 90bcb1df220c0f4c558ece80a09fccc93aada41c..81bf873ddf0cf3f1a94489bd3b0b2769274b1b4a 100644 --- a/paddle/infrt/host_context/mlir_exec.cc +++ b/paddle/infrt/host_context/mlir_exec.cc @@ -30,10 +30,13 @@ #include "paddle/infrt/kernel/test_kernels.h" #ifdef INFRT_WITH_PHI #include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h" -#include "paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h" +#include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h" #include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h" #include "paddle/infrt/kernel/phi/registry.h" -#endif +#if defined(INFRT_WITH_GPU) && defined(INFRT_WITH_TRT) +#include "paddle/infrt/kernel/tensorrt/registry.h" +#endif // INFRT_WITH_GPU && INFRT_WITH_TRT +#endif // INFRT_WITH_PHI static llvm::cl::list cl_shared_libs( // NOLINT "shared_libs", @@ -62,6 +65,9 @@ int main(int argc, char** argv) { #ifdef INFRT_WITH_PHI kernel::RegisterPhiKernels(®istry); kernel::RegisterInferShapeLaunchers(®istry); +#if defined(INFRT_WITH_GPU) && defined(INFRT_WITH_TRT) + kernel::RegisterTrtKernels(®istry); +#endif // INFRT_WITH_GPU && INFRT_WITH_TRT #endif // load extra shared library @@ -92,7 +98,7 @@ int main(int argc, char** argv) { std::vector valid_places = {{infrt::TargetType::CPU, infrt::PrecisionType::FLOAT32, infrt::LayoutType::NCHW}}; - phi_pass_manager.addPass(std::make_unique(valid_places)); + phi_pass_manager.addPass(infrt::createPhiOpCvtPass(valid_places)); phi_pass_manager.addPass(infrt::createInfrtOpFusePass()); #endif diff --git a/paddle/infrt/host_context/mlir_function_executable.cc b/paddle/infrt/host_context/mlir_function_executable.cc index 47ec27ebec300f1cedd57b11e0dd1e6b37611141..ec8d43f99bae770f28cbf1b1bdc269536b4e7100 100644 --- a/paddle/infrt/host_context/mlir_function_executable.cc +++ b/paddle/infrt/host_context/mlir_function_executable.cc @@ -43,6 +43,7 @@ MlirFunctionExecutable::MlirFunctionExecutable( func_op.getNumResults()), MlirToRuntimeTranslator(&core_runtime_builder_), region_(&func_op.getRegion()), + kernel_registry_(kernel_registry), core_runtime_builder_(kernel_registry), function_table_(function_table) {} @@ -54,6 +55,7 @@ MlirFunctionExecutable::MlirFunctionExecutable( : Function("", func_type.getNumInputs(), func_type.getNumResults()), MlirToRuntimeTranslator(&core_runtime_builder_), region_(region), + kernel_registry_(kernel_registry), core_runtime_builder_(kernel_registry), function_table_(function_table) {} @@ -90,7 +92,7 @@ void MlirFunctionExecutable::BuildExecutables( if (EmitCallOp(&op, 
&function_table_)) continue; - if (EmitGeneralOp(&op)) continue; + if (EmitGeneralOp(&op, *kernel_registry_)) continue; LOG(FATAL) << "Not supported op: " << DumpToString(op); } diff --git a/paddle/infrt/host_context/mlir_function_executable.h b/paddle/infrt/host_context/mlir_function_executable.h index a6428df86e6b27061d92856970682bc29499d825..cd9161d01bbf648c344ec2a82747d997b810856a 100644 --- a/paddle/infrt/host_context/mlir_function_executable.h +++ b/paddle/infrt/host_context/mlir_function_executable.h @@ -70,6 +70,7 @@ class MlirFunctionExecutable : public Function, public MlirToRuntimeTranslator { private: mlir::Region* region_{}; + KernelRegistry* kernel_registry_{}; CoreRuntimeBuilder core_runtime_builder_; MlirToRuntimeTranslator::function_defs_t& function_table_; std::function copy_res_fn_; diff --git a/paddle/infrt/host_context/mlir_tests/basic.mlir b/paddle/infrt/host_context/mlir_tests/basic.mlir index 1b55b408f2b082c09d06d51037e8c9d967a171f4..263d5884134b143aa8d3403c5cd05672df39636f 100644 --- a/paddle/infrt/host_context/mlir_tests/basic.mlir +++ b/paddle/infrt/host_context/mlir_tests/basic.mlir @@ -1,30 +1,30 @@ // CHECK-LABEL: basic func @basic() -> f32 { - %v0 = Infrt.constant.f32 1.0 - %v1 = Infrt.constant.f32 2.0 - %v2 = "Infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 + %v2 = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 // CHECK: 1 - "Infrt.print.f32"(%v0) : (f32) -> () + "infrt.print.f32"(%v0) : (f32) -> () // CHECK: 2 - "Infrt.print.f32"(%v1) : (f32) -> () + "infrt.print.f32"(%v1) : (f32) -> () // CHECK: 3 - "Infrt.print.f32"(%v2) : (f32) -> () + "infrt.print.f32"(%v2) : (f32) -> () - %v3 = "Infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 + %v3 = "infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 // CHECK: 6 - "Infrt.print.f32"(%v3) : (f32) -> () + "infrt.print.f32"(%v3) : (f32) -> () - Infrt.return %v3 : f32 + infrt.return %v3 : f32 } // CHECK-LABEL: basic1 // Check the mlir executor can work with more than one function in a file. 
func @basic1() -> () { - %v0 = Infrt.constant.f32 1.0 - "Infrt.print.f32"(%v0) : (f32) -> () + %v0 = infrt.constant.f32 1.0 + "infrt.print.f32"(%v0) : (f32) -> () // CHECK: 1 - Infrt.return + infrt.return } \ No newline at end of file diff --git a/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir b/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir index 5a973a3eb23e6015ede2d69d83ab8c26de669908..1a7fa28f1e58bd400671099f5af7bedbb3c04e4d 100644 --- a/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir +++ b/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir @@ -1,9 +1,9 @@ // CHECK-LABEL: build_tensor1 func @build_tensor1() { - %a = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%a : !Infrt.tensor) {value=1.0:f32} + %a = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%a : !infrt.dense_tensor) {value=1.0:f32} // CHECK: tensor: shape=shape[3,4], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - dt.print_tensor (%a : !Infrt.tensor) + dt.print_tensor (%a : !infrt.dense_tensor) - Infrt.return + infrt.return } diff --git a/paddle/infrt/host_context/mlir_tests/shape.mlir b/paddle/infrt/host_context/mlir_tests/shape.mlir index 22df1c8010d8dbd6a4b8e332e01602b4421ebcdd..691ce62cbf82ad4dc0d3b0199a9c1d1127213de5 100644 --- a/paddle/infrt/host_context/mlir_tests/shape.mlir +++ b/paddle/infrt/host_context/mlir_tests/shape.mlir @@ -3,5 +3,5 @@ func @build_tensor1() { %a = ts.build_shape [1:i64, 57:i64, 92:i64] // CHECK: shape[1,57,92] ts.print_shape %a - Infrt.return -} \ No newline at end of file + infrt.return +} diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.cc b/paddle/infrt/host_context/mlir_to_runtime_translate.cc index a901c323ec03a418a32eee3cb8ea17708e38bdb9..bcd44540b336eee6d9a76fc14057e8454b9ae329 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.cc +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.cc @@ -16,12 +16,14 @@ #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -42,6 +44,13 @@ #include "paddle/infrt/host_context/value.h" #include "paddle/infrt/tensor/tensor_shape.h" +#ifdef INFRT_WITH_PHI +#ifdef INFRT_WITH_TRT +#include "paddle/infrt/kernel/tensorrt/trt_kernels.h" +#endif +#include "paddle/phi/core/dense_tensor.h" +#endif + namespace infrt { namespace host_context { @@ -75,7 +84,7 @@ struct MlirToRuntimeTranslator::Impl { }; bool MlirToRuntimeTranslator::EmitConstantOp(mlir::Operation* op) { - if (!infrt::Startswith(op->getName().getStringRef().str(), "Infrt.constant")) + if (!infrt::Startswith(op->getName().getStringRef().str(), "infrt.constant")) return false; VLOG(3) << "Emitting constant op [" << op->getName().getStringRef().str() << "]"; @@ -267,109 +276,153 @@ boost::optional> MlirToRuntimeTranslator::EmitAttribute( } static bool IsReturn(mlir::Operation* op) { - return op->getName().getStringRef() == "Infrt.return"; + return op->getName().getStringRef() == "infrt.return"; } -bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { +bool MlirToRuntimeTranslator::EmitGeneralOp( + mlir::Operation* op, const KernelRegistry& kernel_registry) { CHECK(impl_->runtime); impl_->cur_op = impl_->runtime->NewOpExecutable(op->getName().getStringRef().str()); VLOG(3) << "processing general op : " << op->getName().getStringRef().str(); + // TODO(wilber): Find a more appropriate way to handle special cases. 
+ if (op->getName().getStringRef() == "trt.create_engine") { +#ifdef INFRT_WITH_TRT + auto* symbols = impl_->runtime->symbol_table(); + ::infrt::kernel::tensorrt::MlirOperationWithInfrtSymbol mlir_operation; + mlir_operation.operation = op; + mlir_operation.symbol_table = symbols; + impl_->cur_op->AppendArgument(new Value(mlir_operation)); + // TODO(wilber): how to pass DenseTensor to create_engine op? temporialiy + // add a naive implement. + for (int i = 0, e = op->getNumOperands(); i < e; ++i) { + auto operand = op->getOperand(i); + Value* arg_value{nullptr}; + if (operand.isa()) { + mlir::BlockArgument arg = operand.dyn_cast(); + arg_value = GetValue(arg); + } else { + arg_value = GetValue(operand); + if (!arg_value) { + auto upstream_op = operand.getDefiningOp(); + arg_value = GetOpResult(upstream_op); + } + } + if (arg_value->is_type()) { + impl_->runtime->FeedInArgs( + std::make_pair(std::to_string(i), ValueRef(arg_value))); + } + } +#else + CHECK(false) << "should not reach here"; +#endif + } else { + // process operands + for (int i = 0, e = op->getNumOperands(); i < e; i++) { + // function argument as value + auto operand = op->getOperand(i); + /// if (operand.getKind() == mlir::Value::Kind::BlockArgument) { + if (operand.isa()) { + mlir::BlockArgument arg = operand.dyn_cast(); + Value* arg_value = GetValue(arg); + impl_->cur_op->AppendArgument(arg_value); + VLOG(3) << "* op mlir operand: " << DumpToString(arg) << " " + << GetValue(arg); + continue; + } - // process operands - for (int i = 0, e = op->getNumOperands(); i < e; i++) { - // function argument as value - auto operand = op->getOperand(i); - /// if (operand.getKind() == mlir::Value::Kind::BlockArgument) { - if (operand.isa()) { - mlir::BlockArgument arg = operand.dyn_cast(); - Value* arg_value = GetValue(arg); + // normal value + Value* arg_value = GetValue(operand); + if (!arg_value) { + auto upstream_op = operand.getDefiningOp(); + arg_value = GetOpResult(upstream_op); + } + CHECK(arg_value) << "No-exist argument value found: " + << DumpToString(operand); impl_->cur_op->AppendArgument(arg_value); - VLOG(3) << "* op mlir operand: " << DumpToString(arg) << " " - << GetValue(arg); - continue; - } - // normal value - Value* arg_value = GetValue(operand); - if (!arg_value) { - auto upstream_op = operand.getDefiningOp(); - arg_value = GetOpResult(upstream_op); + VLOG(3) << "* op mlir operand: " << DumpToString(operand) << " " + << GetValue(operand) << " vs " << arg_value; } - CHECK(arg_value) << "No-exist argument value found: " - << DumpToString(operand); - impl_->cur_op->AppendArgument(arg_value); - - VLOG(3) << "* op mlir operand: " << DumpToString(operand) << " " - << GetValue(operand) << " vs " << arg_value; } // process attributes auto attrs = op->getAttrs(); + // MLIR's underlying attr storage type is `Builtin_Dictionary`, and its + // elements + // are sorted by name. The following code adapts the order of function + // signatures + // of the phi operator library. 
+ llvm::SmallVector tmp; + tmp.resize(attrs.size()); + const std::string& kernel_name = op->getName().getStringRef().str(); + const auto& attr_names = kernel_registry.GetAttrNameList(kernel_name); + if (attrs.size() && attr_names.empty()) { + LOG(WARNING) << "The kernel `" << kernel_name + << "` has no specified attr order."; + } + auto get_offset = [](const char* attr, + const std::vector& names, + const std::string& kernel_name) -> int { + for (size_t i = 0; i < names.size(); ++i) { + if (!std::strcmp(attr, names[i])) { + return i; + } + } + LOG(WARNING) << "The attribute `" << attr << "` of kernel `" << kernel_name + << "` is not properly registered with " + "`KernelRegistry::AddKernelWithAttrs()`."; + return -1; + }; + for (size_t i = 0; i < attrs.size(); i++) { auto& attr = attrs[i]; + int offset{}; + if (attr_names.size()) { + offset = get_offset(attr.getName().data(), attr_names, kernel_name); + } else { + offset = i; + } + CHECK_NE(offset, -1); if (auto v = EmitAttribute(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(*v)); + tmp[offset] = new Value(*v); } else if (auto v = EmitAttribute(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(*v)); + tmp[offset] = new Value(*v); } else if (auto v = EmitAttribute(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(*v)); + tmp[offset] = new Value(*v); } else if (auto v = EmitAttribute(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(*v)); + tmp[offset] = new Value(*v); } else if (auto v = EmitAttribute(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + tmp[offset] = new Value(std::move(*v)); } else if (auto v = EmitAttribute(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(*v)); + tmp[offset] = new Value(*v); } else if (auto v = EmitAttribute<::infrt::TargetType>(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(*v)); + tmp[offset] = new Value(*v); } else if (auto v = EmitAttribute<::infrt::PrecisionType>(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(*v)); + tmp[offset] = new Value(*v); } else if (auto v = EmitAttribute<::infrt::LayoutType>(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(*v)); + tmp[offset] = new Value(*v); } else if (auto v = EmitAttribute>(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + tmp[offset] = new Value(std::move(*v)); } else if (auto v = EmitAttribute>(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + tmp[offset] = new Value(std::move(*v)); } else if (auto v = EmitAttribute>(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + tmp[offset] = new Value(std::move(*v)); } else if (auto v = EmitAttribute>(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + tmp[offset] = new Value(std::move(*v)); } else if (auto v = EmitAttribute>(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + tmp[offset] = new Value(std::move(*v)); } else { LOG(FATAL) << "Not supported attribute type"; } } - // process results - llvm::SmallVector res_values; - for (int i = 0, e = op->getNumResults(); i < e; i++) { - auto res = op->getResult(i); - if (res.getType().isa<::infrt::DenseTensorType>()) { - auto r = impl_->value_map.try_emplace( - res, ValueRef(new Value{::phi::DenseTensor()})); - CHECK(r.second) << "Duplicate add mlir value [" << DumpToString(res) - << "]"; - res_values.push_back(r.first->second.get()); - } else { - 
res_values.push_back(AddValue(res)); - } - - VLOG(3) << "* op mlir res: " << DumpToString(res) << " " << GetValue(res); - } - impl_->cur_op->SetResults(res_values); - -#ifdef INFRT_DEBUG - { - VLOG(3) << "check result"; - for (int i = 0; i < impl_->cur_op->frame().GetNumResults(); i++) { - VLOG(3) << "+ res value: " << impl_->cur_op->frame().GetResults()[i]; - } + for (size_t i = 0; i < tmp.size(); i++) { + impl_->cur_op->AppendAttribute(tmp[i]); } -#endif // process regions, we treat regions as attribute. auto num_regions = op->getNumRegions(); @@ -399,13 +452,40 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { impl_->cur_op->AppendAttribute(new Value(function)); } + // process results + llvm::SmallVector res_values; + for (int i = 0, e = op->getNumResults(); i < e; i++) { + auto res = op->getResult(i); + if (res.getType().isa<::infrt::DenseTensorType>()) { + auto r = impl_->value_map.try_emplace( + res, ValueRef(new Value{::phi::DenseTensor()})); + CHECK(r.second) << "Duplicate add mlir value [" << DumpToString(res) + << "]"; + res_values.push_back(r.first->second.get()); + } else { + res_values.push_back(AddValue(res)); + } + + VLOG(3) << "* op mlir res: " << DumpToString(res) << " " << GetValue(res); + } + impl_->cur_op->SetResults(res_values); + +#ifdef INFRT_DEBUG + { + VLOG(3) << "check result"; + for (int i = 0; i < impl_->cur_op->frame().GetNumResults(); i++) { + VLOG(3) << "+ res value: " << impl_->cur_op->frame().GetResults()[i]; + } + } +#endif + return true; } bool MlirToRuntimeTranslator::EmitReturnOp( mlir::Operation* op, llvm::SmallVectorImpl* results) { CHECK(results); - if (op->getName().getStringRef() == "Infrt.return") { + if (op->getName().getStringRef() == "infrt.return") { for (size_t i = 0; i < op->getNumOperands(); i++) { results->push_back(op->getOperand(i)); } @@ -478,7 +558,7 @@ bool MlirToRuntimeTranslator::EmitCallOp(mlir::Operation* op, function_defs_t* function_table) { CHECK(op); CHECK(function_table); - if (op->getName().getStringRef() != "Infrt.call") return false; + if (op->getName().getStringRef() != "infrt.call") return false; impl_->cur_op = impl_->runtime->NewOpExecutable(op->getName().getStringRef().str()); @@ -598,7 +678,7 @@ class MlirProgramTestExecutor : public MlirToRuntimeTranslator { llvm::SmallVector results; if (EmitReturnOp(&op, &results)) continue; if (EmitCallOp(&op, &impl_->func_defs)) continue; - if (EmitGeneralOp(&op)) continue; + if (EmitGeneralOp(&op, *registry)) continue; LOG(FATAL) << "Not supported op: " << DumpToString(op); } diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.h b/paddle/infrt/host_context/mlir_to_runtime_translate.h index 0c453651d9e6dc44adaf108ec6a1b0df984fe8be..27a7f20168667daddd353e902d49479aa612e38f 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.h +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.h @@ -57,13 +57,14 @@ class MlirToRuntimeTranslator { protected: //! Emit a "infrt.constant.*" operation, return true if succeed. bool EmitConstantOp(mlir::Operation* op); - //! Emit a "Infrt.return" operation. + //! Emit a "infrt.return" operation. bool EmitReturnOp(mlir::Operation* op, llvm::SmallVectorImpl* results); //! Emit a "ts.build_shape" operation. bool EmitBuildShapeOp(mlir::Operation* op); //! Emit an operation other than the special cases above. - bool EmitGeneralOp(mlir::Operation* op); + bool EmitGeneralOp(mlir::Operation* op, + const KernelRegistry& kernel_registry); //! Emit all the functions. 
bool EmitFunctions(); diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc b/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc index 5824e40abf97a4d63543948d056e815bbeebce3a..31615fbc3f6e46f55ddc5f56641750feb0972772 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc +++ b/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc @@ -37,14 +37,14 @@ TEST(MlirToRuntimeTranslate, basic) { auto source = R"ROC( func @main() -> () { - %v0 = Infrt.constant.f32 1.0 - %v1 = Infrt.constant.f32 2.0 - %v2 = "Infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 - %v3 = "Infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 + %v2 = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v3 = "infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 - "Infrt.print.f32"(%v1) : (f32) -> () + "infrt.print.f32"(%v1) : (f32) -> () - Infrt.return + infrt.return } )ROC"; @@ -63,14 +63,14 @@ TEST(TestMlir, basic) { auto source = R"ROC( func @main() -> () { - %v0 = Infrt.constant.f32 1.0 - %v1 = Infrt.constant.f32 2.0 - %v2 = "Infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 - %v3 = "Infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 + %v2 = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v3 = "infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 - "Infrt.print.f32"(%v1) : (f32) -> () + "infrt.print.f32"(%v1) : (f32) -> () - Infrt.return + infrt.return } )ROC"; @@ -101,7 +101,7 @@ func @predict(%a: !infrt.dense_tensor, %b: !infrt.dense_tensor< "!infrt.dense_tensor"; auto end = R"ROC( -Infrt.return %a0, %b0: !infrt.dense_tensor, !infrt.dense_tensor +infrt.return %a0, %b0: !infrt.dense_tensor, !infrt.dense_tensor } )ROC"; diff --git a/paddle/infrt/host_context/paddle_mlir.cc b/paddle/infrt/host_context/paddle_mlir.cc index 6afef5935c73450b4865c0e02593aa372299c95f..29328520212fd4d020afc28c1e48d2db604414bc 100644 --- a/paddle/infrt/host_context/paddle_mlir.cc +++ b/paddle/infrt/host_context/paddle_mlir.cc @@ -13,16 +13,17 @@ // limitations under the License. 
#include "paddle/infrt/host_context/paddle_mlir.h" -#include "paddle/infrt/dialect/pd_ops_info.h" +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" +#include "paddle/infrt/dialect/pd/common/pd_ops_info.h" MLIRModelGenImpl::MLIRModelGenImpl() : context_(infrt::Global::getMLIRContext()), builder_(context_) { - context_->allowUnregisteredDialects(); context_->getOrLoadDialect(); - context_->getOrLoadDialect(); context_->getOrLoadDialect(); context_->getOrLoadDialect(); - context_->getOrLoadDialect(); + context_->getOrLoadDialect(); + context_->getOrLoadDialect<::infrt::InfrtDialect>(); module_ = mlir::ModuleOp::create(mlir::UnknownLoc::get(context_)); } @@ -56,7 +57,6 @@ mlir::ModuleOp MLIRModelGenImpl::ImportPaddleModel( UpdateModelParams(program, &mainFunc); UpdateModelOps(program); UpdateModelOutputs(program); - return module_; } @@ -91,11 +91,15 @@ llvm::SmallVector MLIRModelGenImpl::GetModelInputsType( if (var_desc.name() == input_var_name) { std::vector dims = RepeatedToVector( var_desc.type().lod_tensor().tensor().dims()); - mlir::Type precision_; - ConvertDataType(var_desc.type().lod_tensor().tensor().data_type(), - builder_, - &precision_); - mlir::Type type_ = mlir::RankedTensorType::get(dims, precision_); + infrt::PrecisionType precision_; + ConvertDataTypeToPhi( + var_desc.type().lod_tensor().tensor().data_type(), &precision_); + mlir::Type type_ = + infrt::DenseTensorType::get(context_, + infrt::TargetType::CPU, + precision_, + infrt::LayoutType::ANY); + operandTypes.push_back(type_); } } @@ -117,11 +121,14 @@ llvm::SmallVector MLIRModelGenImpl::GetModelOutputsType( if (var_desc.name() == input_var_name) { std::vector dims = RepeatedToVector( var_desc.type().lod_tensor().tensor().dims()); - mlir::Type precision_; - ConvertDataType(var_desc.type().lod_tensor().tensor().data_type(), - builder_, - &precision_); - mlir::Type type_ = mlir::RankedTensorType::get(dims, precision_); + infrt::PrecisionType precision_; + ConvertDataTypeToPhi( + var_desc.type().lod_tensor().tensor().data_type(), &precision_); + mlir::Type type_ = + infrt::DenseTensorType::get(context_, + infrt::TargetType::CPU, + precision_, + infrt::LayoutType::ANY); resultTypes.push_back(type_); } } @@ -168,11 +175,11 @@ void MLIRModelGenImpl::UpdateModelParams( auto name = builder_.getStringAttr(var_desc.name()); std::vector dims = RepeatedToVector( var_desc.type().lod_tensor().tensor().dims()); - mlir::Type precision_; - ConvertDataType(var_desc.type().lod_tensor().tensor().data_type(), - builder_, - &precision_); - mlir::Type type_ = mlir::RankedTensorType::get(dims, precision_); + infrt::PrecisionType precision_; + ConvertDataTypeToPhi(var_desc.type().lod_tensor().tensor().data_type(), + &precision_); + mlir::Type type_ = infrt::DenseTensorType::get( + context_, infrt::TargetType::CPU, precision_, infrt::LayoutType::ANY); auto op = builder_.create( mlir::UnknownLoc::get(context_), type_, map, name); params_map_.insert(std::pair( @@ -198,8 +205,9 @@ void MLIRModelGenImpl::UpdateModelOutputs( llvm::SmallVector resultTypes; llvm::SmallVector attrs; + mlir::OperationState state(loc, - mlir::ReturnOp::getOperationName(), + ::infrt::ReturnOp::getOperationName(), operands, resultTypes, attrs); @@ -257,11 +265,13 @@ llvm::SmallVector MLIRModelGenImpl::GetOpOutputType( if (var_desc.name() == var_name) { std::vector dims = RepeatedToVector( var_desc.type().lod_tensor().tensor().dims()); - mlir::Type precision_; - 
ConvertDataType(var_desc.type().lod_tensor().tensor().data_type(), - builder_, - &precision_); - mlir::Type type_ = mlir::RankedTensorType::get(dims, precision_); + infrt::PrecisionType precision_; + ConvertDataTypeToPhi(var_desc.type().lod_tensor().tensor().data_type(), + &precision_); + mlir::Type type_ = infrt::DenseTensorType::get(context_, + infrt::TargetType::CPU, + precision_, + infrt::LayoutType::ANY); resultTypes.push_back(type_); } } @@ -322,7 +332,7 @@ llvm::SmallVector MLIRModelGenImpl::GetOpAttributes( switch (type) { ATTR_IMPL_CASE(FLOAT, f, getF32FloatAttr); ATTR_IMPL_CASE(BOOLEAN, b, getBoolAttr); - ATTR_IMPL_CASE(INT, i, getI32IntegerAttr); + ATTR_IMPL_CASE(INT, i, getSI32IntegerAttr); ATTR_IMPL_CASE(LONG, l, getI64IntegerAttr); ATTR_IMPL_CASE(STRING, s, getStringAttr); @@ -398,3 +408,38 @@ bool ConvertDataType(infrt::paddle::framework_proto::VarType::Type dtype, return false; } } + +bool ConvertDataTypeToPhi(infrt::paddle::framework_proto::VarType::Type dtype, + infrt::PrecisionType *type) { + switch (dtype) { + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_FP16: + *type = infrt::PrecisionType::FLOAT16; + return true; + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_FP32: + *type = infrt::PrecisionType::FLOAT32; + return true; + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_FP64: + *type = infrt::PrecisionType::FLOAT64; + return true; + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_BOOL: + *type = infrt::PrecisionType::BOOL; + return true; + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_INT8: + *type = infrt::PrecisionType::INT8; + return true; + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_INT16: + *type = infrt::PrecisionType::INT16; + return true; + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_INT32: + *type = infrt::PrecisionType::INT32; + return true; + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_INT64: + *type = infrt::PrecisionType::INT64; + return true; + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_UINT8: + *type = infrt::PrecisionType::UINT8; + return true; + default: + return false; + } +} diff --git a/paddle/infrt/host_context/paddle_mlir.h b/paddle/infrt/host_context/paddle_mlir.h index 78dfefcfda2c83760492766507999322152187eb..a351b5cf80e2356a6481ccd302a544dcfe595e05 100644 --- a/paddle/infrt/host_context/paddle_mlir.h +++ b/paddle/infrt/host_context/paddle_mlir.h @@ -14,22 +14,22 @@ #ifndef PADDLE_INFRT_HOST_CONTEXT_PADDLE_MLIR_H_ #define PADDLE_INFRT_HOST_CONTEXT_PADDLE_MLIR_H_ +#include +#include +#include +#include +#include +#include #include #include #include -#include "llvm/Support/CommandLine.h" -#include "mlir/Dialect/StandardOps/IR/Ops.h" -#include "mlir/IR/AsmState.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/MLIRContext.h" #include "paddle/infrt/common/global.h" #include "paddle/infrt/common/string.h" -#include "paddle/infrt/dialect/basic_kernels.h" #include "paddle/infrt/dialect/dense_tensor.h" -#include "paddle/infrt/dialect/infrt_base.h" -#include "paddle/infrt/dialect/init_infrt_dialects.h" -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" +#include "paddle/infrt/dialect/init_dialects.h" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" #include "paddle/infrt/dialect/tensor_shape.h" #include "paddle/infrt/paddle/model_parser.h" @@ -102,4 +102,7 @@ inline std::vector RepeatedToVector( bool 
ConvertDataType(infrt::paddle::framework_proto::VarType::Type dtype, mlir::Builder builder, mlir::Type *type); +bool ConvertDataTypeToPhi(infrt::paddle::framework_proto::VarType::Type dtype, + infrt::PrecisionType *type); + #endif // PADDLE_INFRT_HOST_CONTEXT_PADDLE_MLIR_H_ diff --git a/paddle/infrt/host_context/value.h b/paddle/infrt/host_context/value.h index 86df3508cf813628b4a8ba8412ce93d6b1dfc5a2..1f0b1dabd94d8dcf28e8e0543a8e3b12ed250704 100644 --- a/paddle/infrt/host_context/value.h +++ b/paddle/infrt/host_context/value.h @@ -22,8 +22,9 @@ #include "paddle/infrt/common/object.h" #include "paddle/infrt/common/shared.h" -#include "paddle/infrt/dialect/infrt/common_type.h" +#include "paddle/infrt/dialect/infrt/common/types.h" #include "paddle/infrt/host_context/function.h" +#include "paddle/infrt/host_context/symbol_table.h" #include "paddle/infrt/support/variant.h" #include "paddle/infrt/tensor/dense_host_tensor.h" #include "paddle/infrt/tensor/dense_tensor_view.h" @@ -41,7 +42,15 @@ #include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/meta_tensor.h" -#endif + +#ifdef INFRT_WITH_GPU +#include "paddle/phi/backends/gpu/gpu_context.h" +#endif // INFRT_WITH_GPU +#ifdef INFRT_WITH_TRT +#include "paddle/infrt/backends/tensorrt/trt_engine.h" +#include "paddle/infrt/kernel/tensorrt/trt_kernels.h" +#endif // INFRT_WITH_TRT +#endif // INFRT_WITH_PHI namespace infrt { namespace host_context { @@ -72,8 +81,13 @@ using ValueVariantType = ::phi::MetaTensor, ::phi::DenseTensor, backends::CpuPhiContext, +#ifdef INFRT_WITH_GPU + backends::GpuPhiContext, + ::phi::GPUContext, +#endif ::phi::CPUContext, std::vector, + std::vector, paddle::experimental::ScalarBase, paddle::experimental::ScalarArrayBase, std::vector, @@ -81,6 +95,10 @@ using ValueVariantType = paddle::experimental::Backend, paddle::experimental::DataLayout, paddle::experimental::DataType, +#ifdef INFRT_WITH_TRT + ::infrt::backends::tensorrt::TrtEngine, + ::infrt::kernel::tensorrt::MlirOperationWithInfrtSymbol, +#endif // INFRT_WITH_TRT #endif std::vector, std::vector, @@ -120,8 +138,18 @@ class Value : public common::Object { #ifdef INFRT_WITH_PHI explicit Value(::phi::CPUContext&& x) : data(std::move(x)) {} explicit Value(backends::CpuPhiContext&& x) : data(std::move(x)) {} +#ifdef INFRT_WITH_GPU + explicit Value(::phi::GPUContext&& x) : data(std::move(x)) {} + explicit Value(backends::GpuPhiContext&& x) : data(std::move(x)) {} +#endif explicit Value(::phi::DenseTensor&& x) : data(std::move(x)) {} explicit Value(::phi::MetaTensor&& x) : data(std::move(x)) {} +#ifdef INFRT_WITH_TRT + explicit Value(::infrt::backends::tensorrt::TrtEngine&& x) + : data(std::move(x)) {} + explicit Value(::infrt::kernel::tensorrt::MlirOperationWithInfrtSymbol x) + : data(x) {} +#endif // INFRT_WITH_TRT #endif template diff --git a/paddle/infrt/kernel/CMakeLists.txt b/paddle/infrt/kernel/CMakeLists.txt index f1cbfba1c46b33e461a7c9f08cf646625fbafb24..f20344f6f6b84ae8e63f44c7b7b83c6ba9d8d6da 100644 --- a/paddle/infrt/kernel/CMakeLists.txt +++ b/paddle/infrt/kernel/CMakeLists.txt @@ -1,4 +1,5 @@ add_subdirectory(phi) +add_subdirectory(tensorrt) core_gather_headers() diff --git a/paddle/infrt/kernel/basic_kernels.cc b/paddle/infrt/kernel/basic_kernels.cc index 23e50a5ddc87427bbf0f49c559f185084e42c8ec..b186cfcfd2b355f97711ecc916e497c2916d4060 100644 --- a/paddle/infrt/kernel/basic_kernels.cc +++ b/paddle/infrt/kernel/basic_kernels.cc @@ -63,24 +63,24 @@ static void PrintString(const std::string &str) { void 
RegisterBasicKernels(host_context::KernelRegistry *registry) { RegisterIntBasicKernels(registry); RegisterFloatBasicKernels(registry); - registry->AddKernel("Infrt.get_string", INFRT_KERNEL(GetString)); - registry->AddKernel("Infrt.print_string", INFRT_KERNEL(PrintString)); + registry->AddKernel("infrt.get_string", INFRT_KERNEL(GetString)); + registry->AddKernel("infrt.print_string", INFRT_KERNEL(PrintString)); } void RegisterIntBasicKernels(host_context::KernelRegistry *registry) { - registry->AddKernel("Infrt.add.i32", INFRT_KERNEL(add)); - registry->AddKernel("Infrt.sub.i32", INFRT_KERNEL(sub)); - registry->AddKernel("Infrt.mul.i32", INFRT_KERNEL(mul)); - registry->AddKernel("Infrt.div.i32", INFRT_KERNEL(div)); - registry->AddKernel("Infrt.print.i32", INFRT_KERNEL(print)); + registry->AddKernel("infrt.add.i32", INFRT_KERNEL(add)); + registry->AddKernel("infrt.sub.i32", INFRT_KERNEL(sub)); + registry->AddKernel("infrt.mul.i32", INFRT_KERNEL(mul)); + registry->AddKernel("infrt.div.i32", INFRT_KERNEL(div)); + registry->AddKernel("infrt.print.i32", INFRT_KERNEL(print)); } void RegisterFloatBasicKernels(host_context::KernelRegistry *registry) { - registry->AddKernel("Infrt.add.f32", INFRT_KERNEL(add)); - registry->AddKernel("Infrt.sub.f32", INFRT_KERNEL(sub)); - registry->AddKernel("Infrt.mul.f32", INFRT_KERNEL(mul)); - registry->AddKernel("Infrt.div.f32", INFRT_KERNEL(div)); - registry->AddKernel("Infrt.print.f32", INFRT_KERNEL(print)); + registry->AddKernel("infrt.add.f32", INFRT_KERNEL(add)); + registry->AddKernel("infrt.sub.f32", INFRT_KERNEL(sub)); + registry->AddKernel("infrt.mul.f32", INFRT_KERNEL(mul)); + registry->AddKernel("infrt.div.f32", INFRT_KERNEL(div)); + registry->AddKernel("infrt.print.f32", INFRT_KERNEL(print)); } } // namespace kernel diff --git a/paddle/infrt/kernel/control_flow_kernels.cc b/paddle/infrt/kernel/control_flow_kernels.cc index 8b18aca0210860f4ae688f2133ffa022fda3195d..6cc94dbcce0775cb6b74f993bfdd262fd6a47e6f 100644 --- a/paddle/infrt/kernel/control_flow_kernels.cc +++ b/paddle/infrt/kernel/control_flow_kernels.cc @@ -37,7 +37,7 @@ static void INFRTCall( } void RegisterControlFlowKernels(host_context::KernelRegistry* registry) { - registry->AddKernel("Infrt.call", INFRT_KERNEL(INFRTCall)); + registry->AddKernel("infrt.call", INFRT_KERNEL(INFRTCall)); } } // namespace kernel diff --git a/paddle/infrt/kernel/phi/context_kernels.cc b/paddle/infrt/kernel/phi/context_kernels.cc index 39ef172fadef9e0f6317dec192c251c6a1df6828..b27eacf9e522d2bbb8b7ffd70ad57f54e5775499 100644 --- a/paddle/infrt/kernel/phi/context_kernels.cc +++ b/paddle/infrt/kernel/phi/context_kernels.cc @@ -25,6 +25,16 @@ namespace phi { return ctx; } +#ifdef INFRT_WITH_GPU +::phi::GPUContext CreateGPUContext() { + ::phi::GPUContext context; + context.PartialInitWithoutAllocator(); + context.SetAllocator(new ::infrt::backends::GpuPhiAllocator{}); + context.PartialInitWithAllocator(); + return context; +} +#endif + } // namespace phi } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/phi/context_kernels.h b/paddle/infrt/kernel/phi/context_kernels.h index 3e9580b91da5724b42c72224847e45715f47dbb7..ae3f76c8fe536f96689680668cc52e4981894063 100644 --- a/paddle/infrt/kernel/phi/context_kernels.h +++ b/paddle/infrt/kernel/phi/context_kernels.h @@ -25,6 +25,10 @@ namespace phi { ::phi::CPUContext CreateCPUContext(); +#ifdef INFRT_WITH_GPU +::phi::GPUContext CreateGPUContext(); +#endif + } // namespace phi } // namespace kernel } // namespace infrt diff --git 
a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc index e89ee7cfe5d6f51b3206aecc6ca283e06c0e5561..6d16b814c6b02b08e279190d5a685d65c124942d 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc @@ -15,6 +15,12 @@ #include "paddle/infrt/kernel/phi/dense_tensor_kernels.h" #include "paddle/infrt/dialect/phi/data_type.h" #include "paddle/infrt/kernel/phi/context_kernels.h" +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/common/place.h" + +#ifdef INFRT_WITH_GPU +#include +#endif namespace infrt { namespace kernel { @@ -23,41 +29,98 @@ namespace phi { ::phi::DenseTensor CreateDenseTensor( const ::phi::CPUContext& context, host_context::Attribute> dims, + host_context::Attribute> lod, host_context::Attribute<::infrt::LayoutType> layout, + host_context::Attribute<::infrt::PrecisionType> precision) { + return ::phi::DenseTensor( + const_cast<::phi::Allocator*>(&context.GetAllocator()), + ::phi::DenseTensorMeta(ConvertPrecisionToPhi(precision.get()), + ::phi::make_ddim(dims.get()), + ConvertLayoutToPhi(layout.get()), + {})); +} + +::phi::DenseTensor CreateGPUDenseTensor( + const ::phi::GPUContext& context, + host_context::Attribute> dims, host_context::Attribute> lod, + host_context::Attribute<::infrt::LayoutType> layout, host_context::Attribute<::infrt::PrecisionType> precision) { return ::phi::DenseTensor( const_cast<::phi::Allocator*>(&context.GetAllocator()), - ::phi::DenseTensorMeta(cvtPrecision2Phi(precision.get()), + ::phi::DenseTensorMeta(ConvertPrecisionToPhi(precision.get()), ::phi::make_ddim(dims.get()), - cvtLayout2Phi(layout.get()), + ConvertLayoutToPhi(layout.get()), {})); } void FillDenseTensorF32(::phi::DenseTensor* dense_tensor, - host_context::Attribute> values) { - auto place = ::phi::CPUPlace(); + host_context::Attribute> value) { + auto place = dense_tensor->place(); float* a_data = dense_tensor->mutable_data(place); - for (int64_t i = 0; i < dense_tensor->numel(); ++i) { - a_data[i] = (values.get())[i]; + if (place.GetType() == ::phi::AllocationType::CPU) { + for (int64_t i = 0; i < dense_tensor->numel(); ++i) { + a_data[i] = (value.get())[i]; + } + } else if (place.GetType() == ::phi::AllocationType::GPU) { +#ifdef INFRT_WITH_GPU + // TODO(wilber): how to set the stream parameter to copy with stream. 
+ cudaMemcpy(a_data, + value.get().data(), + sizeof(float) * value.get().size(), + cudaMemcpyHostToDevice); +#endif + } else { + llvm_unreachable("temporarily not support other target."); } } void PrintDenseTensor(::phi::DenseTensor* dense_tensor) { -#define PRINT_META_DATA(PHI_DATATYPE, DTYPE) \ - case ::phi::DataType::PHI_DATATYPE: { \ - DTYPE* data = dense_tensor->data(); \ - if (dense_tensor->numel() == 0) break; \ - std::cout << data[0]; \ - for (int64_t i = 1; i < dense_tensor->numel(); i++) { \ - std::cout << "," << data[i]; \ - } \ - break; \ +#ifndef INFRT_WITH_GPU +#define PRINT_META_DATA(PHI_DATATYPE, DTYPE) \ + case ::phi::DataType::PHI_DATATYPE: { \ + auto place = dense_tensor->place(); \ + if (place.GetType() == ::phi::AllocationType::CPU) { \ + DTYPE* data = dense_tensor->data(); \ + if (dense_tensor->numel() == 0) break; \ + std::cout << data[0]; \ + for (int64_t i = 1; i < dense_tensor->numel(); i++) { \ + std::cout << "," << data[i]; \ + } \ + } \ + break; \ + } +#else +#define PRINT_META_DATA(PHI_DATATYPE, DTYPE) \ + case ::phi::DataType::PHI_DATATYPE: { \ + auto place = dense_tensor->place(); \ + DTYPE* data = dense_tensor->data(); \ + if (dense_tensor->numel() == 0) break; \ + if (place.GetType() == ::phi::AllocationType::CPU) { \ + std::cout << data[0]; \ + for (int64_t i = 1; i < dense_tensor->numel(); i++) { \ + std::cout << "," << data[i]; \ + } \ + } else if (place.GetType() == ::phi::AllocationType::GPU) { \ + std::vector host_data(dense_tensor->numel(), 0); \ + cudaMemcpy(host_data.data(), \ + data, \ + sizeof(DTYPE) * dense_tensor->numel(), \ + cudaMemcpyDeviceToHost); \ + std::cout << host_data[0]; \ + for (int64_t i = 1; i < dense_tensor->numel(); i++) { \ + std::cout << "," << host_data[i]; \ + } \ + } else { \ + llvm_unreachable("temporarily not support other target."); \ + } \ + break; \ } +#endif ::phi::DDim dims = dense_tensor->dims(); std::cout << "dense_tensor: shape=shape" << dims.to_str() << "," - << " values=["; + << " value=["; switch (dense_tensor->dtype()) { PRINT_META_DATA(FLOAT32, float); PRINT_META_DATA(INT32, int32_t); diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.h b/paddle/infrt/kernel/phi/dense_tensor_kernels.h index 187e5c64511e83556bec50f4368ae7cbe89dda90..47d89506e2aa615b0bc425a4c373c904d937e03f 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.h +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.h @@ -15,7 +15,7 @@ #pragma once #include "paddle/infrt/backends/host/phi_allocator.h" -#include "paddle/infrt/dialect/infrt/common_type.h" +#include "paddle/infrt/dialect/infrt/common/types.h" #include "paddle/infrt/host_context/kernel_utils.h" #include "paddle/phi/core/dense_tensor.h" @@ -26,8 +26,15 @@ namespace phi { ::phi::DenseTensor CreateDenseTensor( const ::phi::CPUContext& context, host_context::Attribute> dims, + host_context::Attribute> lod, host_context::Attribute<::infrt::LayoutType> layout, + host_context::Attribute<::infrt::PrecisionType> precision); + +::phi::DenseTensor CreateGPUDenseTensor( + const ::phi::GPUContext& context, + host_context::Attribute> dims, host_context::Attribute> lod, + host_context::Attribute<::infrt::LayoutType> layout, host_context::Attribute<::infrt::PrecisionType> precision); void FillDenseTensorF32(::phi::DenseTensor* dense_tensor, diff --git a/paddle/infrt/kernel/phi/registry.cc b/paddle/infrt/kernel/phi/registry.cc index 90570484179d1e555f86c55ea0e8ac4f9bc83c53..36d40118f16a0bd1779765064caaac6dbe414772 100644 --- a/paddle/infrt/kernel/phi/registry.cc +++ 
b/paddle/infrt/kernel/phi/registry.cc @@ -34,12 +34,25 @@ namespace kernel { void RegisterPhiKernels(host_context::KernelRegistry* registry) { registry->AddKernel("phi_dt.create_context.cpu", INFRT_KERNEL(infrt::kernel::phi::CreateCPUContext)); - registry->AddKernel("phi_dt.create_dense_tensor", - INFRT_KERNEL(infrt::kernel::phi::CreateDenseTensor)); - registry->AddKernel("phi_dt.fill_dense_tensor.f32", - INFRT_KERNEL(infrt::kernel::phi::FillDenseTensorF32)); + registry->AddKernelWithAttrs( + "phi_dt.create_dense_tensor.cpu", + INFRT_KERNEL(infrt::kernel::phi::CreateDenseTensor), + {"dims", "lod", "layout", "precision"}); + registry->AddKernelWithAttrs( + "phi_dt.fill_dense_tensor.f32", + INFRT_KERNEL(infrt::kernel::phi::FillDenseTensorF32), + {"value"}); registry->AddKernel("phi_dt.print_tensor", INFRT_KERNEL(infrt::kernel::phi::PrintDenseTensor)); + +#ifdef INFRT_WITH_GPU + registry->AddKernel("phi_dt.create_context.gpu", + INFRT_KERNEL(infrt::kernel::phi::CreateGPUContext)); + registry->AddKernelWithAttrs( + "phi_dt.create_dense_tensor.gpu", + INFRT_KERNEL(infrt::kernel::phi::CreateGPUDenseTensor), + {"dims", "lod", "layout", "precision"}); +#endif } } // namespace kernel diff --git a/paddle/infrt/kernel/tensor_kernels.cc b/paddle/infrt/kernel/tensor_kernels.cc index d5922af9ada1f4983fe14df87c09180fe17fda19..a9077220cfc709116479a5d91b39d56ad4007af8 100644 --- a/paddle/infrt/kernel/tensor_kernels.cc +++ b/paddle/infrt/kernel/tensor_kernels.cc @@ -25,6 +25,10 @@ #include "paddle/infrt/tensor/tensor_map.h" #include "paddle/infrt/tensor/tensor_shape.h" +#ifdef INFRT_WITH_PHI +#include "paddle/phi/core/dense_tensor.h" +#endif + namespace infrt { namespace kernel { using namespace host_context; // NOLINT @@ -62,6 +66,20 @@ DenseHostTensor TensorMapGetTensor(TensorMap map, Attribute name) { int32_t TensorMapGetSize(TensorMap map) { return map.size(); } +// TODO(wilber): Maybe we should place TensorList type in dt dialect. +#ifdef INFRT_WITH_PHI +phi::DenseTensor TensorListGetTensor(std::vector list, + Attribute idx) { + CHECK_LT(idx.get(), static_cast(list.size())) + << "idx should less than list size"; + return *list[idx.get()]; +} + +int32_t TensorListGetSize(const std::vector &list) { + return list.size(); +} +#endif + DenseHostTensor ShallowCopyTensor(DenseHostTensor v) { return v; } template @@ -111,9 +129,9 @@ void NaiveMatmul(const DenseHostTensor &x, /// ===== Kernel end ==== void RegisterTensorKernels(host_context::KernelRegistry *registry) { - registry->AddKernel("dt.create_uninit_tensor.f32", - INFRT_KERNEL(CreateUninitTensor)); - registry->AddKernelAttrNameList("dt.create_uninit_tensor.f32", {"shape"}); + registry->AddKernelWithAttrs("dt.create_uninit_tensor.f32", + INFRT_KERNEL(CreateUninitTensor), + {"shape"}); registry->AddKernel("dt.print_tensor", INFRT_KERNEL(PrintTensor)); registry->AddKernel("dt.fill_tensor_with_constant.f32", INFRT_KERNEL(FillTensorWithConstant)); @@ -126,6 +144,14 @@ void RegisterTensorKernels(host_context::KernelRegistry *registry) { INFRT_KERNEL(TensorMapGetTensor)); registry->AddKernel("dt.tensor_map_get_size", INFRT_KERNEL(TensorMapGetSize)); +// TensorList related methods. 
+#ifdef INFRT_WITH_PHI + registry->AddKernelWithAttrs( + "dt.tensor_list_get_tensor", INFRT_KERNEL(TensorListGetTensor), {"id"}); + registry->AddKernel("dt.tensor_list_get_size", + INFRT_KERNEL(TensorListGetSize)); +#endif + registry->AddKernel("dt.shallow_copy_tensor", INFRT_KERNEL(ShallowCopyTensor)); diff --git a/paddle/infrt/kernel/tensorrt/CMakeLists.txt b/paddle/infrt/kernel/tensorrt/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd35fccbe2aa35453a4d4ac13364ef6bb5a6b6aa --- /dev/null +++ b/paddle/infrt/kernel/tensorrt/CMakeLists.txt @@ -0,0 +1,10 @@ +if (NOT (INFRT_WITH_PHI AND INFRT_WITH_GPU AND INFRT_WITH_TRT)) + return() +endif() + +core_gather_headers() + +gather_srcs(infrt_src SRCS + registry.cc + trt_kernels.cc +) diff --git a/paddle/infrt/kernel/tensorrt/registry.cc b/paddle/infrt/kernel/tensorrt/registry.cc new file mode 100644 index 0000000000000000000000000000000000000000..a37e3c0f7f2785e23c8a0b9a25d3283396215f70 --- /dev/null +++ b/paddle/infrt/kernel/tensorrt/registry.cc @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/kernel/tensorrt/registry.h" + +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/kernel_utils.h" +#include "paddle/infrt/kernel/tensorrt/trt_kernels.h" + +namespace infrt { +namespace kernel { + +void RegisterTrtKernels(host_context::KernelRegistry* registry) { + registry->AddKernel("trt.create_engine", + INFRT_KERNEL(tensorrt::CreateTrtEngine)); + registry->AddKernel("trt.inspect_engine", + INFRT_KERNEL(tensorrt::PrintTrtLayer)); + registry->AddKernel("trt.compute", INFRT_KERNEL(tensorrt::TrtEngineCompute)); +} + +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensorrt/registry.h b/paddle/infrt/kernel/tensorrt/registry.h new file mode 100644 index 0000000000000000000000000000000000000000..762329ca61d02a16edc150854afcc3dd431a941d --- /dev/null +++ b/paddle/infrt/kernel/tensorrt/registry.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +namespace infrt { +namespace host_context { + +struct KernelRegistry; + +} // namespace host_context +} // namespace infrt + +namespace infrt { +namespace kernel { + +/** + * Register all the trt kernels to registry. 
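+ * Currently that is trt.create_engine, trt.inspect_engine and trt.compute.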
+ */ +void RegisterTrtKernels(host_context::KernelRegistry* registry); + +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensorrt/trt_helper.h b/paddle/infrt/kernel/tensorrt/trt_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..96122bffacdb2251c28e311ae02fe6f9c5319615 --- /dev/null +++ b/paddle/infrt/kernel/tensorrt/trt_helper.h @@ -0,0 +1,66 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "glog/logging.h" +#include "llvm/Support/ErrorHandling.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace infrt { +namespace kernel { +namespace tensorrt { + +static nvinfer1::DataType TensorTypeToWeightType(phi::DataType tensor_type) { + switch (tensor_type) { + case phi::DataType::FLOAT32: + return nvinfer1::DataType::kFLOAT; + case phi::DataType::INT32: + return nvinfer1::DataType::kINT32; + case phi::DataType::FLOAT16: + return nvinfer1::DataType::kHALF; + default: + llvm_unreachable("should not reach here"); + } +} + +static nvinfer1::Dims ArrayAttrToNvDims(const mlir::ArrayAttr& int_array_attr) { + nvinfer1::Dims dims; + dims.nbDims = int_array_attr.size(); + CHECK(!int_array_attr.empty()); + CHECK(int_array_attr[0].getType().isIntOrIndex()); + for (int i = 0; i < dims.nbDims; ++i) { + dims.d[i] = int_array_attr[i].cast().getInt(); + } + return dims; +} + +static nvinfer1::Weights TensorToWeights(phi::DenseTensor* tensor) { + CHECK_NOTNULL(tensor); + nvinfer1::Weights ret; + ret.type = TensorTypeToWeightType(tensor->dtype()); + ret.count = tensor->numel(); + ret.values = tensor->data(); + return ret; +} + +} // namespace tensorrt +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensorrt/trt_kernels.cc b/paddle/infrt/kernel/tensorrt/trt_kernels.cc new file mode 100644 index 0000000000000000000000000000000000000000..aa7609092b82c8ab08b75bfbd3e252801cc79c7d --- /dev/null +++ b/paddle/infrt/kernel/tensorrt/trt_kernels.cc @@ -0,0 +1,182 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
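+//
+// Overview (a summary of the code below): trt.create_engine walks the
+// single-block region of the op, turns each supported trt.* op into a
+// TensorRT layer via the helpers in trt_layers.h, names inputs and outputs
+// positionally ("input_0", "output_0", ...), and then builds and prepares
+// the engine for inference.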
+ +#include "paddle/infrt/kernel/tensorrt/trt_kernels.h" +#include +#include "NvInfer.h" +#include "NvInferRuntime.h" +#include "NvInferRuntimeCommon.h" +#include "glog/logging.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/Value.h" + +#include "paddle/infrt/kernel/tensorrt/trt_helper.h" +#include "paddle/infrt/kernel/tensorrt/trt_layers.h" + +#include "paddle/infrt/backends/tensorrt/trt_engine.h" +#include "paddle/infrt/backends/tensorrt/trt_options.h" +#include "paddle/infrt/dialect/tensorrt/trt_ops.h" +#include "paddle/infrt/host_context/symbol_table.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace infrt { +namespace kernel { +namespace tensorrt { + +::infrt::backends::tensorrt::TrtEngine CreateTrtEngine( + MlirOperationWithInfrtSymbol create_engine_op) { + // TODO(wilber): The device_id needs to get from mlir. + int device_id = 0; + backends::tensorrt::TrtEngine engine(device_id); + + auto* builder = engine.GetTrtBuilder(); + // TODO(wilber): How to process weights? + backends::tensorrt::TrtUniquePtr network; + // TODO(wilber): static_shape or dynamic_shape network? The code is just + // static_shape test. + network.reset(builder->createNetworkV2(0)); + + // TODO(wilber): The build option shoule be fiiled from mlir info. + backends::tensorrt::BuildOptions options; + options.max_batch = 4; + options.workspace = 1024; + + // Parse mlir Region which only has one block. + mlir::Operation& operation = *create_engine_op.operation; + auto* symbol_table = create_engine_op.symbol_table; + CHECK_NOTNULL(symbol_table); + + unsigned int num_regions = operation.getNumRegions(); + CHECK_EQ(num_regions, 1U) << "only support one region case."; + auto& region = operation.getRegion(0); + auto& block = region.getBlocks().front(); + + std::unordered_map trt_bind_inputs; + ValueToITensorMap value_to_trt_tensor_map; + ValueToTensorMap value_to_tensor_map; + + for (auto index_operand : llvm::enumerate(operation.getOperands())) { + mlir::Value operand = index_operand.value(); + size_t idx = index_operand.index(); + + const std::string input_name = "input_" + std::to_string(idx); + auto* v = symbol_table->GetValue(std::to_string(idx)); + CHECK_NOTNULL(v); + auto* t = &v->get(); + value_to_tensor_map[operand] = t; + + // TODO(wilber): get input info from mlir. + + // TODO(wilber): input dims, now only support static_shape, and just remove + // the first dimension. If the first dim is not -1, maybe we can pass the + // origin dims. + + // TODO(wilber): now only suppot float input. + + if (operand.isa()) { + // TODO(wilber): A trick: the weights are CPU tensor and inputs are GPU + // tensor, so we treat all GPU tensors as inputs to trt. + if (t->place().GetType() == phi::AllocationType::GPU) { + trt_bind_inputs[input_name] = t; + nvinfer1::Dims dims; + dims.nbDims = t->dims().size() - 1; + for (int i = 0; i < dims.nbDims; ++i) { + dims.d[i] = t->dims()[i + 1]; + } + auto* in = network->addInput( + input_name.c_str(), nvinfer1::DataType::kFLOAT, dims); + value_to_trt_tensor_map[operand] = in; + } + } else { + // TODO(wilber): Replace with the op name that generates the weights. 
+ if (operand.getDefiningOp()->getName().getStringRef() != + "phi_dt.create_dense_tensor.cpu") { + trt_bind_inputs[input_name] = t; + nvinfer1::Dims dims; + dims.nbDims = t->dims().size() - 1; + for (int i = 0; i < dims.nbDims; ++i) { + dims.d[i] = t->dims()[i + 1]; + } + auto* in = network->addInput( + input_name.c_str(), nvinfer1::DataType::kFLOAT, dims); + value_to_trt_tensor_map[operand] = in; + } + } + } + + // TODO(wilber): Find a way to add layer. + for (auto& operation : block.without_terminator()) { + if (trt::ActivationOp op = llvm::dyn_cast(operation)) { + ActivationFunc( + op, network.get(), value_to_trt_tensor_map, value_to_tensor_map); + } else if (trt::FullyConnectedOp op = + llvm::dyn_cast(operation)) { + FcFunc(op, network.get(), value_to_trt_tensor_map, value_to_tensor_map); + } else if (trt::ConvolutionOp op = + llvm::dyn_cast(operation)) { + ConvFunc(op, network.get(), value_to_trt_tensor_map, value_to_tensor_map); + } else { + CHECK(false) << "not supported operation."; + } + } + + for (auto index_operand : + llvm::enumerate(block.getTerminator()->getOperands())) { + mlir::Value arg = index_operand.value(); + CHECK(value_to_trt_tensor_map.count(arg)); + // TODO(wilber): A trick that we name trt output tensor's name as output_0, + // output_1, ... + value_to_trt_tensor_map[arg]->setName( + ("output_" + std::to_string(index_operand.index())).c_str()); + network->markOutput(*value_to_trt_tensor_map[arg]); + } + for (int i = 0; i < network->getNbOutputs(); ++i) { + engine.PrepareOutputHandle(network->getOutput(i)->getName()); + } + + VLOG(3) << "trt engine build start."; + engine.Build(std::move(network), options); + VLOG(3) << "trt engine build done."; + + // TODO(wilber): get inference options from mlir. + backends::tensorrt::InferenceOptions inference_options; + inference_options.batch = 1; + // TODO(wilber): bind trt input/output tensors. + engine.SetUpInference(inference_options, trt_bind_inputs); + return engine; +} + +void PrintTrtLayer(backends::tensorrt::TrtEngine* engine) { + engine->GetEngineInfo(); +} + +std::vector TrtEngineCompute( + backends::tensorrt::TrtEngine* engine, const phi::GPUContext& context) { + engine->Run(context); + std::vector res; + for (size_t i = 0; i < engine->GetOutputNum(); ++i) { + res.push_back(engine->GetOutput("output_" + std::to_string(i))); + } + return res; +} + +} // namespace tensorrt +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensorrt/trt_kernels.h b/paddle/infrt/kernel/tensorrt/trt_kernels.h new file mode 100644 index 0000000000000000000000000000000000000000..546ee9dc78852e6967bf8b61ae81563d32beae66 --- /dev/null +++ b/paddle/infrt/kernel/tensorrt/trt_kernels.h @@ -0,0 +1,49 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
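+//
+// Declarations of the host-side kernels behind the trt dialect:
+// trt.create_engine receives the mlir operation together with the runtime
+// symbol table (MlirOperationWithInfrtSymbol), trt.inspect_engine prints the
+// engine info, and trt.compute runs inference on a GPU context and returns
+// the output tensors as a list consumed by the dt.tensor_list_* kernels.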
+ +#pragma once + +#include +#include +#include + +#include "mlir/IR/Operation.h" + +#include "paddle/infrt/backends/tensorrt/trt_engine.h" +#include "paddle/phi/backends/gpu/gpu_context.h" + +namespace infrt { +namespace host_context { +class SymbolTable; +} // namespace host_context + +namespace kernel { +namespace tensorrt { + +struct MlirOperationWithInfrtSymbol { + mlir::Operation* operation; + ::infrt::host_context::SymbolTable* symbol_table; +}; + +::infrt::backends::tensorrt::TrtEngine CreateTrtEngine( + MlirOperationWithInfrtSymbol engine_op); + +void PrintTrtLayer(backends::tensorrt::TrtEngine* engine); + +std::vector TrtEngineCompute( + backends::tensorrt::TrtEngine* engine, const phi::GPUContext& context); + +} // namespace tensorrt +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensorrt/trt_layers.h b/paddle/infrt/kernel/tensorrt/trt_layers.h new file mode 100644 index 0000000000000000000000000000000000000000..19e20c170ec835444a5a37818b837dafb096b2b8 --- /dev/null +++ b/paddle/infrt/kernel/tensorrt/trt_layers.h @@ -0,0 +1,104 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include + +#include "paddle/infrt/dialect/tensorrt/trt_ops.h" +#include "paddle/infrt/kernel/tensorrt/trt_helper.h" + +#include "paddle/phi/core/dense_tensor.h" + +namespace infrt { +namespace kernel { +namespace tensorrt { + +using ValueToTensorMap = llvm::DenseMap; +using ValueToITensorMap = llvm::DenseMap; + +inline void ActivationFunc( + trt::ActivationOp& act_op, // NOLINT + nvinfer1::INetworkDefinition* network, + ValueToITensorMap& value_to_trt_tensor_map, // NOLINT + ValueToTensorMap& value_to_tensor_map) { // NOLINT + auto in_arg = act_op.getOperand(); + CHECK(value_to_trt_tensor_map.count(in_arg)) + << "value_to_trt_tensor_map not has in_arg."; + + nvinfer1::ActivationType act_type = + static_cast(act_op.activation_type()); + auto* act_layer = + network->addActivation(*value_to_trt_tensor_map[in_arg], act_type); + act_layer->setAlpha(act_op.alpha().convertToFloat()); + act_layer->setBeta(act_op.beta().convertToFloat()); + for (size_t i = 0; i < act_op->getNumResults(); ++i) { + nvinfer1::ITensor* act_out_tensor = act_layer->getOutput(i); + mlir::Value act_out = act_op->getResult(i); + value_to_trt_tensor_map[act_out] = act_out_tensor; + } +} + +inline void ConvFunc(trt::ConvolutionOp& op, // NOLINT + nvinfer1::INetworkDefinition* network, + ValueToITensorMap& value_to_trt_tensor_map, // NOLINT + ValueToTensorMap& value_to_tensor_map) { // NOLINT + mlir::Value input_tensor_repr = op.input_tensor(); + int out_channel_num = op.out_channel_num(); + auto size_attrs = op.kernel_size(); + nvinfer1::Dims dims = ArrayAttrToNvDims(size_attrs); + auto kernel_weights = + TensorToWeights(value_to_tensor_map[op.kernel_weights()]); + auto bias_weights = TensorToWeights(value_to_tensor_map[op.bias_weights()]); + + auto* layer = + 
network->addConvolutionNd(*value_to_trt_tensor_map[input_tensor_repr], + out_channel_num, + dims, + kernel_weights, + bias_weights); + CHECK_NOTNULL(layer); + mlir::Value out_repr = op.output_tensor(); + nvinfer1::ITensor* out_tensor = layer->getOutput(0); + value_to_trt_tensor_map[out_repr] = out_tensor; +} + +inline void FcFunc(trt::FullyConnectedOp& op, // NOLINT + nvinfer1::INetworkDefinition* network, + ValueToITensorMap& value_to_trt_tensor_map, // NOLINT + ValueToTensorMap& value_to_tensor_map) { // NOLINT + mlir::Value input_tensor_repr = op.input_tensor(); + CHECK(value_to_trt_tensor_map.count(input_tensor_repr)); + + auto kernel_weights = + TensorToWeights(value_to_tensor_map[op.kernel_weights()]); + auto bias_weights = TensorToWeights(value_to_tensor_map[op.bias_weights()]); + + int out_channel_num = op.out_channel_num(); + auto* layer = + network->addFullyConnected(*value_to_trt_tensor_map[input_tensor_repr], + out_channel_num, + kernel_weights, + bias_weights); + + mlir::Value out_repr = op.output_tensor(); + nvinfer1::ITensor* out_tensor = layer->getOutput(0); + value_to_trt_tensor_map[out_repr] = out_tensor; +} +} // namespace tensorrt +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/test_kernels.cc b/paddle/infrt/kernel/test_kernels.cc index d15bbe221f91a87b047863121f32699175183c54..bcf475d1bc09dab8be1b7a23359e1eb935ee02e0 100644 --- a/paddle/infrt/kernel/test_kernels.cc +++ b/paddle/infrt/kernel/test_kernels.cc @@ -193,7 +193,7 @@ tensor::DenseHostTensor ShadowCopyTensor(tensor::DenseHostTensor src) { } void RegisterTestKernels(host_context::KernelRegistry *registry) { - registry->AddKernel("Infrt.benchmark", INFRT_KERNEL(benchmark)); + registry->AddKernel("infrt.benchmark", INFRT_KERNEL(benchmark)); registry->AddKernel("Infrt.test.shadow_copy_tensor", INFRT_KERNEL(ShadowCopyTensor)); } diff --git a/paddle/infrt/tests/CMakeLists.txt b/paddle/infrt/tests/CMakeLists.txt index 5ce6d8673421ba3c53c9dad6d2fd1f20298f837a..58543a6864258bd6c0153150bb535262d9a8f00d 100644 --- a/paddle/infrt/tests/CMakeLists.txt +++ b/paddle/infrt/tests/CMakeLists.txt @@ -1,6 +1,8 @@ +cc_test_tiny(test_abs_model SRCS model/test_abs.cc DEPS infrt ${MLIR_IR_LIBS}) + configure_file(lit.cfg.py.in "${CMAKE_SOURCE_DIR}/paddle/infrt/tests/lit.cfg.py") add_test(NAME test_infrt_by_lit COMMAND sh -c "lit -v ${CMAKE_SOURCE_DIR}/paddle/infrt/tests --filter-out \"disabled_*\"" - DEPENDS infrtopt infrtexec phi-ir-exec) + DEPENDS infrtopt infrtexec) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir.in ${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir) diff --git a/paddle/infrt/tests/dialect/basic.mlir b/paddle/infrt/tests/dialect/basic.mlir index 2d4d6f2629ec7df989499f0a2e9649c01ae8428a..f534a3aa44aac964c262465da199ac926fa0904e 100644 --- a/paddle/infrt/tests/dialect/basic.mlir +++ b/paddle/infrt/tests/dialect/basic.mlir @@ -1,33 +1,33 @@ // RUN: infrtexec -i %s | FileCheck %s // CHECK-LABEL: @basic_f32 func @basic_f32() -> f32 { - %v0 = Infrt.constant.f32 1.0 - %v1 = Infrt.constant.f32 2.0 - %value = "Infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 + %value = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 // CHECK-NEXT: 3 - "Infrt.print.f32"(%value) : (f32) -> () + "infrt.print.f32"(%value) : (f32) -> () - Infrt.return %value : f32 + infrt.return %value : f32 } /// ================================================================ /// @caller call the other function @callee func @callee.add.f32(%x 
: f32, %y : f32, %y1 : f32) -> f32 { - %z = "Infrt.add.f32"(%x, %y) : (f32, f32) -> f32 - %z1 = "Infrt.add.f32"(%z, %y1) : (f32, f32) -> f32 - Infrt.return %z1 : f32 + %z = "infrt.add.f32"(%x, %y) : (f32, f32) -> f32 + %z1 = "infrt.add.f32"(%z, %y1) : (f32, f32) -> f32 + infrt.return %z1 : f32 } // CHECK-LABEL: @caller.add.f32 func @caller.add.f32() -> f32 { - %x = Infrt.constant.f32 1.0 - %y = Infrt.constant.f32 2.0 - %y1 = Infrt.constant.f32 3.0 - %z = Infrt.call @callee.add.f32(%x, %y, %y1) : (f32, f32, f32) -> f32 + %x = infrt.constant.f32 1.0 + %y = infrt.constant.f32 2.0 + %y1 = infrt.constant.f32 3.0 + %z = infrt.call @callee.add.f32(%x, %y, %y1) : (f32, f32, f32) -> f32 // CHECK-NEXT: 6 - "Infrt.print.f32"(%z) : (f32) -> () - Infrt.return %z : f32 + "infrt.print.f32"(%z) : (f32) -> () + infrt.return %z : f32 } /// <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< diff --git a/paddle/infrt/tests/dialect/benchmark.mlir b/paddle/infrt/tests/dialect/benchmark.mlir index 381fd534f6a5a09e3091203de88ebf00101074af..1a57b43499062410b346b38412a533d3edd6fbcc 100644 --- a/paddle/infrt/tests/dialect/benchmark.mlir +++ b/paddle/infrt/tests/dialect/benchmark.mlir @@ -12,13 +12,13 @@ func @benchmark() { // CHECK-LABEL: BM:add.f32:CPU 95%(ns) // CHECK-LABEL: BM:add.f32:CPU 99%(ns) // CHECK-LABEL: BM:add.f32:CPU utilization(percent) - Infrt.benchmark "add.f32"() duration_secs = 1, max_count = 3, num_warmup_runs = 3 + infrt.benchmark "add.f32"() duration_secs = 1, max_count = 3, num_warmup_runs = 3 { - %0 = Infrt.constant.f32 1.0 - %1 = Infrt.constant.f32 2.0 - %res = "Infrt.add.f32"(%0, %1) : (f32, f32) -> f32 - "Infrt.print.f32"(%res) : (f32) -> () - Infrt.return %res : f32 + %0 = infrt.constant.f32 1.0 + %1 = infrt.constant.f32 2.0 + %res = "infrt.add.f32"(%0, %1) : (f32, f32) -> f32 + "infrt.print.f32"(%res) : (f32) -> () + infrt.return %res : f32 } - Infrt.return + infrt.return } diff --git a/paddle/infrt/tests/dialect/dense_tensor.mlir b/paddle/infrt/tests/dialect/dense_tensor.mlir index faade62d35063b1d85c4c1d3ddad98b085a7726c..6dc9904610477139b6c254d0f9f7b754041a83cc 100644 --- a/paddle/infrt/tests/dialect/dense_tensor.mlir +++ b/paddle/infrt/tests/dialect/dense_tensor.mlir @@ -4,14 +4,14 @@ func @dense_shape0() { %shape = ts.build_shape [1:i64, 57:i64] %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.dense_tensor - Infrt.return + infrt.return } func @predict(%a: !infrt.dense_tensor, %b: !infrt.dense_tensor) -> (!infrt.dense_tensor, !infrt.dense_tensor) { %a0 = dt.shallow_copy_tensor %a : !infrt.dense_tensor -> !infrt.dense_tensor %b0 = dt.shallow_copy_tensor %b : !infrt.dense_tensor -> !infrt.dense_tensor - Infrt.return %a0, %b0: !infrt.dense_tensor, !infrt.dense_tensor + infrt.return %a0, %b0: !infrt.dense_tensor, !infrt.dense_tensor } @@ -19,6 +19,6 @@ func @main() { %shape = ts.build_shape [1:i64, 57:i64] %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.dense_tensor - %b, %c = Infrt.call @predict(%a, %a) : (!infrt.dense_tensor, !infrt.dense_tensor) -> (!infrt.dense_tensor, !infrt.dense_tensor) - Infrt.return + %b, %c = infrt.call @predict(%a, %a) : (!infrt.dense_tensor, !infrt.dense_tensor) -> (!infrt.dense_tensor, !infrt.dense_tensor) + infrt.return } diff --git a/paddle/infrt/tests/dialect/disabled_tensor_map.mlir b/paddle/infrt/tests/dialect/disabled_tensor_map.mlir index 1cae065bd5fb6a6a1aa06b4cd6605a240917b55f..936c8f32c01521817e185fa80e836018e7b02aa8 100644 --- a/paddle/infrt/tests/dialect/disabled_tensor_map.mlir +++ 
b/paddle/infrt/tests/dialect/disabled_tensor_map.mlir @@ -1,30 +1,30 @@ // CHECK-LABEL: @predict -func @predict(%input:!Infrt.tensor, %map: !Infrt.tensor_map) -> (!Infrt.tensor) { - %w = dt.get_param(%map, "create_parameter_0.w_0") -> !Infrt.tensor - %bias = dt.get_param(%map, "create_parameter_1.w_0") -> !Infrt.tensor +func @predict(%input:!infrt.dense_tensor, %map: !infrt.dense_tensor_map) -> (!infrt.dense_tensor) { + %w = dt.get_param(%map, "create_parameter_0.w_0") -> !infrt.dense_tensor + %bias = dt.get_param(%map, "create_parameter_1.w_0") -> !infrt.dense_tensor - %out = dt.create_uninit_tensor.f32 [3, 3] -> !Infrt.tensor + %out = dt.create_uninit_tensor.f32 [3, 3] -> !infrt.dense_tensor // fc - "external.matmul"(%input, %w, %out) {}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () - "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () - "external.sigmoid"(%out, %out) {}: (!Infrt.tensor, !Infrt.tensor) -> () - //dt.print_tensor (%out : !Infrt.tensor) + "external.matmul"(%input, %w, %out) {}: (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> () + "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> () + "external.sigmoid"(%out, %out) {}: (!infrt.dense_tensor, !infrt.dense_tensor) -> () + //dt.print_tensor (%out : !infrt.dense_tensor) - Infrt.return %out : !Infrt.tensor + infrt.return %out : !infrt.dense_tensor } // CHECK-LABEL: @main func @main() { - %input = dt.create_uninit_tensor.f32 [3, 3] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%input : !Infrt.tensor) {value=1.0:f32} + %input = dt.create_uninit_tensor.f32 [3, 3] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%input : !infrt.dense_tensor) {value=1.0:f32} // CHECK-LABEL: loading params %map = dt.load_params() {path="/Infrt/build/paddle/paddle_1.8_fc_model"} - %out = Infrt.call @predict(%input, %map): (!Infrt.tensor, !Infrt.tensor_map) -> (!Infrt.tensor) - dt.print_tensor (%out : !Infrt.tensor) + %out = infrt.call @predict(%input, %map): (!infrt.dense_tensor, !infrt.dense_tensor_map) -> (!infrt.dense_tensor) + dt.print_tensor (%out : !infrt.dense_tensor) - Infrt.return + infrt.return } diff --git a/paddle/infrt/tests/dialect/paddle_ops.mlir b/paddle/infrt/tests/dialect/paddle_ops.mlir index 48ee4b9d725c0aa36d4849c2842c99997de5c8ee..4b8055514936417dd83a6bb23afaea31eb2d1013 100644 --- a/paddle/infrt/tests/dialect/paddle_ops.mlir +++ b/paddle/infrt/tests/dialect/paddle_ops.mlir @@ -5,5 +5,5 @@ func @ops() { %b = pd.feed() {name="input1"}: tensor %d = pd.feed() {name="input3"}: !infrt.lod_tensor<3x4x9xf32, 0> %c = "pd.matmul"(%a, %b) {transpose_x=true, transpose_y=false} : (tensor, tensor) -> tensor - Infrt.return + infrt.return } diff --git a/paddle/infrt/tests/dialect/rewrite.mlir b/paddle/infrt/tests/dialect/pd/rewrite.mlir similarity index 97% rename from paddle/infrt/tests/dialect/rewrite.mlir rename to paddle/infrt/tests/dialect/pd/rewrite.mlir index 9fbb09e22449ff98a28b9e22732351ddbbc49dd0..ea0248b9d95d28e0160192a44f4c542d50a4892d 100644 --- a/paddle/infrt/tests/dialect/rewrite.mlir +++ b/paddle/infrt/tests/dialect/pd/rewrite.mlir @@ -1,4 +1,4 @@ -// RUN: infrtopt --canonicalize %s | FileCheck %s +// RUN: infrtopt --pd-op-fuse %s | FileCheck %s // CHECK-LABEL: @main func @main() -> tensor { %a = "pd.feed"() {name="input0"} : () -> tensor diff --git a/paddle/infrt/tests/dialect/phi/dense_tensor.mlir b/paddle/infrt/tests/dialect/phi/dense_tensor.mlir index 
e8f09f07c82c4003e23a54c7275f576f7916f853..b8cb1a5cec2a17d3f6d15036249fcf9f7f711948 100644 --- a/paddle/infrt/tests/dialect/phi/dense_tensor.mlir +++ b/paddle/infrt/tests/dialect/phi/dense_tensor.mlir @@ -3,14 +3,14 @@ // CHECK-LABEL: @sign_any_float32_execute func @sign_any_float32_execute() { %ctx = "phi_dt.create_context.cpu" (): () -> !phi.context - %t = "phi_dt.create_dense_tensor" (%ctx) { + %t = "phi_dt.create_dense_tensor.cpu" (%ctx) { precision=#infrt.precision, layout=#infrt.layout, lod=[1:i64], dims=[1:i64]}: (!phi.context) -> (!infrt.dense_tensor) "phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32]} : (!infrt.dense_tensor) -> () %e = "phi_cpu.sign.float32.any"(%ctx, %t) : (!phi.context, !infrt.dense_tensor) -> (!infrt.dense_tensor) - // CHECK: dense_tensor: shape=shape[1], values=[1] + // CHECK: dense_tensor: shape=shape[1], value=[1] "phi_dt.print_tensor" (%e) : (!infrt.dense_tensor) -> () - Infrt.return + infrt.return } diff --git a/paddle/infrt/tests/dialect/phi/phi_pass.mlir b/paddle/infrt/tests/dialect/phi/phi_pass.mlir index 61a66cb3d71a372bcd67cb96362abcb033768e4d..47badd97d37db578ec36f496b21212d73fd9920e 100644 --- a/paddle/infrt/tests/dialect/phi/phi_pass.mlir +++ b/paddle/infrt/tests/dialect/phi/phi_pass.mlir @@ -1,4 +1,5 @@ -// RUN: phi-ir-exec %s +// RUN: infrtopt -phi-op-convert -infrt-op-fuse %s + // CHECK-LABEL: @ops func @ops() { %a = pd.feed() {name="input0"} : !infrt.lod_tensor @@ -8,3 +9,10 @@ func @ops() { %h = "pd.abs"(%g):(tensor) -> tensor "pd.fetch"(%h) {name="output"} :(tensor)->() } + +// CHECK-LABEL: @op_execute +func @op_execute(%a:!infrt.lod_tensor, %b:!infrt.lod_tensor, %c:!infrt.lod_tensor) -> !infrt.lod_tensor { + %g = "pd.elementwise_add"(%a, %b) {axis=1:si32} : (!infrt.lod_tensor, !infrt.lod_tensor) -> tensor + %h = "pd.abs"(%g):(tensor) -> tensor + "pd.fetch"(%h) {name="output"} :(tensor)->() +} diff --git a/paddle/infrt/tests/dialect/phi/phi_test.mlir b/paddle/infrt/tests/dialect/phi/phi_test.mlir index 923f4e9d9d2ce6f6a24f91f04721f49712f900b5..21ee8ebf0b705894446192b0d5d0bfeb9f10f326 100644 --- a/paddle/infrt/tests/dialect/phi/phi_test.mlir +++ b/paddle/infrt/tests/dialect/phi/phi_test.mlir @@ -2,14 +2,14 @@ module { func @predict(%arg0: !infrt.dense_tensor) -> !infrt.dense_tensor { %2 = "pd.abs"(%arg0) : (!infrt.dense_tensor) -> !infrt.dense_tensor - Infrt.return %2 : !infrt.dense_tensor + infrt.return %2 : !infrt.dense_tensor } func @main() { %ctx = "phi_dt.create_context.cpu" (): () -> !phi.context - %t = "phi_dt.create_dense_tensor" (%ctx) {precision=#infrt.precision, layout=#infrt.layout, lod=[1:i64], dims=[1:i64]}: (!phi.context) -> (!infrt.dense_tensor) + %t = "phi_dt.create_dense_tensor.cpu" (%ctx) {precision=#infrt.precision, layout=#infrt.layout, lod=[1:i64], dims=[1:i64]}: (!phi.context) -> (!infrt.dense_tensor) "phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32]} : (!infrt.dense_tensor) -> () - %2 = Infrt.call@predict(%t) : (!infrt.dense_tensor) -> !infrt.dense_tensor + %2 = infrt.call@predict(%t) : (!infrt.dense_tensor) -> !infrt.dense_tensor phi_dt.print_tensor(%2 : !infrt.dense_tensor) - Infrt.return + infrt.return } } diff --git a/paddle/infrt/tests/dialect/tensor/dense_tensor.mlir b/paddle/infrt/tests/dialect/tensor/dense_tensor.mlir index 76ae140dd6cbd741f992315ee35d3e94058d4674..47bc1f7833140c8a876660673fa11f148d42db90 100644 --- a/paddle/infrt/tests/dialect/tensor/dense_tensor.mlir +++ b/paddle/infrt/tests/dialect/tensor/dense_tensor.mlir @@ -3,14 +3,14 @@ func @dense_shape0() { %a = dt.create_uninit_tensor.f32 [12:i64, 
23:i64] -> !infrt.dense_tensor - Infrt.return + infrt.return } func @predict(%a: !infrt.dense_tensor, %b: !infrt.dense_tensor) -> (!infrt.dense_tensor, !infrt.dense_tensor) { %a0 = dt.shallow_copy_tensor %a : !infrt.dense_tensor -> !infrt.dense_tensor %b0 = dt.shallow_copy_tensor %b : !infrt.dense_tensor -> !infrt.dense_tensor - Infrt.return %a0, %b0: !infrt.dense_tensor, !infrt.dense_tensor + infrt.return %a0, %b0: !infrt.dense_tensor, !infrt.dense_tensor } @@ -18,6 +18,6 @@ func @main() { %shape = ts.build_shape [1:i64, 57:i64] %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.dense_tensor - %b, %c = Infrt.call @predict(%a, %a) : (!infrt.dense_tensor, !infrt.dense_tensor) -> (!infrt.dense_tensor, !infrt.dense_tensor) - Infrt.return + %b, %c = infrt.call @predict(%a, %a) : (!infrt.dense_tensor, !infrt.dense_tensor) -> (!infrt.dense_tensor, !infrt.dense_tensor) + infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor/naive_kernels.mlir b/paddle/infrt/tests/dialect/tensor/naive_kernels.mlir index 52b296e06cd365fbaa1249108f877dc9f7480ff0..d6b69fdd595ea520f623e4b9651fc6e2b321c26f 100644 --- a/paddle/infrt/tests/dialect/tensor/naive_kernels.mlir +++ b/paddle/infrt/tests/dialect/tensor/naive_kernels.mlir @@ -13,7 +13,7 @@ func @naive_elementwise_add() { // CHECK: tensor: shape=shape[2,8], values=[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3] dt.print_tensor (%c : !infrt.dense_tensor) - Infrt.return + infrt.return } // RUN: infrtexec -i %s | FileCheck %s @@ -31,5 +31,5 @@ func @naive_matmul() { // CHECK: tensor: shape=shape[2,4], values=[16, 16, 16, 16, 16, 16, 16, 16] dt.print_tensor (%c : !infrt.dense_tensor) - Infrt.return + infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in b/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in index 28450ed6bd823f7d18eff19371a2a1a49292b329..7aeb3f8a4d0513deaed6bda73a591790b633d0db 100644 --- a/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in +++ b/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in @@ -3,12 +3,12 @@ func @load_tensor_map() { %map = dt.load_params(){path="@CMAKE_BINARY_DIR@/multi_fc_model"} %size = dt.tensor_map_get_size(%map) -> i32 - Infrt.print.i32 %size + infrt.print.i32 %size %a = dt.tensor_map_get_tensor(%map) {name="fc_bias"} -> !infrt.dense_tensor // CHECK: tensor: shape=shape[2], values=[0, 0] dt.print_tensor (%a : !infrt.dense_tensor) - Infrt.return + infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor/tensor_shape.mlir b/paddle/infrt/tests/dialect/tensor/tensor_shape.mlir index 5623aef71aa2c33ff0bd3524855c56e9dcab5e9b..09210078b9d7d139f2bc2534acf07e83aa1146bb 100644 --- a/paddle/infrt/tests/dialect/tensor/tensor_shape.mlir +++ b/paddle/infrt/tests/dialect/tensor/tensor_shape.mlir @@ -4,5 +4,5 @@ func @build_tensor1() { %a = ts.build_shape [1:i64, 57:i64, 92:i64] // CHECK: shape[1,57,92] ts.print_shape %a - Infrt.return + infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor/tensor_type.mlir b/paddle/infrt/tests/dialect/tensor/tensor_type.mlir index e580634055a72eae66196f67c8321c308599a1af..5847d567cf6b42a9404d33a938a67c6dc2f4aefc 100644 --- a/paddle/infrt/tests/dialect/tensor/tensor_type.mlir +++ b/paddle/infrt/tests/dialect/tensor/tensor_type.mlir @@ -6,5 +6,5 @@ func @test_tensor_type() { // CHECK: tensor: shape=shape[3,4], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] dt.print_tensor (%a : !infrt.dense_tensor) - Infrt.return + infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor_shape.mlir b/paddle/infrt/tests/dialect/tensor_shape.mlir index 
5623aef71aa2c33ff0bd3524855c56e9dcab5e9b..09210078b9d7d139f2bc2534acf07e83aa1146bb 100644 --- a/paddle/infrt/tests/dialect/tensor_shape.mlir +++ b/paddle/infrt/tests/dialect/tensor_shape.mlir @@ -4,5 +4,5 @@ func @build_tensor1() { %a = ts.build_shape [1:i64, 57:i64, 92:i64] // CHECK: shape[1,57,92] ts.print_shape %a - Infrt.return + infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor_type.mlir b/paddle/infrt/tests/dialect/tensor_type.mlir index e580634055a72eae66196f67c8321c308599a1af..5847d567cf6b42a9404d33a938a67c6dc2f4aefc 100644 --- a/paddle/infrt/tests/dialect/tensor_type.mlir +++ b/paddle/infrt/tests/dialect/tensor_type.mlir @@ -6,5 +6,5 @@ func @test_tensor_type() { // CHECK: tensor: shape=shape[3,4], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] dt.print_tensor (%a : !infrt.dense_tensor) - Infrt.return + infrt.return } diff --git a/paddle/infrt/tests/dialect/tensorrt/disabled_trt.mlir b/paddle/infrt/tests/dialect/tensorrt/disabled_trt.mlir new file mode 100644 index 0000000000000000000000000000000000000000..ef86dcf1e72a04c478a7763000cf366715665d81 --- /dev/null +++ b/paddle/infrt/tests/dialect/tensorrt/disabled_trt.mlir @@ -0,0 +1,37 @@ +// RUN: infrtexec -i %s | FileCheck %s + +// CHECK-LABEL: @run_trt +func @run_trt(%0 : !infrt.dense_tensor, %ctx : !phi.context) { + %a = "trt.create_engine"(%0) ({ + %1 = "trt.Activation"(%0) {activation_type = 1 : si32, alpha = 1.0 : f32, beta = 6.0 : f32} : (!infrt.dense_tensor) -> !infrt.dense_tensor + "infrt.return"(%1) : (!infrt.dense_tensor) -> () + }) : (!infrt.dense_tensor) -> !trt.engine + "trt.inspect_engine"(%a) {} : (!trt.engine) -> () + + %res = "trt.compute"(%a, %ctx) {} : (!trt.engine, !phi.context) -> (!infrt.tensor_list) + %size = "dt.tensor_list_get_size"(%res) {} : (!infrt.tensor_list) -> (i32) + "infrt.print.i32"(%size) {} : (i32) -> () + + %ts0 = "dt.tensor_list_get_tensor"(%res) {id = 0 : i32} : (!infrt.tensor_list) -> (!infrt.dense_tensor) + "phi_dt.print_tensor" (%ts0) : (!infrt.dense_tensor) -> () + + infrt.return +} + +// CHECK-LABEL: @main +func @main() { + %ctx = "phi_dt.create_context.gpu" (): () -> !phi.context + %t = "phi_dt.create_dense_tensor.gpu" (%ctx) { + precision=#infrt.precision, + layout=#infrt.layout, + dims=[1:i64, 3:i64, 1:i64, 1:i64], lod=[1:i64]}: (!phi.context) -> (!infrt.dense_tensor) + + "phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32, 2.4:f32, 1.3:f32]} : (!infrt.dense_tensor) -> () + "phi_dt.print_tensor" (%t) : (!infrt.dense_tensor) -> () + + //%res = + infrt.call @run_trt(%t, %ctx) : (!infrt.dense_tensor, !phi.context) -> () + //-> (!infrt.dense_tensor) + + infrt.return +} diff --git a/paddle/infrt/tests/dialect/tensorrt/disabled_trt_conv.mlir b/paddle/infrt/tests/dialect/tensorrt/disabled_trt_conv.mlir new file mode 100644 index 0000000000000000000000000000000000000000..c67d47415bfb002d2c7a91ee8b222c6227968d52 --- /dev/null +++ b/paddle/infrt/tests/dialect/tensorrt/disabled_trt_conv.mlir @@ -0,0 +1,54 @@ +// RUN: infrtexec -i %s | FileCheck %s + +// CHECK-LABEL: @run_trt +func @run_trt(%input_tensor : !infrt.dense_tensor, %kernel_weight : !infrt.dense_tensor, %kernel_bias : !infrt.dense_tensor, %gpu_ctx : !phi.context) { + %a = "trt.create_engine"(%input_tensor, %kernel_weight, %kernel_bias) ({ + %1 = "trt.Activation"(%input_tensor) {activation_type = 1 : si32, alpha = 1.0 : f32, beta = 6.0 : f32} : (!infrt.dense_tensor) -> !infrt.dense_tensor + %2 = "trt.Convolution"(%input_tensor, %kernel_weight, %kernel_bias) {out_channel_num = 3 : si32, kernel_size = [3:i32, 3:i32]} : 
(!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + "infrt.return"(%1, %2) : (!infrt.dense_tensor, !infrt.dense_tensor) -> () + }) : (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> !trt.engine + "trt.inspect_engine"(%a) {} : (!trt.engine) -> () + + %res = "trt.compute"(%a, %gpu_ctx) {} : (!trt.engine, !phi.context) -> (!infrt.tensor_list) + %size = "dt.tensor_list_get_size"(%res) {} : (!infrt.tensor_list) -> (i32) + "infrt.print.i32"(%size) {} : (i32) -> () + + %ts0 = "dt.tensor_list_get_tensor"(%res) {id = 0 : i32} : (!infrt.tensor_list) -> (!infrt.dense_tensor) + "phi_dt.print_tensor" (%ts0) : (!infrt.dense_tensor) -> () + + %ts1 = "dt.tensor_list_get_tensor"(%res) {id = 1 : i32} : (!infrt.tensor_list) -> (!infrt.dense_tensor) + "phi_dt.print_tensor" (%ts1) : (!infrt.dense_tensor) -> () + + infrt.return +} + +// CHECK-LABEL: @main +func @main() { + %gpu_ctx = "phi_dt.create_context.gpu" (): () -> !phi.context + %cpu_ctx = "phi_dt.create_context.cpu" (): () -> !phi.context + + %input_tensor = "phi_dt.create_dense_tensor.gpu" (%gpu_ctx) { + precision=#infrt.precision, + layout=#infrt.layout, + dims=[1:i64, 3:i64, 28:i64, 28:i64], lod=[0:i64]}: (!phi.context) -> (!infrt.dense_tensor) + "phi_dt.fill_dense_tensor.f32"(%input_tensor) {value=[3.8:f32, 2.4:f32, 1.3:f32]} : (!infrt.dense_tensor) -> () + // "phi_dt.print_tensor" (%input_tensor) : (!infrt.dense_tensor) -> () + + %kernel_weight = "phi_dt.create_dense_tensor.cpu"(%cpu_ctx) { + precision=#infrt.precision, + layout=#infrt.layout, + dims=[3:i64, 3:i64, 3:i64, 3:i64], lod=[0:i64]} : (!phi.context) -> (!infrt.dense_tensor) + "phi_dt.fill_dense_tensor.f32"(%kernel_weight) {value=[1.:f32, 2.:f32, 3.:f32, 4.:f32, 5.:f32, 6.:f32]} : (!infrt.dense_tensor) -> () + // "phi_dt.print_tensor" (%kernel_weight) : (!infrt.dense_tensor) -> () + + %kernel_bias = "phi_dt.create_dense_tensor.cpu"(%cpu_ctx) { + precision=#infrt.precision, + layout=#infrt.layout, + dims=[3:i64], lod=[0:i64]} : (!phi.context) -> (!infrt.dense_tensor) + "phi_dt.fill_dense_tensor.f32"(%kernel_bias) {value=[1.:f32]} : (!infrt.dense_tensor) -> () + // "phi_dt.print_tensor" (%kernel_bias) : (!infrt.dense_tensor) -> () + + infrt.call @run_trt(%input_tensor, %kernel_weight, %kernel_bias, %gpu_ctx) : (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor, !phi.context) -> () + + infrt.return +} diff --git a/paddle/infrt/tests/dialect/tensorrt/disabled_trt_fc.mlir b/paddle/infrt/tests/dialect/tensorrt/disabled_trt_fc.mlir new file mode 100644 index 0000000000000000000000000000000000000000..78dc4ac1c1093c1eb9b3fb30d0ea3f0cd5be6104 --- /dev/null +++ b/paddle/infrt/tests/dialect/tensorrt/disabled_trt_fc.mlir @@ -0,0 +1,46 @@ +// RUN: infrtexec -i %s | FileCheck %s + +// CHECK-LABEL: @main +func @main() { + %ctx = "phi_dt.create_context.gpu" (): () -> !phi.context + %cpu_ctx = "phi_dt.create_context.cpu" (): () -> !phi.context + + %input_tensor = "phi_dt.create_dense_tensor.gpu" (%ctx) { + precision=#infrt.precision, + layout=#infrt.layout, + dims=[1:i64, 3:i64, 1:i64, 1:i64], lod=[1:i64]}: (!phi.context) -> (!infrt.dense_tensor) + "phi_dt.fill_dense_tensor.f32"(%input_tensor) {value=[3.8:f32, 2.4:f32, 1.3:f32]} : (!infrt.dense_tensor) -> () + //"phi_dt.print_tensor" (%input_tensor) : (!infrt.dense_tensor) -> () + + %kernel_weight = "phi_dt.create_dense_tensor.cpu"(%cpu_ctx) { + precision=#infrt.precision, + layout=#infrt.layout, + dims=[2:i64, 3:i64], lod=[1:i64]} : (!phi.context) -> (!infrt.dense_tensor) + 
"phi_dt.fill_dense_tensor.f32"(%kernel_weight) {value=[1.:f32, 2.:f32, 3.:f32, 4.:f32, 5.:f32, 6.:f32]} : (!infrt.dense_tensor) -> () + //"phi_dt.print_tensor" (%kernel_weight) : (!infrt.dense_tensor) -> () + + %kernel_bias = "phi_dt.create_dense_tensor.cpu"(%cpu_ctx) { + precision=#infrt.precision, + layout=#infrt.layout, + dims=[2:i64], lod=[1:i64]} : (!phi.context) -> (!infrt.dense_tensor) + "phi_dt.fill_dense_tensor.f32"(%kernel_bias) {value=[1.:f32, 2.:f32]} : (!infrt.dense_tensor) -> () + //"phi_dt.print_tensor" (%kernel_bias) : (!infrt.dense_tensor) -> () + + %engine = "trt.create_engine"(%input_tensor, %kernel_weight, %kernel_bias) ({ + %1 = "trt.Activation"(%input_tensor) {activation_type = 1 : si32, alpha = 1.0 : f32, beta = 6.0 : f32} : (!infrt.dense_tensor) -> !infrt.dense_tensor + %2 = "trt.FullyConnected"(%input_tensor, %kernel_weight, %kernel_bias) {out_channel_num = 2 : si32} : (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + "infrt.return"(%1, %2) : (!infrt.dense_tensor, !infrt.dense_tensor) -> () + }) : (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> !trt.engine + + %res = "trt.compute"(%engine, %ctx) {} : (!trt.engine, !phi.context) -> (!infrt.tensor_list) + %size = "dt.tensor_list_get_size"(%res) {} : (!infrt.tensor_list) -> (i32) + "infrt.print.i32"(%size) {} : (i32) -> () + + %ts0 = "dt.tensor_list_get_tensor"(%res) {id = 0 : i32} : (!infrt.tensor_list) -> (!infrt.dense_tensor) + "phi_dt.print_tensor" (%ts0) : (!infrt.dense_tensor) -> () + + %ts1 = "dt.tensor_list_get_tensor"(%res) {id = 1 : i32} : (!infrt.tensor_list) -> (!infrt.dense_tensor) + "phi_dt.print_tensor" (%ts1) : (!infrt.dense_tensor) -> () + + infrt.return +} diff --git a/paddle/infrt/tests/dialect/trt_ops.mlir b/paddle/infrt/tests/dialect/trt_ops.mlir index 6d25044d139f32c0a29adefb44c8fd2640cadd82..7bdf62a277896afe2f8a5e156fa8183742f1d853 100644 --- a/paddle/infrt/tests/dialect/trt_ops.mlir +++ b/paddle/infrt/tests/dialect/trt_ops.mlir @@ -1,16 +1,16 @@ // RUN: trt-exec %s // CHECK-LABEL: @main -func @main(%bias:tensor, %c:tensor, %b1:tensor, %b2:tensor, %bias1:tensor, %bias2:tensor) -> tensor { - %d = "pd.elementwise_add"(%c, %bias) {axis=-1:si32} : (tensor, tensor) -> tensor - %e = "pd.relu6"(%d) {} : (tensor) -> tensor +func @main(%bias:!infrt.dense_tensor, %c:!infrt.dense_tensor, %b1:!infrt.dense_tensor, %b2:!infrt.dense_tensor, %bias1:!infrt.dense_tensor, %bias2:!infrt.dense_tensor) -> !infrt.dense_tensor { + %d = "pd.elementwise_add"(%c, %bias) {axis=-1:si32} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + %e = "pd.relu6"(%d) {} : (!infrt.dense_tensor) -> !infrt.dense_tensor - %c1 = "pd.matmul"(%e, %b1) {transpose_x=false, transpose_y=false} : (tensor, tensor) -> tensor - %d1 = "pd.elementwise_add"(%c1, %bias1) {axis=-1:si32} : (tensor, tensor) -> tensor - %e1 = "pd.relu"(%d1) {} : (tensor) -> tensor + %c1 = "pd.matmul"(%e, %b1) {transpose_x=false, transpose_y=false} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + %d1 = "pd.elementwise_add"(%c1, %bias1) {axis=-1:si32} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + %e1 = "pd.relu"(%d1) {} : (!infrt.dense_tensor) -> !infrt.dense_tensor - %c2 = "pd.matmul"(%e1, %b2) {transpose_x=true, transpose_y=false} : (tensor, tensor) -> tensor - %d2 = "pd.elementwise_add"(%c2, %bias2) {axis=-1:si32} : (tensor, tensor) -> tensor - %e2 = "pd.relu"(%d2) {} : (tensor) -> tensor + %c2 = "pd.matmul"(%e1, %b2) {transpose_x=true, 
transpose_y=false} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + %d2 = "pd.elementwise_add"(%c2, %bias2) {axis=-1:si32} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + %e2 = "pd.relu"(%d2) {} : (!infrt.dense_tensor) -> !infrt.dense_tensor - "infrt.return"(%e2) : (tensor)->() + infrt.return %e2 : !infrt.dense_tensor } diff --git a/paddle/infrt/tests/model/abs_model.py b/paddle/infrt/tests/model/abs_model.py new file mode 100644 index 0000000000000000000000000000000000000000..dd1632bc9d4d8e4e6ea0fb918d1179f4e28a441b --- /dev/null +++ b/paddle/infrt/tests/model/abs_model.py @@ -0,0 +1,38 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle.nn import Layer +from paddle.static import InputSpec +from paddle.jit import to_static +import sys + + +class AbsNet(paddle.nn.Layer): + def __init__(self): + super(AbsNet, self).__init__() + + def forward(self, x): + x = paddle.abs(x) + return x + + +if __name__ == '__main__': + # build network + model = AbsNet() + # save inferencing format model + net = to_static( + model, input_spec=[InputSpec( + shape=[None, 1, 28, 28], name='x')]) + paddle.jit.save(net, sys.argv[1]) diff --git a/paddle/infrt/tests/model/test_abs.cc b/paddle/infrt/tests/model/test_abs.cc new file mode 100644 index 0000000000000000000000000000000000000000..49266910dbd278fb8d429534134097751cf8b6b1 --- /dev/null +++ b/paddle/infrt/tests/model/test_abs.cc @@ -0,0 +1,126 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
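+//
+// End-to-end check for the converter: abs_model.py (above) is expected to
+// export ./abs.pdmodel and ./abs.pdiparams, this test imports them with
+// MLIRModelGenImpl, lowers the module with phi-op-convert and infrt-op-fuse,
+// registers the host/phi kernels, and executes the lowered module via
+// TestMlir.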
+ +#include +#include +#include +#include +#include + +#include "llvm/Support/DynamicLibrary.h" +#include "paddle/infrt/common/global.h" +#include "paddle/infrt/dialect/mlir_loader.h" +#include "paddle/infrt/host_context/core_runtime.h" +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/mlir_to_runtime_translate.h" +#include "paddle/infrt/kernel/basic_kernels.h" +#include "paddle/infrt/kernel/control_flow_kernels.h" +#include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h" +#include "paddle/infrt/kernel/phi/registry.h" +#include "paddle/infrt/kernel/tensor_kernels.h" +#include "paddle/infrt/kernel/tensor_shape_kernels.h" +#include "paddle/infrt/kernel/test_kernels.h" + +#include "paddle/infrt/kernel/phi/infershaped/infershaped_utils.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/meta_tensor.h" + +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" + +#include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h" +#include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h" +#include "paddle/infrt/host_context/paddle_mlir.h" + +#include "paddle/infrt/dialect/dense_tensor.h" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" +#include "paddle/infrt/dialect/phi/ir/phi_base.h" +#include "paddle/infrt/dialect/phi/ir/phi_kernels.h" + +static llvm::cl::list cl_shared_libs( // NOLINT + "shared_libs", + llvm::cl::desc("Specify shared library with kernels."), + llvm::cl::ZeroOrMore, + llvm::cl::MiscFlags::CommaSeparated); + +TEST(ABS_MODEL, convert_and_execute) { + std::string model_file_name = "./abs.pdmodel"; + std::string params_file_name = "./abs.pdiparams"; + // convert model + MLIRModelGenImpl myGen; + auto module_ = myGen.ImportPaddleModel(model_file_name, params_file_name); + module_.dump(); + // pick kernel + mlir::MLIRContext* context = infrt::Global::getMLIRContext(); + context->allowUnregisteredDialects(); + context->getOrLoadDialect(); + + context->getOrLoadDialect(); + context->getOrLoadDialect(); + context->getOrLoadDialect(); + context->getOrLoadDialect(); + context->getOrLoadDialect(); + + context->getOrLoadDialect(); + context->getOrLoadDialect(); + context->getOrLoadDialect(); + context->getOrLoadDialect(); + + context->loadAllAvailableDialects(); + mlir::PassManager pm(context); + + mlir::OpPassManager& phi_pass_manager = pm.nest(); + std::vector valid_places = {{infrt::TargetType::CPU, + infrt::PrecisionType::FLOAT32, + infrt::LayoutType::NCHW}}; + phi_pass_manager.addPass(infrt::createPhiOpCvtPass(valid_places)); + phi_pass_manager.addPass(infrt::createInfrtOpFusePass()); + + if (mlir::failed(pm.run(module_))) { + std::cout << "\npass failed!\n" << std::endl; + } + module_.dump(); + + // execute + infrt::host_context::KernelRegistry registry; + infrt::kernel::RegisterBasicKernels(&registry); + infrt::kernel::RegisterTestKernels(&registry); + infrt::kernel::RegisterTensorShapeKernels(&registry); + infrt::kernel::RegisterTensorKernels(&registry); + infrt::kernel::RegisterControlFlowKernels(&registry); + infrt::kernel::RegisterPhiKernels(&registry); + infrt::kernel::RegisterInferShapeLaunchers(&registry); + // load extra shared library + for (const auto& lib_path : cl_shared_libs) { + std::string err; + llvm::sys::DynamicLibrary dynLib = + llvm::sys::DynamicLibrary::getPermanentLibrary(lib_path.c_str(), &err); + if (!dynLib.isValid()) { + llvm::errs() 
<< "Load shared library failed. Error: " << err << "\n"; + break; + } + if (auto reg_sym = dynLib.SearchForAddressOfSymbol("RegisterKernels")) { + auto reg_func = + reinterpret_cast( + reg_sym); + reg_func(®istry); + } else { + llvm::outs() << "Symbol \"RegisterKernels\" not found in \"" << lib_path + << "\". Skip.\n"; + } + } + infrt::host_context::TestMlir(module_, ®istry); +} diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index 7b074d0ebb76d110dc361140bd42f78ef54f224b..04e1bbcc9df423bc38e78822ec6ef8ee28c5b216 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -25,8 +25,6 @@ add_subdirectory(tests) # make an unity target for compile deps set(PHI_DEPS convert_utils dense_tensor phi_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos sparse_csr_tensor sparse_coo_tensor) get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) -# keep this message for debug, remove it later if needless -message(STATUS "All standard phi kernels: ${phi_kernels}") set(PHI_DEPS ${PHI_DEPS} ${phi_kernels}) cc_library(phi DEPS ${PHI_DEPS}) diff --git a/paddle/phi/api/ext/op_meta_info.h b/paddle/phi/api/ext/op_meta_info.h index 7601696293a66d626a7fd417b32544d035921467..88660449b6821ef4cda2d1859c5551d0a00d59a6 100644 --- a/paddle/phi/api/ext/op_meta_info.h +++ b/paddle/phi/api/ext/op_meta_info.h @@ -86,19 +86,28 @@ class PADDLE_API CustomOpKernelContext { CustomOpKernelContext() = default; void EmplaceBackInput(Tensor&& input); - void EmplaceBackInputs(std::vector&& inputs); + void EmplaceBackInputs(const std::vector& inputs); void EmplaceBackOutput(Tensor&& output); - void EmplaceBackOutputs(std::vector&& outputs); + void EmplaceBackOutputs(const std::vector& outputs); void EmplaceBackAttr(paddle::any attr); - + void EmplaceBackAttrs(const std::vector& attrs) { + attrs_ = std::move(attrs); + } const std::pair& InputRangeAt(size_t idx) const; const std::pair& OutputRangeAt(size_t idx) const; const Tensor& InputAt(size_t idx) const; std::vector InputsBetween(size_t start, size_t end) const; - + const std::vector& Attrs() const { return attrs_; } + const std::vector>& InputRange() { + return input_range_; + } + const std::vector>& OutputRange() { + return output_range_; + } Tensor* MutableOutputAt(size_t idx); std::vector MutableOutputBetweeen(size_t start, size_t end); + std::vector OutputsBetweeen(size_t start, size_t end); std::vector* AllMutableOutput(); template @@ -552,7 +561,6 @@ class PADDLE_API OpMetaInfo { std::vector inputs_; std::vector outputs_; std::vector attrs_; - // 2. func info KernelFunc kernel_fn_{nullptr}; InferShapeFunc infer_shape_fn_{nullptr}; diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index c268742fa567bffecb2fd17a773ab56aee019853..c58ebe69523eb97a4357bea061de64e5e01ec181 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -225,6 +225,22 @@ class PADDLE_API Tensor final { */ bool is_selected_rows() const; + /** + * @brief Determine whether tensor is SparseCooTensor + * + * @return true + * @return false + */ + bool is_sparse_coo_tensor() const; + + /** + * @brief Determine whether tensor is SparseCsrTensor + * + * @return true + * @return false + */ + bool is_sparse_csr_tensor() const; + /* Part 3: Device and Backend methods */ /** @@ -324,7 +340,7 @@ class PADDLE_API Tensor final { * * @return std::shared_ptr */ - std::shared_ptr impl() const; + const std::shared_ptr& impl() const; /** * @brief Set the implemention of current Tensor. 
@@ -333,6 +349,13 @@ class PADDLE_API Tensor final { */ void set_impl(const std::shared_ptr& impl); + /** + * @brief Set the implemention of current Tensor. + * + * @param impl + */ + void set_impl(std::shared_ptr&& impl); + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) /** * @brief Get the stream where the tensor is currently located @@ -397,7 +420,9 @@ class PADDLE_API Tensor final { * @param blocking, Should we copy this in sync way. * @return void */ - void copy_(const Tensor& src, const bool blocking); + void copy_(const Tensor& src, + const phi::Place& target_place, + const bool blocking); /** * @brief Cast datatype from one to another * @@ -472,7 +497,21 @@ class PADDLE_API Tensor final { */ void set_autograd_meta(std::shared_ptr autograd_meta); - /* Part 9: Auto generated Tensor methods */ + /* Part 9: Inplace methods */ + + /** + * @brief Increase inplace version + */ + void bump_inplace_version(); + + /** + * @brief Get current inplace version + * + * @return uint32_t + */ + uint32_t current_inplace_version(); + + /* Part 10: Auto generated Tensor methods */ private: /** diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index 42bf7a8103f837195775b33daf301a7d2e0f4c44..4cbca07236208281f38984022d17b6cb88af8ed8 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -148,4 +148,4 @@ cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw ph cc_library(sparse_api SRCS ${sparse_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api_custom_impl) cc_library(sparse_bw_api SRCS ${sparse_bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api sparse_api_custom_impl) -cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api) +cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api api_gen_utils kernel_dispatch infermeta) diff --git a/paddle/phi/api/lib/api_gen_utils.cc b/paddle/phi/api/lib/api_gen_utils.cc index e1ebe8c6465cfdd7f8213c0a31416bc77412221c..0c11e2df65d0db23b4e080bf041c78d976714013 100644 --- a/paddle/phi/api/lib/api_gen_utils.cc +++ b/paddle/phi/api/lib/api_gen_utils.cc @@ -95,12 +95,8 @@ paddle::optional MakeMetaTensor( /* ------------------ for output ----------------------- */ phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out) { - if (!out->initialized()) { - auto dense_tensor = std::make_shared( - phi::make_intrusive(phi::TransToPhiPlace(backend)), - phi::DenseTensorMeta()); - out->set_impl(dense_tensor); - return dense_tensor.get(); + if (out->impl() == nullptr) { + out->set_impl(std::make_shared()); } return static_cast(out->impl().get()); } @@ -111,9 +107,7 @@ std::vector SetKernelOutput(size_t out_size, out->reserve(out_size); std::vector results(out_size); for (size_t i = 0; i < out_size; ++i) { - auto tensor_ptr = std::make_shared( - phi::make_intrusive(phi::TransToPhiPlace(backend)), - phi::DenseTensorMeta()); + auto tensor_ptr = std::make_shared(); results[i] = tensor_ptr.get(); out->emplace_back(); out->back().set_impl(tensor_ptr); diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index 79b8ac6d0b8352b2e817e6bdbefca74c835ad6b2..e280ab626da74a9b0951925f7472fa49996691cb 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -167,10 +167,7 @@ phi::DenseTensor TransformData(const phi::DenseTensor& tensor, if (NeedTransformPlace( out.place(), target_args_def.backend, 
transform_flag)) { - phi::DenseTensor result( - phi::make_intrusive( - phi::TransToPhiPlace(target_args_def.backend)), - {out.dtype(), out.dims(), out.layout()}); + phi::DenseTensor result; framework::TransDataDevice( out, phi::TransToPhiPlace(target_args_def.backend), &result); out = result; @@ -190,14 +187,14 @@ std::shared_ptr PrepareData( tensor_in->dtype(), target_args_def.dtype, transform_flag) && !NeedTransformLayout( tensor_in->layout(), target_args_def.layout, transform_flag))) { - return std::dynamic_pointer_cast(tensor_in); + return std::static_pointer_cast(tensor_in); } phi::DenseTensor out = TransformData(*(static_cast(tensor_in.get())), target_args_def, transform_flag); - return std::make_shared(out); + return std::make_shared(std::move(out)); } std::shared_ptr PrepareData( diff --git a/paddle/phi/api/lib/op_meta_info.cc b/paddle/phi/api/lib/op_meta_info.cc index 51d51c954de81d8e9116304e31374ce8d9934305..14dba664c41b3d7b138630c739bfe7b934d04e9f 100644 --- a/paddle/phi/api/lib/op_meta_info.cc +++ b/paddle/phi/api/lib/op_meta_info.cc @@ -51,7 +51,8 @@ void CustomOpKernelContext::EmplaceBackInput(Tensor&& input) { input_range_.emplace_back(std::make_pair(index, index + 1)); } -void CustomOpKernelContext::EmplaceBackInputs(std::vector&& inputs) { +void CustomOpKernelContext::EmplaceBackInputs( + const std::vector& inputs) { size_t index = inputs_.size(); input_range_.emplace_back(std::make_pair(index, index + inputs.size())); inputs_.insert(inputs_.end(), @@ -65,7 +66,8 @@ void CustomOpKernelContext::EmplaceBackOutput(Tensor&& output) { output_range_.emplace_back(std::make_pair(index, index + 1)); } -void CustomOpKernelContext::EmplaceBackOutputs(std::vector&& outputs) { +void CustomOpKernelContext::EmplaceBackOutputs( + const std::vector& outputs) { size_t index = outputs_.size(); output_range_.emplace_back(std::make_pair(index, index + outputs.size())); outputs_.insert(outputs_.end(), @@ -75,6 +77,8 @@ void CustomOpKernelContext::EmplaceBackOutputs(std::vector&& outputs) { void CustomOpKernelContext::EmplaceBackAttr(paddle::any attr) { attrs_.emplace_back(std::move(attr)); + VLOG(7) << "attrs_ No." << attrs_.size() - 1 + << " has value of type: " << attrs_[attrs_.size() - 1].type().name(); } const Tensor& CustomOpKernelContext::InputAt(size_t idx) const { @@ -102,6 +106,15 @@ std::vector CustomOpKernelContext::MutableOutputBetweeen(size_t start, return rlt; } +std::vector CustomOpKernelContext::OutputsBetweeen(size_t start, + size_t end) { + std::vector rlt; + for (size_t i = start; i < end; ++i) { + rlt.emplace_back(outputs_.at(i)); + } + return rlt; +} + std::vector* CustomOpKernelContext::AllMutableOutput() { return &outputs_; } diff --git a/paddle/phi/api/lib/sparse_api_custom_impl.cc b/paddle/phi/api/lib/sparse_api_custom_impl.cc index 832c19361e5eb03419fe988c9a30304b5993afdf..8f8de02e49bdff8d3e026ca6dbed8948637260ae 100644 --- a/paddle/phi/api/lib/sparse_api_custom_impl.cc +++ b/paddle/phi/api/lib/sparse_api_custom_impl.cc @@ -25,25 +25,24 @@ namespace paddle { namespace experimental { namespace sparse { -Tensor to_sparse_coo_impl(const Tensor& x, - Backend backend, - const int64_t sparse_dim) { +Tensor to_sparse_coo_impl(const Tensor& x, const int64_t sparse_dim) { if (x.layout() == phi::DataLayout::SPARSE_COO) { return x; } + // 1. 
Get kernel signature and kernel - auto kernel_key_set = ParseKernelKeyByInputArgs(x); - kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend); - auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); std::string kernel_name = "dense_to_sparse_coo"; if (x.layout() == phi::DataLayout::SPARSE_CSR) { kernel_name = "sparse_csr_to_coo"; } + auto kernel_key_set = ParseKernelKeyByInputArgs(x); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( kernel_name, kernel_key); - VLOG(6) << "to API kernel key: " << kernel_key; + VLOG(6) << "add API kernel key: " << kernel_key; VLOG(6) << "to API kernel: " << kernel; // 2. Get Device Context @@ -62,18 +61,18 @@ Tensor to_sparse_coo_impl(const Tensor& x, // 4. InferMeta auto indices_meta = - phi::DenseTensorMeta(phi::DataType::INT64, {-1}, phi::DataLayout::NCHW); - auto elements_meta = phi::DenseTensorMeta(x.dtype(), {-1}, x.layout()); + phi::DenseTensorMeta(phi::DataType::INT64, {1}, phi::DataLayout::NCHW); + auto elements_meta = phi::DenseTensorMeta(x.dtype(), {1}, x.layout()); // 5. Prepare outputs // create empty SparseCooTensor phi::DenseTensor non_zero_indices( phi::make_intrusive( - phi::TransToPhiPlace(backend)), + phi::TransToPhiPlace(kernel_key.backend())), std::move(indices_meta)); phi::DenseTensor non_zero_elements( phi::make_intrusive( - phi::TransToPhiPlace(backend)), + phi::TransToPhiPlace(kernel_key.backend())), std::move(elements_meta)); auto coo = std::make_shared( non_zero_indices, non_zero_elements, x.dims()); @@ -88,23 +87,23 @@ Tensor to_sparse_coo_impl(const Tensor& x, return out; } -Tensor to_sparse_csr_impl(const Tensor& x, Backend backend) { +Tensor to_sparse_csr_impl(const Tensor& x) { if (x.layout() == phi::DataLayout::SPARSE_CSR) { return x; } // 1. Get kernel signature and kernel - auto kernel_key_set = ParseKernelKeyByInputArgs(x); - kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend); - auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); std::string kernel_name = "dense_to_sparse_csr"; if (x.layout() == phi::DataLayout::SPARSE_COO) { kernel_name = "sparse_coo_to_csr"; } + auto kernel_key_set = ParseKernelKeyByInputArgs(x); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( kernel_name, kernel_key); - VLOG(6) << "to API kernel key: " << kernel_key; + VLOG(6) << "add API kernel key: " << kernel_key; VLOG(6) << "to API kernel: " << kernel; // 2. Get Device Context @@ -122,24 +121,24 @@ Tensor to_sparse_csr_impl(const Tensor& x, Backend backend) { // 4. InferMeta auto crows_meta = - phi::DenseTensorMeta(phi::DataType::INT64, {-1}, phi::DataLayout::NCHW); + phi::DenseTensorMeta(phi::DataType::INT64, {1}, phi::DataLayout::NCHW); auto cols_meta = - phi::DenseTensorMeta(phi::DataType::INT64, {-1}, phi::DataLayout::NCHW); - auto elements_meta = phi::DenseTensorMeta(x.dtype(), {-1}, x.layout()); + phi::DenseTensorMeta(phi::DataType::INT64, {1}, phi::DataLayout::NCHW); + auto elements_meta = phi::DenseTensorMeta(x.dtype(), {1}, x.layout()); // 5. 
Prepare outputs // create empty SparseCooTensor phi::DenseTensor non_zero_crows( phi::make_intrusive( - phi::TransToPhiPlace(backend)), + phi::TransToPhiPlace(kernel_key.backend())), std::move(crows_meta)); phi::DenseTensor non_zero_cols( phi::make_intrusive( - phi::TransToPhiPlace(backend)), + phi::TransToPhiPlace(kernel_key.backend())), std::move(cols_meta)); phi::DenseTensor non_zero_elements( phi::make_intrusive( - phi::TransToPhiPlace(backend)), + phi::TransToPhiPlace(kernel_key.backend())), std::move(elements_meta)); auto csr = std::make_shared( non_zero_crows, non_zero_cols, non_zero_elements, x.dims()); @@ -154,24 +153,25 @@ Tensor to_sparse_csr_impl(const Tensor& x, Backend backend) { return out; } -Tensor to_dense_impl(const Tensor& x, Backend backend) { +Tensor to_dense_impl(const Tensor& x) { if (x.layout() != phi::DataLayout::SPARSE_CSR && x.layout() != phi::DataLayout::SPARSE_COO) { return x; } + // 1. Get kernel signature and kernel - auto kernel_key_set = ParseKernelKeyByInputArgs(x); - kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend); - auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); std::string kernel_name = "sparse_coo_to_dense"; if (x.layout() == phi::DataLayout::SPARSE_CSR) { kernel_name = "sparse_csr_to_dense"; } + auto kernel_key_set = ParseKernelKeyByInputArgs(x); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( kernel_name, kernel_key); - VLOG(6) << "to API kernel key: " << kernel_key; + VLOG(6) << "add API kernel key: " << kernel_key; VLOG(6) << "to API kernel: " << kernel; // 2. Get Device Context @@ -194,7 +194,7 @@ Tensor to_dense_impl(const Tensor& x, Backend backend) { // create empty SparseCooTensor auto dense_out = std::make_shared( phi::make_intrusive( - phi::TransToPhiPlace(backend)), + phi::TransToPhiPlace(kernel_key.backend())), std::move(dense_meta)); kernel_context.EmplaceBackOutput(dense_out.get()); diff --git a/paddle/phi/api/lib/sparse_api_custom_impl.h b/paddle/phi/api/lib/sparse_api_custom_impl.h index 293b2cfa3d33480ccccd0f601f8e15c639b93e1e..6053d281f0ff169db2868cb7fd949b75f816616a 100644 --- a/paddle/phi/api/lib/sparse_api_custom_impl.h +++ b/paddle/phi/api/lib/sparse_api_custom_impl.h @@ -21,13 +21,11 @@ namespace paddle { namespace experimental { namespace sparse { -Tensor to_dense_impl(const Tensor& x, Backend backend); +Tensor to_dense_impl(const Tensor& x); -Tensor to_sparse_coo_impl(const Tensor& x, - Backend backend, - const int64_t sparse_dim); +Tensor to_sparse_coo_impl(const Tensor& x, const int64_t sparse_dim); -Tensor to_sparse_csr_impl(const Tensor& x, Backend backend); +Tensor to_sparse_csr_impl(const Tensor& x); } // namespace sparse } // namespace experimental diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index 311dd0fc30941d2afb9f1bc1e7ae57f3a449a254..066287d4244797e316a34d524eca171e94afcfdf 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -25,6 +25,8 @@ limitations under the License. */ #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/selected_rows.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/core/sparse_csr_tensor.h" #include "paddle/phi/core/tensor_base.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/core/tensor_utils.h" @@ -46,6 +48,7 @@ limitations under the License. 
*/ * In the future, the necessary components will be moved to the this library, * or the corresponding components will be re-implemented. */ + #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/stream/cuda_stream.h" @@ -111,8 +114,8 @@ void Tensor::reshape(const std::vector &shape) { "touching underlying data, this requires the total size of " "the tensor to remain constant."; if (is_dense_tensor()) { - std::dynamic_pointer_cast(impl_)->set_meta( - phi::DenseTensorMeta(dtype(), phi::make_ddim(shape))); + std::dynamic_pointer_cast(impl_)->Resize( + phi::make_ddim(shape)); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support reshape operation on DenseTensor now.")); @@ -131,6 +134,12 @@ bool Tensor::is_dense_tensor() const { bool Tensor::is_selected_rows() const { return phi::SelectedRows::classof(impl_.get()); } +bool Tensor::is_sparse_coo_tensor() const { + return phi::SparseCooTensor::classof(impl_.get()); +} +bool Tensor::is_sparse_csr_tensor() const { + return phi::SparseCsrTensor::classof(impl_.get()); +} /* Part 3: Device and Backend methods */ PlaceType Tensor::place() const { @@ -142,7 +151,12 @@ PlaceType Tensor::place() const { } paddle::platform::Place Tensor::inner_place() const { - return ConvertExtPlaceToInnerPlace(place()); + PADDLE_ENFORCE_NOT_NULL( + impl_, + phi::errors::PermissionDenied( + "Null pointer error, the impl_ of Tensor should not be " + "Null when calling Tensor::inner_place().")); + return impl_->place(); } bool Tensor::is_cpu() const { @@ -286,12 +300,16 @@ Tensor Tensor::slice(int64_t begin_idx, int64_t end_idx) const { } } -std::shared_ptr Tensor::impl() const { return impl_; } +const std::shared_ptr &Tensor::impl() const { return impl_; } void Tensor::set_impl(const std::shared_ptr &impl) { impl_ = impl; } +void Tensor::set_impl(std::shared_ptr &&impl) { + impl_ = std::move(impl); +} + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t Tensor::stream() const { return platform::stream::get_current_stream(-1)->raw_stream(); @@ -337,5 +355,36 @@ void Tensor::set_autograd_meta( autograd_meta_ = std::move(autograd_meta); } +void Tensor::bump_inplace_version() { + if (is_dense_tensor()) { + auto &inplace_version_counter = + std::dynamic_pointer_cast(impl_) + ->InplaceVersionCounter(); + VLOG(3) << "yoki: before bump inplace version: " + << inplace_version_counter.CurrentVersion(); + inplace_version_counter.Bump(); + VLOG(3) << "yoki: after bump inplace version: " + << inplace_version_counter.CurrentVersion(); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "bump_inplace_version is only supported on DenseTensor now.")); + } +} + +uint32_t Tensor::current_inplace_version() { + if (is_dense_tensor()) { + auto &inplace_version_counter = + std::dynamic_pointer_cast(impl_) + ->InplaceVersionCounter(); + VLOG(3) << "yoki: print version: " + << inplace_version_counter.CurrentVersion(); + return inplace_version_counter.CurrentVersion(); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "current_inplace_version is only supported on DenseTensor now.")); + } + return 0; +} + } // namespace experimental } // namespace paddle diff --git a/paddle/phi/api/lib/tensor_method.cc b/paddle/phi/api/lib/tensor_method.cc index aefa26952d1e5f224112576bfbd74be80cca72cc..cc797507e68ec11005d6ac35d5dca2d19418598d 100644 --- a/paddle/phi/api/lib/tensor_method.cc +++ b/paddle/phi/api/lib/tensor_method.cc @@ -15,12 +15,16 @@ limitations under the License. 
*/ #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/api/lib/ext_compat_utils.h" +#include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/tensor_base.h" +#include "paddle/phi/api/lib/api_gen_utils.h" +#include "paddle/phi/api/lib/kernel_dispatch.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace experimental { - // declare cast api Tensor cast(const Tensor &x, DataType out_dtype); Tensor copy_to(const Tensor &x, Backend backend, bool blocking); @@ -66,12 +70,18 @@ template PADDLE_API Tensor Tensor::copy_to>( template PADDLE_API Tensor Tensor::copy_to(const PlaceType &target_place) const; -void Tensor::copy_(const Tensor &src, bool blocking) { +void Tensor::copy_(const Tensor &src, + const phi::Place &target_place, + bool blocking) { if (!src.is_initialized()) { + VLOG(8) << "Src is empty, skip copy"; return; } + // Prepare copy kernel key and outputs + auto kernel_key_set = ParseKernelKeyByInputArgs(src); + KernelType kernel_type = ParseKernelTypeByInputArgs(src); VLOG(3) << "Deep copy Tensor from " << src.name() << " to " << name(); - if (defined()) { + if (is_initialized()) { PADDLE_ENFORCE_EQ(dtype(), src.dtype(), platform::errors::PreconditionNotMet( @@ -86,10 +96,91 @@ void Tensor::copy_(const Tensor &src, bool blocking) { "Copy cannot be performed!", name(), src.name())); + PADDLE_ENFORCE_EQ(target_place, + inner_place(), + platform::errors::PreconditionNotMet( + "Place is different of dst tensor and args %s, which " + "current tensor holds %s " + "Copy cannot be performed!", + target_place.DebugString(), + inner_place().DebugString())); + kernel_key_set.backend_set = + kernel_key_set.backend_set | + BackendSet(phi::TransToPhiBackend(inner_place())); + } else { + // Deep Copy AutoGrad info from src to self. 
+ *autograd_meta_ = *(src.autograd_meta_); + } + + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + auto *dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); + Backend kernel_backend = Backend::UNDEFINED; + DataLayout kernel_layout = DataLayout::UNDEFINED; + DataType kernel_data_type = DataType::UNDEFINED; + + if (kernel_backend == Backend::UNDEFINED || + kernel_layout == DataLayout::UNDEFINED || + kernel_data_type == DataType::UNDEFINED) { + if (kernel_backend == Backend::UNDEFINED) { + kernel_backend = kernel_key.backend(); + } + if (kernel_layout == DataLayout::UNDEFINED) { + kernel_layout = kernel_key.layout(); + } + if (kernel_data_type == DataType::UNDEFINED) { + kernel_data_type = kernel_key.dtype(); + } + } + + if (kernel_type == KernelType::DENSE_TENSOR_KENREL) { + auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + "copy", {kernel_backend, kernel_layout, kernel_data_type}); + VLOG(6) << "copy API kernel key: " << kernel_key; + VLOG(6) << "copy API kernel: " << kernel; + using kernel_signature = void (*)(const platform::DeviceContext &, + const phi::DenseTensor &, + phi::Place, + bool, + phi::DenseTensor *); + SetKernelOutput(kernel_backend, this); + phi::MetaTensor meta_out(impl_.get()); + phi::UnchangedInferMeta( + MakeMetaTensor( + *(std::static_pointer_cast(src.impl_))), + &meta_out); + auto *kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, + (*(std::static_pointer_cast(src.impl_))), + target_place, + blocking, + static_cast(impl_.get())); + } else if (kernel_type == KernelType::SELECTED_ROWS_KENREL) { + auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + "copy_sr", {kernel_backend, kernel_layout, kernel_data_type}); + VLOG(6) << "copy API kernel key: " << kernel_key; + VLOG(6) << "copy API kernel: " << kernel; + using kernel_signature = void (*)(const platform::DeviceContext &, + const phi::SelectedRows &, + phi::Place, + bool, + phi::SelectedRows *); + SetSelectedRowsKernelOutput(kernel_backend, this); + phi::MetaTensor meta_out(impl_.get()); + phi::UnchangedInferMeta( + MakeMetaTensor( + *(std::static_pointer_cast(src.impl_))), + &meta_out); + auto *kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, + (*(std::static_pointer_cast(src.impl_))), + target_place, + blocking, + static_cast(impl_.get())); + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "We currently only support dense tensor copy for now and if u need to " + "copy selected rows please raise a issue.")); } - auto copy_tensor = - src.copy_to(phi::TransToPhiBackend(src.inner_place()), blocking); - set_impl(copy_tensor.impl()); } } // namespace experimental diff --git a/paddle/phi/api/lib/utils/CMakeLists.txt b/paddle/phi/api/lib/utils/CMakeLists.txt index 6d056b54b70058e33501083d9754aa27466c0f59..271a58222f0c0f6b60642482691bed635e4d5f3c 100644 --- a/paddle/phi/api/lib/utils/CMakeLists.txt +++ b/paddle/phi/api/lib/utils/CMakeLists.txt @@ -1,2 +1,2 @@ cc_library(phi_api_utils SRCS storage.cc tensor_utils.cc DEPS -tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits) +tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits scalar) diff --git a/paddle/phi/api/lib/utils/tensor_utils.cc b/paddle/phi/api/lib/utils/tensor_utils.cc index 1c9f7c3a8683daaf26cb87b23e50284d0329c4a8..3d183ea7fee8b17a7037a3fd9a6b2999605d8e25 100644 --- a/paddle/phi/api/lib/utils/tensor_utils.cc +++ b/paddle/phi/api/lib/utils/tensor_utils.cc @@ -40,6 
+40,13 @@ phi::Scalar MakePhiScalarFromVar(const framework::Variable& variable) { auto expected_place = phi::TransToPhiPlace(phi::Backend::CPU); if (variable.IsType()) { const auto& tensor = variable.Get(); + PADDLE_ENFORCE_EQ( + tensor.numel(), + 1UL, + platform::errors::InvalidArgument("The DenseTensor used to construct " + "the Scalar contains more than 1 " + "value, it contains `%d` values.", + tensor.numel())); if (!platform::is_same_place(tensor.place(), expected_place)) { framework::LoDTensor tmp_tensor; framework::TensorCopySync(tensor, expected_place, &tmp_tensor); diff --git a/paddle/phi/backends/device_ext.h b/paddle/phi/backends/device_ext.h index bbd4966b7274f88ad4fad47dfdf7ce8e50ae2a3a..6315fe15afdf1ecd9c7657396468320eda7d88c1 100644 --- a/paddle/phi/backends/device_ext.h +++ b/paddle/phi/backends/device_ext.h @@ -523,6 +523,15 @@ struct CustomRuntimeParams { char reserved[32]; }; +#define PADDLE_CUSTOM_RUNTIME_CHECK_VERSION(params) \ + if ((params)->size != sizeof(DevicePluginParams) && \ + (params)->interface->size != sizeof(C_DeviceInterface)) { \ + return; \ + } \ + (params)->version.major = PADDLE_DEVICE_PLUGIN_MAJOR_VERSION; \ + (params)->version.minor = PADDLE_DEVICE_PLUGIN_MINOR_VERSION; \ + (params)->version.patch = PADDLE_DEVICE_PLUGIN_PATCH_VERSION; + // Plugin implement it and fill CustomRuntimeParams void InitPlugin(CustomRuntimeParams*); diff --git a/paddle/phi/backends/device_manager.cc b/paddle/phi/backends/device_manager.cc index 1ffe38d8e1f4ce59aa819a5eaa46c75d5fded5b0..35339aed0f3e1cd87ac65855e0255fa3277a6bfb 100644 --- a/paddle/phi/backends/device_manager.cc +++ b/paddle/phi/backends/device_manager.cc @@ -393,6 +393,11 @@ DeviceManager& DeviceManager::Instance() { return platform_manager; } +void DeviceManager::Clear() { + Instance().device_map_.clear(); + Instance().device_impl_map_.clear(); +} + std::vector ListAllLibraries(const std::string& library_dir) { std::vector libraries; std::regex express(".*\\.so"); diff --git a/paddle/phi/backends/device_manager.h b/paddle/phi/backends/device_manager.h index c0911a0f8d50c52697b748f3726faded5a428694..39eef27b4a607bd3af75a6b5dde07f715e5537e5 100644 --- a/paddle/phi/backends/device_manager.h +++ b/paddle/phi/backends/device_manager.h @@ -158,6 +158,8 @@ class DeviceManager { static std::vector GetDeviceList(const std::string& device_type); + static void Clear(); + private: DISABLE_COPY_AND_ASSIGN(DeviceManager); DeviceManager() {} diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index 09deb575f2414a7a101c7f02d040ca1f4bd1a7f8..0394835aa8b700ba4f9ee9b106661e2d70fc50b6 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -654,10 +654,17 @@ struct GPUContext::Impl { } void AddStreamCallback(const std::function& callback) const { - // TODO(wilber): Do we need ThreadPool? - auto* func = new std::function([this, callback] { + // NOTE(zhiqiu): better use threadpool here, otherwise "std::async" may + // launch too + // many threads and result in thread oversubscription. 
+ auto* callback_func = new std::function(std::move(callback)); + auto* func = new std::function([this, callback_func] { std::lock_guard lock(stream_call_back_mtx_); - last_future_ = std::async(std::launch::deferred, [&]() { callback(); }); + VLOG(4) << "Stream callback"; + last_future_ = std::async(std::launch::async, [callback_func]() { + std::unique_ptr> releaser(callback_func); + (*callback_func)(); + }); }); #ifdef PADDLE_WITH_HIP @@ -734,6 +741,10 @@ struct GPUContext::Impl { GPUContext::GPUContext() : DeviceContext(), impl_(std::make_unique()) {} +GPUContext::GPUContext(GPUContext&&) = default; + +GPUContext& GPUContext::operator=(GPUContext&&) = default; + GPUContext::GPUContext(const GPUPlace& place) : DeviceContext(), impl_(std::make_unique(place)) {} diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index 3eb4360ad35382369681308b46050cc3e6e04ea0..cd08da1c0f2f8031a461a0410a89254823a6a903 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -77,6 +77,8 @@ class DnnWorkspaceHandle { class GPUContext : public DeviceContext { public: GPUContext(); + GPUContext(GPUContext&&); + GPUContext& operator=(GPUContext&&); explicit GPUContext(const GPUPlace& place); diff --git a/paddle/phi/common/CMakeLists.txt b/paddle/phi/common/CMakeLists.txt index 85a1424ee34e04b50a077f5d8ac88d0a0d2fbe78..9bf692703860f15601ad601970ea1f5b1316442b 100644 --- a/paddle/phi/common/CMakeLists.txt +++ b/paddle/phi/common/CMakeLists.txt @@ -1 +1,2 @@ cc_library(phi_place SRCS place.cc) +cc_library(scalar SRCS scalar.cc DEPS phi_enforce) diff --git a/paddle/phi/common/scalar.cc b/paddle/phi/common/scalar.cc new file mode 100644 index 0000000000000000000000000000000000000000..5cd55c1e88bed6f805a72cff92024d9dc219a1a2 --- /dev/null +++ b/paddle/phi/common/scalar.cc @@ -0,0 +1,35 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/common/scalar.h" + +#include "paddle/phi/core/enforce.h" + +namespace paddle { +namespace experimental { + +// NOTE(xiongkun): why we put definition here? +// test_custom_op can't include enforce.h, because enforce.h includes gflags. +// so we decouple the include dependence of enforce.h by link. +void ThrowTensorConvertError(int num) { + PADDLE_ENFORCE_EQ(num, + 1, + phi::errors::InvalidArgument( + "The Scalar only supports Tensor with 1 element, but " + "now Tensor has `%d` elements", + num)); +} + +} // namespace experimental +} // namespace paddle diff --git a/paddle/phi/common/scalar.h b/paddle/phi/common/scalar.h index 72cef89d300c8d60811bde7cf667275b37fedc6f..5134f4eb72639650a0bde34f2abbb0e05ced13c7 100644 --- a/paddle/phi/common/scalar.h +++ b/paddle/phi/common/scalar.h @@ -19,9 +19,12 @@ limitations under the License. 
*/ #include "paddle/phi/api/ext/exception.h" #include "paddle/phi/api/include/tensor.h" + namespace paddle { namespace experimental { +void ThrowTensorConvertError(int); + template class ScalarBase { public: @@ -104,11 +107,7 @@ class ScalarBase { // The Tensor must have one dim ScalarBase(const T& tensor) : dtype_(tensor.dtype()) { // NOLINT is_from_tensor_ = true; - PD_CHECK( - tensor.numel() == 1, - "The Scalar only supports Tensor with 1 element, but now Tensor has `", - tensor.numel(), - "` element."); + ThrowTensorConvertError(tensor.numel()); switch (dtype_) { case DataType::FLOAT32: data_.f32 = tensor.template data()[0]; @@ -156,6 +155,8 @@ class ScalarBase { CopyScalar(other, this); } + // NOTE(xiongkun): some op need to judge the dtype of the Scalar, we expose a + // interface. bool FromTensor() const { return is_from_tensor_; } void SetFromTensor(bool from_tensor) { is_from_tensor_ = from_tensor; } diff --git a/paddle/phi/core/compat/arg_map_context.h b/paddle/phi/core/compat/arg_map_context.h index 25b80279ecf10619d97b8800b24ab5353c79745d..71cec011411641ffe34918f03162800b111275a2 100644 --- a/paddle/phi/core/compat/arg_map_context.h +++ b/paddle/phi/core/compat/arg_map_context.h @@ -89,6 +89,8 @@ class ArgumentMappingContext { virtual bool IsDenseTensorInput(const std::string& name) const = 0; virtual bool IsSelectedRowsInput(const std::string& name) const = 0; + // For compatibility with LoDTensorArray + virtual bool IsDenseTensorVectorInput(const std::string& name) const = 0; virtual bool IsDenseTensorOutput(const std::string& name) const = 0; virtual bool IsSelectedRowsOutput(const std::string& name) const = 0; diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h index 00e9bff9bd5910ceedcca3dfb3a7a64ec88596df..946230cb169d20db56a46399552b629348c4783f 100644 --- a/paddle/phi/core/compat/op_utils.h +++ b/paddle/phi/core/compat/op_utils.h @@ -47,7 +47,15 @@ const std::unordered_set deprecated_op_names({"diag", "matmul_grad", "matmul_grad_grad", "mean", + "mean_grad", "max", + "max_grad", + "min", + "min_grad", + "prod", + "prod_grad", + "any", + "all", "reshape", "reshape_grad", "expand", @@ -55,6 +63,7 @@ const std::unordered_set deprecated_op_names({"diag", "expand_grad", "expand_as_grad", "sum", + "one_hot", "sum_grad", "top_k", "top_k_grad"}); diff --git a/paddle/phi/core/custom_kernel.cc b/paddle/phi/core/custom_kernel.cc index bc317da8d98ed4eb8abf8250f03c364b17c178b1..48778bb38e5487506f4b402176fff26cbe485de7 100644 --- a/paddle/phi/core/custom_kernel.cc +++ b/paddle/phi/core/custom_kernel.cc @@ -33,6 +33,10 @@ void CustomKernelMap::RegisterCustomKernel(const std::string& name, void CustomKernelMap::RegisterCustomKernels() { VLOG(3) << "Size of custom_kernel_map: " << kernels_.size(); + if (kernels_.size() <= 0) { + LOG(INFO) << "No custom kernel info found in loaded lib(s)."; + return; + } auto& kernels = KernelFactory::Instance().kernels(); for (auto& pair : kernels_) { PADDLE_ENFORCE_NE( @@ -60,9 +64,10 @@ void CustomKernelMap::RegisterCustomKernels() { << info_pair.first << "] to Paddle. 
It will be used like native ones."; } - kernels_[pair.first].clear(); } - LOG(INFO) << "Successed in loading custom kernels."; + LOG(INFO) << "Successed in loading " << kernels_.size() + << " custom kernel(s) from loaded lib(s), will be " + << "used like native ones."; kernels_.clear(); } diff --git a/paddle/phi/core/kernel_context.cc b/paddle/phi/core/kernel_context.cc index a32e0e44f469694c62ff33863971d3b04004ff37..234e3528c363b948c0a3e3b22d5ee676660fce76 100644 --- a/paddle/phi/core/kernel_context.cc +++ b/paddle/phi/core/kernel_context.cc @@ -37,6 +37,13 @@ void KernelContext::EmplaceBackInputs( std::make_move_iterator(inputs.end())); } +void KernelContext::EmplaceBackInputsWithoutSetRange( + paddle::SmallVector inputs) { + inputs_.insert(inputs_.end(), + std::make_move_iterator(inputs.begin()), + std::make_move_iterator(inputs.end())); +} + void KernelContext::EmplaceBackOutput(TensorBase* output) { int index = outputs_.size(); outputs_.emplace_back(output); @@ -59,6 +66,13 @@ void KernelContext::EmplaceBackOutputs( std::make_move_iterator(outputs.end())); } +void KernelContext::EmplaceBackOutputsWithoutSetRange( + paddle::SmallVector outputs) { + outputs_.insert(outputs_.end(), + std::make_move_iterator(outputs.begin()), + std::make_move_iterator(outputs.end())); +} + void KernelContext::EmplaceBackAttr(paddle::any attr) { attrs_.emplace_back(std::move(attr)); } diff --git a/paddle/phi/core/kernel_context.h b/paddle/phi/core/kernel_context.h index 213ac47d30bfdd28541bd1b9cb24bf2053b1c939..d3ca1ffc61c42c06c2b33cccdb6f1037df237a24 100644 --- a/paddle/phi/core/kernel_context.h +++ b/paddle/phi/core/kernel_context.h @@ -52,12 +52,18 @@ class KernelContext { void EmplaceBackInputs(paddle::SmallVector inputs); + void EmplaceBackInputsWithoutSetRange( + paddle::SmallVector inputs); + void EmplaceBackOutput(TensorBase* output); void EmplaceBackOutputWithoutSetRange(TensorBase* output); void EmplaceBackOutputs(paddle::SmallVector outputs); + void EmplaceBackOutputsWithoutSetRange( + paddle::SmallVector outputs); + void EmplaceBackAttr(paddle::any attr); const std::pair& InputRangeAt(size_t idx) const; diff --git a/paddle/phi/core/kernel_factory.h b/paddle/phi/core/kernel_factory.h index be91409762635e8aabdd6953aa5527d94959e4b2..e502b9cb3e02536e8d764a4cbc5e1d5509960303 100644 --- a/paddle/phi/core/kernel_factory.h +++ b/paddle/phi/core/kernel_factory.h @@ -197,8 +197,16 @@ class Kernel { const KernelArgsDef& args_def() const { return args_def_; } + const TensorArgDef& InputAt(size_t idx) const { + return args_def_.input_defs().at(idx); + } + TensorArgDef& InputAt(size_t idx) { return args_def_.input_defs().at(idx); } + const TensorArgDef& OutputAt(size_t idx) const { + return args_def_.output_defs().at(idx); + } + TensorArgDef& OutputAt(size_t idx) { return args_def_.output_defs().at(idx); } bool IsValid() { return fn_ != nullptr; } diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index d9ed68593cd610790ee4a0015069ac5a8cfea61b..c3356eadcbd2156617a7a69324e7b440cc54b339 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -98,6 +98,28 @@ struct KernelArgsParseFunctor { default_tensor_layout, default_key.dtype(), arg_type); + } else if (arg_type == std::type_index(typeid(const SparseCooTensor&))) { + args_def->AppendInput(default_key.backend(), + default_tensor_layout, + default_key.dtype(), + arg_type); + } else if (arg_type == std::type_index(typeid( + paddle::optional))) { + args_def->AppendInput(default_key.backend(), + 
default_tensor_layout, + default_key.dtype(), + arg_type); + } else if (arg_type == std::type_index(typeid(const SparseCsrTensor&))) { + args_def->AppendInput(default_key.backend(), + default_tensor_layout, + default_key.dtype(), + arg_type); + } else if (arg_type == std::type_index(typeid( + paddle::optional))) { + args_def->AppendInput(default_key.backend(), + default_tensor_layout, + default_key.dtype(), + arg_type); } else if (arg_type == std::type_index(typeid(DenseTensor*))) { args_def->AppendOutput(default_key.backend(), default_tensor_layout, @@ -114,6 +136,16 @@ struct KernelArgsParseFunctor { default_tensor_layout, default_key.dtype(), arg_type); + } else if (arg_type == std::type_index(typeid(SparseCooTensor*))) { + args_def->AppendOutput(default_key.backend(), + default_tensor_layout, + default_key.dtype(), + arg_type); + } else if (arg_type == std::type_index(typeid(SparseCsrTensor*))) { + args_def->AppendOutput(default_key.backend(), + default_tensor_layout, + default_key.dtype(), + arg_type); } else { // Attribute deal with // TODO(chenweihang): now here allow any types of attribute, maybe diff --git a/paddle/phi/core/meta_tensor.cc b/paddle/phi/core/meta_tensor.cc index eb114304f53ea08b05d36792330cf5bd3ebbee5d..bcbb1a4835b9d0397f6e85b7c44311bb9fe57209 100644 --- a/paddle/phi/core/meta_tensor.cc +++ b/paddle/phi/core/meta_tensor.cc @@ -72,6 +72,10 @@ void MetaTensor::set_layout(DataLayout layout) { } void MetaTensor::share_lod(const MetaTensor& meta_tensor) { + if (meta_tensor.lod().size() == 0) { + // no need share + return; + } if (phi::DenseTensor::classof(tensor_)) { DenseTensorUtils::GetMutableMeta(static_cast(tensor_))->lod = meta_tensor.lod(); @@ -110,7 +114,7 @@ void MetaTensor::share_meta(const MetaTensor& meta_tensor) { } } -TensorBase* MetaTensor::get_tensor() const { return tensor_; } +TensorBase* MetaTensor::tensor() const { return tensor_; } void MetaTensor::share_dims(const MetaTensor& meta_tensor) { bool is_dense_tensor = phi::DenseTensor::classof(tensor_); @@ -118,7 +122,7 @@ void MetaTensor::share_dims(const MetaTensor& meta_tensor) { if (is_dense_tensor || is_selected_rows) { set_dims(meta_tensor.dims()); if (is_selected_rows) { - const auto in_tensor_base = meta_tensor.get_tensor(); + const auto in_tensor_base = meta_tensor.tensor(); PADDLE_ENFORCE_EQ( phi::SelectedRows::classof(in_tensor_base), true, diff --git a/paddle/phi/core/meta_tensor.h b/paddle/phi/core/meta_tensor.h index 3971a9f7e99e0282cae5e4d1e61ee6eb28c4b9a7..10c3a7c1a3de376d21805a12ff0b2c98ab4fcbd3 100644 --- a/paddle/phi/core/meta_tensor.h +++ b/paddle/phi/core/meta_tensor.h @@ -26,11 +26,13 @@ namespace phi { // TODO(chenweihang): add other flags if needed struct MetaConfig { bool is_runtime{true}; - + bool is_run_mkldnn_kernel{false}; MetaConfig() = default; // supporting implicit construction is easier to use - MetaConfig(bool is_runtime) : is_runtime(is_runtime) {} // NOLINT + MetaConfig(bool is_runtime, bool is_run_mkldnn_kernel) + : is_runtime(is_runtime), + is_run_mkldnn_kernel(is_run_mkldnn_kernel) {} // NOLINT }; class MetaTensor { @@ -66,7 +68,7 @@ class MetaTensor { // Because the lod in compiletime and runtime is different, // so `LoD` cannot in public methods const LoD& lod() const; - TensorBase* get_tensor() const; + TensorBase* tensor() const; TensorBase* tensor_; }; diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index 4ddef5b0002e286181ce5ac1ad198136424861a9..37d1a234b5767a3873bda6b41e6e410df1c452af 100644 --- 
a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -64,6 +64,16 @@ void BilinearTensorProductGradInferMeta(const MetaTensor& x, } } +void GatherNdGradInferMeta(const MetaTensor& x, + const MetaTensor& index, + const MetaTensor& out_grad, + MetaTensor* x_grad) { + const auto& dtype = out_grad.dtype(); + x_grad->set_dims(x.dims()); + x_grad->share_lod(x); + x_grad->set_dtype(dtype); +} + void GeneralBinaryGradInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* dx, @@ -93,6 +103,12 @@ void GeneralTernaryGradInferMeta(const MetaTensor& x, } } +void GeneralUnaryGradInferMeta(const MetaTensor& x, MetaTensor* dx) { + if (dx) { + dx->share_meta(x); + } +} + void GumbelSoftmaxGradInferMeta(const MetaTensor& out, const MetaTensor& dout, int axis, @@ -102,17 +118,49 @@ void GumbelSoftmaxGradInferMeta(const MetaTensor& out, dout.dims(), errors::InvalidArgument( "Input(Out) and its gradients should have the same shape.")); + dx->share_meta(dout); } -void GatherNdGradInferMeta(const MetaTensor& x, - const MetaTensor& index, - const MetaTensor& out_grad, - MetaTensor* x_grad) { - const auto& dtype = out_grad.dtype(); - x_grad->set_dims(x.dims()); - x_grad->share_lod(x); - x_grad->set_dtype(dtype); +void MaxPoolWithIndexGradInferMeta(const MetaTensor& x, + const MetaTensor& mask, + const MetaTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + MetaTensor* dx) { + dx->share_meta(x); +} + +void PoolGradInferMeta(const MetaTensor& x, + const MetaTensor& out, + const MetaTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + MetaTensor* dx) { + dx->share_meta(x); +} + +void PsroiPoolGradInferMeta(const MetaTensor& x, + const MetaTensor& rois, + paddle::optional rois_num, + const MetaTensor& dout, + int pooled_height, + int pooled_width, + int output_channels, + float spatial_scale, + MetaTensor* dx) { + dx->share_meta(x); } void ScatterGradInferMeta(const MetaTensor& index, diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index f7b0eed5dd929e180810af52914e9a3139676e8a..260fbfe7197912fd3dd5b9103a0a991a45d55816 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -21,6 +21,10 @@ limitations under the License. */ namespace phi { +// Common InferMeta Functions for backward operators. +// +// NOTE: The InferMeta Functions in this file are arranged in alphabetic order. 
+ void BilinearTensorProductGradInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& weight, @@ -30,6 +34,11 @@ void BilinearTensorProductGradInferMeta(const MetaTensor& x, MetaTensor* dweight, MetaTensor* dbias); +void GatherNdGradInferMeta(const MetaTensor& x, + const MetaTensor& index, + const MetaTensor& out_grad, + MetaTensor* x_grad); + void GeneralBinaryGradInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* dx, @@ -42,11 +51,48 @@ void GeneralTernaryGradInferMeta(const MetaTensor& x, MetaTensor* dy, MetaTensor* dz); +void GeneralUnaryGradInferMeta(const MetaTensor& x, MetaTensor* dx); + void GumbelSoftmaxGradInferMeta(const MetaTensor& out, const MetaTensor& dout, int axis, MetaTensor* dx); +void MaxPoolWithIndexGradInferMeta(const MetaTensor& x, + const MetaTensor& mask, + const MetaTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + MetaTensor* dx); + +void PsroiPoolGradInferMeta(const MetaTensor& x, + const MetaTensor& rois, + paddle::optional rois_num, + const MetaTensor& dout, + int pooled_height, + int pooled_width, + int output_channels, + float spatial_scale, + MetaTensor* dx); + +void PoolGradInferMeta(const MetaTensor& x, + const MetaTensor& out, + const MetaTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + MetaTensor* dx); + void ScatterGradInferMeta(const MetaTensor& index, const MetaTensor& updates, const MetaTensor& out_grad, diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 641956c4d9de796bed166e1f6238ff6988601bec..aabb944db30b9f30394f092c245bc0307d8bbf3f 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -18,260 +18,199 @@ limitations under the License. 
*/ #include #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/kernels/funcs/common_shape.h" +#include "paddle/phi/kernels/cpu/conv_util.h" + namespace phi { +namespace detail { -void CompareInferMeta(const MetaTensor& x, - const MetaTensor& y, - int axis, - MetaTensor* out) { - auto dim_x = x.dims(); - auto dim_y = y.dims(); +static void BinarySameInputDimsCheck(const MetaTensor& x, + const MetaTensor& y, + MetaConfig config) { + auto input_dim = x.dims(); + auto other_dim = y.dims(); + PADDLE_ENFORCE_EQ(input_dim.size(), + other_dim.size(), + phi::errors::PreconditionNotMet( + "Input(Input) and Input(Other) must have the same " + "dimension size.")); + int n = input_dim.size(); + bool is_runtime = config.is_runtime; + for (int i = 0; i < n; i++) { + if (is_runtime) { + PADDLE_ENFORCE_EQ(input_dim[i], + other_dim[i], + phi::errors::PreconditionNotMet( + "The value at dim %d of Input(Input) is not " + "equal to the Input(Other): %ld != %ld.", + i, + input_dim[i], + other_dim[i])); + } else { + if (!(input_dim[i] < 0 || other_dim[i] < 0)) { + PADDLE_ENFORCE_EQ(input_dim[i], + other_dim[i], + phi::errors::PreconditionNotMet( + "The value at dim %d of Input(Input) is not " + "equal to the Input(Other): %ld != %ld.", + i, + input_dim[i], + other_dim[i])); + } + } + } +} - if (dim_x == dim_y) { - out->share_meta(x); - } else { - int max_dim = std::max(dim_x.size(), dim_y.size()); - int axis = std::abs(dim_x.size() - dim_y.size()); - std::vector x_dims_array(max_dim); - std::vector y_dims_array(max_dim); - std::vector out_dims_array(max_dim); - funcs::GetBroadcastDimsArrays(dim_x, - dim_y, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - axis); +} // namespace detail - out->set_dims(make_ddim(out_dims_array)); - out->share_lod(x); - } +void AllValueCompareInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out, + MetaConfig config) { + detail::BinarySameInputDimsCheck(x, y, config); + out->set_dims(phi::make_ddim({1})); out->set_dtype(DataType::BOOL); } -void CompareAllInferMeta(const MetaTensor& x, - const MetaTensor& y, - MetaTensor* out) { +void KLDivInferMeta(const MetaTensor& x, + const MetaTensor& label, + const std::string& reduction, + MetaTensor* out, + MetaConfig config) { auto dim_x = x.dims(); - auto dim_y = y.dims(); - PADDLE_ENFORCE_GE( - dim_x.size(), - dim_y.size(), - errors::InvalidArgument( - "The size of dim_y should not be greater than dim_x's.")); - out->share_lod(x); - out->set_dims(make_ddim({1})); - out->set_dtype(DataType::BOOL); -} - -void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { - auto x_dims = x.dims(); - auto x_rank = static_cast(x_dims.size()); - PADDLE_ENFORCE_EQ(true, - 1 == x_rank || 2 == x_rank, - phi::errors::PreconditionNotMet( - "ShapeError: The dimensions of input tensor X (%s) " - "should be 1 or 2", - x_dims.to_str())); - - auto y_dims = y.dims(); - PADDLE_ENFORCE_EQ( - true, - x_rank == static_cast(y_dims.size()), - phi::errors::PreconditionNotMet( - "ShapeError: The shape of input tensor Y: %s should match with " - "input tenosr X: %s", - y_dims.to_str(), - x_dims.to_str())); - bool shape_match = true; - for (size_t i = 0; i < x_rank; ++i) { - if (x_dims[i] != y_dims[i]) { - shape_match = false; - break; + auto dim_target = label.dims(); + PADDLE_ENFORCE_EQ(dim_x.size(), + dim_target.size(), + phi::errors::InvalidArgument( + "Input(X) rank and Input(Target) rank should be " + "same, 
but received X rank(%d) != Target rank(%d)", + dim_x.size(), + dim_target.size())); + for (int i = 0; i < dim_x.size(); i++) { + if (config.is_runtime || (dim_x[i] > 0 && dim_target[i] > 0)) { + PADDLE_ENFORCE_EQ( + dim_x[i], + dim_target[i], + phi::errors::InvalidArgument( + "Input(X) and Input(Target) should in same shape. but received " + "X dimension[%d](%d) != Target dimension[%d](%d)", + i, + dim_x[i], + i, + dim_target[i])); } } - PADDLE_ENFORCE_EQ(true, - shape_match, - phi::errors::PreconditionNotMet( - "ShapeError: The shape of input tensor X: %s should " - "be exactly the same " - "with input tensor Y: %s", - x_dims.to_str(), - y_dims.to_str())); + auto reduction_valid = "mean" == reduction || "sum" == reduction || + "batchmean" == reduction || "none" == reduction; + PADDLE_ENFORCE_EQ( + reduction_valid, + true, + phi::errors::InvalidArgument( + "Attr(reduction) can only be 'none'|'batchmean'|'sum'|'mean'.")); - x_dims[x_dims.size() - 1] = 1; - out->set_dims(x_dims); + if ("none" == reduction) { + out->set_dims(dim_x); + } else { + out->set_dims({1}); + } out->set_dtype(x.dtype()); - out->set_layout(x.layout()); } -void MatmulInferMeta(const MetaTensor& x, - const MetaTensor& y, - bool trans_x, - bool trans_y, - MetaTensor* out) { - std::vector dims_x = phi::vectorize(x.dims()); - std::vector dims_y = phi::vectorize(y.dims()); - auto ndims_x = dims_x.size(); - auto ndims_y = dims_y.size(); - PADDLE_ENFORCE_GT(ndims_x, - 0UL, - phi::errors::InvalidArgument( - "The Input(x) dims size must be greater than 0," - " but reviced dims size is 0. ")); - PADDLE_ENFORCE_GT(ndims_y, - 0UL, - phi::errors::InvalidArgument( - "The Input(y) dims size must be greater than 0," - " but reviced dims size is 0. ")); - - bool x_broadcasted = false, y_broadcasted = false; - if (ndims_x == 1) { - dims_x.insert(dims_x.begin(), 1); - ndims_x = 2; - x_broadcasted = true; - } +void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { + out->share_meta(x); +} - if (ndims_y == 1) { - dims_y.push_back(1); - ndims_y = 2; - y_broadcasted = true; - } +void BCELossInferMeta(const MetaTensor& input, + const MetaTensor& label, + MetaTensor* out, + MetaConfig config) { + auto input_dims = input.dims(); + auto label_dims = label.dims(); - size_t M, N; - if (trans_x) { - M = dims_x[ndims_x - 1]; - } else { - M = dims_x[ndims_x - 2]; - } - if (trans_y) { - N = dims_y[ndims_y - 2]; - } else { - N = dims_y[ndims_y - 1]; - } + int rank = input_dims.size(); + PADDLE_ENFORCE_EQ(rank, + label_dims.size(), + phi::errors::InvalidArgument( + "Input(X) and Input(Label) shall have the same rank." 
+ "But received: the rank of Input(X) is [%d], " + "the rank of Input(Label) is [%d].", + rank, + label_dims.size())); - std::vector new_dims; - if (ndims_x > ndims_y) { - new_dims.assign(dims_x.begin(), dims_x.end() - 2); - } else if (ndims_x < ndims_y) { - new_dims.assign(dims_y.begin(), dims_y.end() - 2); - } else { - new_dims.reserve(ndims_x); - for (size_t i = 0; i < ndims_x - 2; ++i) { - new_dims.push_back(std::max(dims_x[i], dims_y[i])); - } - } - if (!x_broadcasted) { - new_dims.push_back(M); - } - if (!y_broadcasted) { - new_dims.push_back(N); - } - if (x_broadcasted && y_broadcasted) { - new_dims.push_back(1); + bool check = true; + if ((!config.is_runtime) && + (phi::product(input_dims) <= 0 || phi::product(label_dims) <= 0)) { + check = false; } - auto ddim_out = phi::make_ddim(new_dims); - - out->set_dims(ddim_out); - out->set_dtype(x.dtype()); - out->set_layout(x.layout()); -} - -void ElementwiseInferMeta(const MetaTensor& x, - const MetaTensor& y, - MetaTensor* out) { - return ElementwiseRawInferMeta(x, y, -1, std::move(out)); -} - -void ElementwiseRawInferMeta(const MetaTensor& x, - const MetaTensor& y, - int axis, - MetaTensor* out) { - if (x.dims() != y.dims()) { - auto x_dims = x.dims(); - auto y_dims = y.dims(); - int max_dim = std::max(x_dims.size(), y_dims.size()); - if (x_dims.size() == y_dims.size()) { - PADDLE_ENFORCE_EQ((axis == -1) || (axis == 0), - true, - phi::errors::InvalidArgument( - "axis should be -1 or 0 while the dimension of " - "tensor X (%s) is equal to the dimension of " - "tensor Y (%s), but received axis: %s", - x_dims.size(), - y_dims.size(), - axis)); - } - PADDLE_ENFORCE_EQ((axis >= (-1 * max_dim)) && (axis < max_dim), - true, + if (check) { + PADDLE_ENFORCE_EQ(input_dims, + label_dims, phi::errors::InvalidArgument( - "The axis range must be [%s, %s), but axis is %s. " - "Please set the axis again.", - -1 * max_dim, - max_dim, - axis)); - axis = (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1) - : axis); - std::vector x_dims_array(max_dim); - std::vector y_dims_array(max_dim); - std::vector out_dims_array(max_dim); - funcs::GetBroadcastDimsArrays(x_dims, - y_dims, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - axis); - auto out_dims = phi::make_ddim(out_dims_array); - out->set_dims(out_dims); - } else { - out->set_dims(x.dims()); + "Input(X) and Input(Label) shall have the same " + "shape. But received: the shape of Input(X) is " + "[%s], the shape of Input(Label) is [%s].", + input_dims, + label_dims)); } - out->set_dtype(x.dtype()); - out->set_layout(x.layout()); - out->share_lod(x); + out->set_dims(input_dims); + out->set_dtype(input.dtype()); + out->share_lod(input); } -void HuberLossInferMeta(const MetaTensor& input, - const MetaTensor& label, - float delta, - MetaTensor* out, - MetaTensor* residual, - MetaConfig config) { - auto input_dims = input.dims(); - auto label_dims = label.dims(); +void BincountInferMeta(const MetaTensor& x, + const paddle::optional weights, + int minlength, + MetaTensor* out) { + auto input_dim = x.dims(); - PADDLE_ENFORCE_EQ(input_dims.size(), - label_dims.size(), + PADDLE_ENFORCE_GE(minlength, + 0, phi::errors::InvalidArgument( - "Input(input) rank and Input(label) rank should be " - "same, but received input rank(%d) != label rank(%d)", - input_dims.size(), - label_dims.size())); + "The minlength should be greater than or equal to 0." 
+ "But received minlength is %d", + minlength)); + + PADDLE_ENFORCE_EQ( + input_dim.size(), + 1, + phi::errors::InvalidArgument("The 'shape' of Input(X) must be 1-D tensor." + "But the dimension of Input(X) is [%d]", + input_dim.size())); + + if (weights.is_initialized()) { + auto weights_dim = weights->dims(); + PADDLE_ENFORCE_EQ(weights_dim.size(), + 1, + phi::errors::InvalidArgument( + "The 'shape' of Input(Weights) must be 1-D tensor." + "But the dimension of Input(Weights) is [%d]", + weights_dim.size())); - bool contain_unknown_dim = phi::contain_unknown_dim(input_dims) || - phi::contain_unknown_dim(label_dims); - if (config.is_runtime || !contain_unknown_dim) { PADDLE_ENFORCE_EQ( - input_dims, - label_dims, + weights_dim[0], + input_dim[0], phi::errors::InvalidArgument( - "The Input(input) and Input(label) should have the same " - "shape, but received input shape [%s] != label shape [%s]", - input_dims, - label_dims)); + "The 'shape' of Input(Weights) must be equal to the 'shape' of " + "Input(X)." + "But received: the 'shape' of Input(Weights) is [%s]," + "the 'shape' of Input(X) is [%s]", + weights_dim, + input_dim)); + } + out->set_dims(phi::make_ddim({-1})); + if (weights.is_initialized()) { + out->set_dtype(weights->dtype()); + } else { + out->set_dtype(x.dtype()); } - auto out_dims = label_dims; - residual->set_dims(out_dims); - out->set_dims(out_dims); - out->share_lod(input); + out->share_lod(x); } void CholeskySolveInferMeta(const MetaTensor& x, @@ -328,100 +267,51 @@ void CholeskySolveInferMeta(const MetaTensor& x, out->share_lod(x); } -void TriangularSolveInferMeta(const MetaTensor& x, - const MetaTensor& y, - bool upper, - bool transpose, - bool unitriangular, - MetaTensor* out) { - auto x_dims = x.dims(); - auto y_dims = y.dims(); - - auto x_dims_n = x_dims.size(); - auto y_dims_n = y_dims.size(); - - PADDLE_ENFORCE_GE(x_dims_n, - 2, - phi::errors::InvalidArgument( - "The input tensor X's dimensions of TriangularSolveOp " - "should be >= 2. But received X's " - "dimensions = %d, X's shape = [%s]", - x_dims.size(), - x_dims)); - - PADDLE_ENFORCE_GE(y_dims_n, - 2, - phi::errors::InvalidArgument( - "The input tensor Y's dimensions of TriangularSolveOp " - "should be >=2. 
But received Y's " - "dimensions = %d, Y's shape = [%s]", - y_dims.size(), - y_dims)); - - PADDLE_ENFORCE_EQ(x_dims[x_dims_n - 2], - x_dims[x_dims_n - 1], - phi::errors::InvalidArgument( - "The inner-most 2 dimensions of Input(X) all should " - "be square matrices " - "But received X's shape[-2] = %d and shape[-1] = %d.", - x_dims[x_dims_n - 2], - x_dims[x_dims_n - 1])); - - std::vector x_dims_vec = phi::vectorize(x_dims); - std::vector y_dims_vec = phi::vectorize(y_dims); - - std::vector x_dims_vec_cut(x_dims_vec.begin(), x_dims_vec.end() - 2); - std::vector y_dims_vec_cut(y_dims_vec.begin(), y_dims_vec.end() - 2); +void CompareInferMeta(const MetaTensor& x, + const MetaTensor& y, + int axis, + MetaTensor* out) { + auto dim_x = x.dims(); + auto dim_y = y.dims(); - std::vector expand_batch_portion = - funcs::MatrixGetBroadcastBatchPortion(x_dims_vec_cut, y_dims_vec_cut); + if (dim_x == dim_y) { + out->share_meta(x); + } else { + int max_dim = std::max(dim_x.size(), dim_y.size()); + int axis = std::abs(dim_x.size() - dim_y.size()); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + funcs::GetBroadcastDimsArrays(dim_x, + dim_y, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); - std::vector y_broadcast_dims({expand_batch_portion}); - y_broadcast_dims.insert(y_broadcast_dims.end(), - {y_dims_vec[y_dims_n - 2], y_dims_vec[y_dims_n - 1]}); + out->set_dims(make_ddim(out_dims_array)); + out->share_lod(x); + } - // dim of 'out' is the same with 'Y' after broadcast - out->set_dims(phi::make_ddim(y_broadcast_dims)); - out->set_dtype(y.dtype()); - out->set_layout(y.layout()); - out->share_lod(y); + out->set_dtype(DataType::BOOL); } -void IndexSampleInferMeta(const MetaTensor& x, - const MetaTensor& y, - MetaTensor* out, - MetaConfig config) { - auto input_dims = x.dims(); - PADDLE_ENFORCE_EQ(input_dims.size(), - 2, - errors::InvalidArgument( - "Inputs(X) shape of IndexSample op should be 2-D, but " - "got X's shape = [%s], please check X shape.", - input_dims)); - - auto index_dims = y.dims(); - PADDLE_ENFORCE_EQ( - index_dims.size(), - 2, +void CompareAllInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out) { + auto dim_x = x.dims(); + auto dim_y = y.dims(); + PADDLE_ENFORCE_GE( + dim_x.size(), + dim_y.size(), errors::InvalidArgument( - "Inputs(Index) shape of IndexSample op should be 2-D, but " - "got Index's shape [%s] , please check index shape.", - input_dims)); - if (config.is_runtime) { - PADDLE_ENFORCE_EQ(input_dims[0], - index_dims[0], - errors::InvalidArgument( - "Inputs(X)'s value of dimension 0 must same with " - "Inputs(Index)'s value of dimension 0, but " - "got %d of Inputs(X), and got %d of Inputs(Index), " - "please check Inputs shape.", - input_dims[0], - index_dims[0])); - } - out->set_dtype(x.dtype()); - out->set_dims(index_dims); - out->share_lod(y); + "The size of dim_y should not be greater than dim_x's.")); + out->share_lod(x); + out->set_dims(make_ddim({1})); + out->set_dtype(DataType::BOOL); } + void CrossInferMeta(const MetaTensor& x, const MetaTensor& y, int axis, @@ -467,116 +357,159 @@ void CrossInferMeta(const MetaTensor& x, out->share_lod(x); } -void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { - out->share_meta(x); -} - -void SegmentPoolInferMeta(const MetaTensor& x, - const MetaTensor& segment_ids, - const std::string& pooltype, - MetaTensor* out, - MetaTensor* summed_ids, - MetaConfig config) { - auto dims = 
x.dims(); - dims[0] = -1; - out->set_dims(dims); - out->set_dtype(x.dtype()); - out->set_layout(x.layout()); - - if (pooltype == "MEAN") { - summed_ids->set_dims({-1, 1}); - summed_ids->set_dtype(x.dtype()); - summed_ids->set_layout(x.layout()); +void ConvInferMeta(const MetaTensor& input, + const MetaTensor& filter, + const std::vector& strides, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + MetaTensor* out, + MetaConfig config) { + std::vector paddings = paddings_t; + std::vector dilations = dilations_t; + auto in_dims = input.dims(); + auto filter_dims = filter.dims(); + int dilation_size = dilations.size(); + for (int i = 0; i < dilation_size; ++i) { + PADDLE_ENFORCE_GT( + dilations[i], + 0, + phi::errors::InvalidArgument( + "The dilation of Op(Conv) should be larget than 0, but received " + "dilation is %d.", + dilations[i])); } -} - -void BCELossInferMeta(const MetaTensor& input, - const MetaTensor& label, - MetaTensor* out, - MetaConfig config) { - auto input_dims = input.dims(); - auto label_dims = label.dims(); - - int rank = input_dims.size(); - PADDLE_ENFORCE_EQ(rank, - label_dims.size(), - phi::errors::InvalidArgument( - "Input(X) and Input(Label) shall have the same rank." - "But received: the rank of Input(X) is [%d], " - "the rank of Input(Label) is [%d].", - rank, - label_dims.size())); + const bool channel_last = (config.is_run_mkldnn_kernel == false) && + (data_format == "NHWC" || data_format == "NDHWC"); - bool check = true; - if ((!config.is_runtime) && - (phi::product(input_dims) <= 0 || phi::product(label_dims) <= 0)) { - check = false; - } + PADDLE_ENFORCE_EQ( + in_dims.size() == 4 || in_dims.size() == 5, + true, + phi::errors::InvalidArgument( + "The input of Op(Conv) should be a 4-D or 5-D Tensor. But " + "received: input's dimension is %u, input's shape is [%s].", + in_dims.size(), + in_dims)); - if (check) { - PADDLE_ENFORCE_EQ(input_dims, - label_dims, - phi::errors::InvalidArgument( - "Input(X) and Input(Label) shall have the same " - "shape. But received: the shape of Input(X) is " - "[%s], the shape of Input(Label) is [%s].", - input_dims, - label_dims)); + PADDLE_ENFORCE_EQ( + in_dims.size(), + filter_dims.size(), + phi::errors::InvalidArgument( + "The input's dimension and filter's dimension of " + "Op(Conv) should be equal. But received: the input's shape is [%s], " + "the input's dimension is %d; the filter's shape is [%s], " + "the filter's dimension is %d.", + in_dims, + in_dims.size(), + filter_dims, + filter_dims.size())); + + int stride_size = strides.size(); + for (int i = 0; i < stride_size; ++i) { + PADDLE_ENFORCE_GT( + strides[i], + 0, + phi::errors::InvalidArgument( + "The stride of Op(Conv) should be larget than 0, but received " + "stride is %d.", + strides[i])); } - out->set_dims(input_dims); - out->set_dtype(input.dtype()); - out->share_lod(input); -} - -void BincountInferMeta(const MetaTensor& x, - const paddle::optional weights, - int minlength, - MetaTensor* out) { - auto input_dim = x.dims(); - - PADDLE_ENFORCE_GE(minlength, - 0, - phi::errors::InvalidArgument( - "The minlength should be greater than or equal to 0." - "But received minlength is %d", - minlength)); - + int in_sub_stride_size = in_dims.size() - stride_size; PADDLE_ENFORCE_EQ( - input_dim.size(), - 1, - phi::errors::InvalidArgument("The 'shape' of Input(X) must be 1-D tensor." 
- "But the dimension of Input(X) is [%d]", - input_dim.size())); - - if (weights.is_initialized()) { - auto weights_dim = weights->dims(); - PADDLE_ENFORCE_EQ(weights_dim.size(), - 1, - phi::errors::InvalidArgument( - "The 'shape' of Input(Weights) must be 1-D tensor." - "But the dimension of Input(Weights) is [%d]", - weights_dim.size())); + in_dims.size(), + strides.size() + 2U, + phi::errors::InvalidArgument( + "The difference of input's dimension and Attr(strides)'s " + "length must be euqal to 2 for Op(Conv). " + "But received: input's dimension is %d, input's shape is [%s]; " + "Attr(stride)'s length is %d, Attr(stride) is [%s]; " + "difference of input's dimention and Attr(strides)'s length = %u.", + in_dims.size(), + in_dims, + strides.size(), + phi::make_ddim(strides), + in_sub_stride_size)); + + const auto input_channels = + channel_last ? in_dims[in_dims.size() - 1] : in_dims[1]; - PADDLE_ENFORCE_EQ( - weights_dim[0], - input_dim[0], - phi::errors::InvalidArgument( - "The 'shape' of Input(Weights) must be equal to the 'shape' of " - "Input(X)." - "But received: the 'shape' of Input(Weights) is [%s]," - "the 'shape' of Input(X) is [%s]", - weights_dim, - input_dim)); + PADDLE_ENFORCE_EQ( + input_channels, + filter_dims[1] * groups, + phi::errors::InvalidArgument( + "The number of input's channels should be equal to filter's channels " + "* groups for Op(Conv). But received: the input's channels is %d, " + "the input's shape is [%s]; the filter's channels is %d, the " + "filter's shape is [%s]; the groups is %d, the data_format is %s. " + "The error may come from wrong data_format setting.", + input_channels, + in_dims, + filter_dims[1], + filter_dims, + groups, + data_format)); + PADDLE_ENFORCE_EQ( + filter_dims[0] % groups, + 0, + phi::errors::InvalidArgument( + "The number of output's channels (filter's first dimension) of " + "Op(Conv) should be divided by groups. 
But received: " + "the output channels is %d, the filter's shape is [%s], " + "the groups is %d.", + filter_dims[0], + filter_dims, + groups)); + + if (config.is_runtime) { + PADDLE_ENFORCE_GT( + filter_dims[0], + 0, + phi::errors::InvalidArgument( + "the size of filter at axis 0 should be greater than 0")); } - out->set_dims(phi::make_ddim({-1})); - if (weights.is_initialized()) { - out->set_dtype(weights->dtype()); + + DDim in_data_dims; + if (channel_last) { + in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); } else { - out->set_dtype(x.dtype()); + in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); } - out->share_lod(x); + DDim filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size()); + + std::vector ksize = phi::vectorize(filter_data_dims); + phi::UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); + + std::vector output_shape({in_dims[0]}); + if (!channel_last) { + output_shape.push_back(filter_dims[0]); + } + for (int i = 0; i < in_data_dims.size(); ++i) { + if ((!config.is_runtime) && + (in_data_dims[i] <= 0 || filter_dims[i + 2] <= 0)) { + output_shape.push_back(-1); + } else { + const int dkernel = dilations[i] * (filter_data_dims[i] - 1) + 1; + int output_size = + (in_data_dims[i] + paddings[2 * i] + paddings[2 * i + 1] - dkernel) / + strides[i] + + 1; + output_shape.push_back(output_size); + } + } + if (channel_last) { + output_shape.push_back(filter_dims[0]); + } + + out->set_dims(make_ddim(output_shape)); + out->set_dtype(input.dtype()); } void DistInferMeta(const MetaTensor& x, @@ -602,6 +535,180 @@ void DistInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); } +void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { + auto x_dims = x.dims(); + auto x_rank = static_cast(x_dims.size()); + PADDLE_ENFORCE_EQ(true, + 1 == x_rank || 2 == x_rank, + phi::errors::PreconditionNotMet( + "ShapeError: The dimensions of input tensor X (%s) " + "should be 1 or 2", + x_dims.to_str())); + + auto y_dims = y.dims(); + PADDLE_ENFORCE_EQ( + true, + x_rank == static_cast(y_dims.size()), + phi::errors::PreconditionNotMet( + "ShapeError: The shape of input tensor Y: %s should match with " + "input tenosr X: %s", + y_dims.to_str(), + x_dims.to_str())); + bool shape_match = true; + for (size_t i = 0; i < x_rank; ++i) { + if (x_dims[i] != y_dims[i]) { + shape_match = false; + break; + } + } + + PADDLE_ENFORCE_EQ(true, + shape_match, + phi::errors::PreconditionNotMet( + "ShapeError: The shape of input tensor X: %s should " + "be exactly the same " + "with input tensor Y: %s", + x_dims.to_str(), + y_dims.to_str())); + + x_dims[x_dims.size() - 1] = 1; + out->set_dims(x_dims); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); +} + +void ElementwiseInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out) { + return ElementwiseRawInferMeta(x, y, -1, std::move(out)); +} + +void ElementwiseRawInferMeta(const MetaTensor& x, + const MetaTensor& y, + int axis, + MetaTensor* out) { + if (x.dims() != y.dims()) { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + int max_dim = std::max(x_dims.size(), y_dims.size()); + if (x_dims.size() == y_dims.size()) { + PADDLE_ENFORCE_EQ((axis == -1) || (axis == 0), + true, + phi::errors::InvalidArgument( + "axis should be -1 or 0 while the dimension of " + "tensor X (%s) is equal to the dimension of " + "tensor Y (%s), but received axis: %s", + x_dims.size(), + y_dims.size(), + axis)); + } + PADDLE_ENFORCE_EQ((axis >= (-1 * max_dim)) 
&& (axis < max_dim), + true, + phi::errors::InvalidArgument( + "The axis range must be [%s, %s), but axis is %s. " + "Please set the axis again.", + -1 * max_dim, + max_dim, + axis)); + axis = (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1) + : axis); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + funcs::GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + auto out_dims = phi::make_ddim(out_dims_array); + out->set_dims(out_dims); + } else { + out->set_dims(x.dims()); + } + + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); + out->share_lod(x); +} + +void ExpandAsInferMeta(const MetaTensor& x, + paddle::optional y, + const std::vector& target_shape, + MetaTensor* out) { +#define MAX_RANK_SUPPORTED 6 + auto x_dims = x.dims(); + PADDLE_ENFORCE_GE( + target_shape.size(), + static_cast(x_dims.size()), + phi::errors::InvalidArgument( + "The rank of target_shape must be greater than or equal " + "to the rank of Input(X). But received Input(X): input " + "rank %u; received target_shape: rank %u.", + x_dims.size(), + target_shape.size())); + PADDLE_ENFORCE_LE(target_shape.size(), + MAX_RANK_SUPPORTED, + phi::errors::InvalidArgument( + "The rank of target_shape must be less than or equal " + "to %d. But received: rank %u.", + MAX_RANK_SUPPORTED, + target_shape.size())); + out->set_dims(phi::make_ddim(target_shape)); + out->set_dtype(x.dtype()); +#undef MAX_RANK_SUPPORTED +} + +void GatherInferMeta(const MetaTensor& x, + const MetaTensor& index, + const Scalar& axis, + MetaTensor* out) { + auto index_dims = index.dims(); + + if (index_dims.size() == 2) { + PADDLE_ENFORCE_EQ( + index_dims[1], + 1, + phi::errors::InvalidArgument( + "The last dim of index should be 1 when it is 2D, but we get %d", + index_dims[1])); + } else { + PADDLE_ENFORCE_EQ( + index_dims.size(), + 1, + phi::errors::InvalidArgument( + "The index should be 1D, when it is not 2D, but we get %d", + index_dims.size())); + } + + auto input_dim = x.dims(); + auto axis_v = axis.to(); + if (axis.FromTensor() || axis_v == 0) { + // if axis.FromTensor(), we can not obtain correct shape of output + int batch_size = index_dims[0]; + phi::DDim output_dims(input_dim); + output_dims[0] = batch_size; + out->set_dims(output_dims); + out->set_dtype(x.dtype()); + out->share_lod(x); + } else { + int index_size = index_dims[0]; + std::vector out_dim_vec; + for (int i = 0; i < axis_v; i++) { + out_dim_vec.push_back(input_dim[i]); + } + out_dim_vec.push_back(index_size); + for (int i = axis_v + 1; i < input_dim.size(); i++) { + out_dim_vec.push_back(input_dim[i]); + } + auto output_dims = phi::make_ddim(out_dim_vec); + out->set_dims(output_dims); + out->set_dtype(x.dtype()); + out->share_lod(x); + } +} + void GatherNdInferMeta(const MetaTensor& x, const MetaTensor& index, MetaTensor* out) { @@ -630,22 +737,197 @@ void GatherNdInferMeta(const MetaTensor& x, result_dims.emplace_back(x_dims[i]); } - out->set_dims(phi::make_ddim(result_dims)); - out->share_lod(x); - out->set_dtype(x.dtype()); + out->set_dims(phi::make_ddim(result_dims)); + out->share_lod(x); + out->set_dtype(x.dtype()); +} + +void GatherTreeMeta(const MetaTensor& ids, + const MetaTensor& parents, + MetaTensor* out) { + auto ids_dims = ids.dims(); + auto parents_dims = parents.dims(); + PADDLE_ENFORCE_EQ(ids_dims == parents_dims, + true, + phi::errors::InvalidArgument( + "The shape of Input(Parents) must be same with the 
" + "shape of Input(Ids).")); + out->set_dims(ids_dims); +} + +void GridSampleBaseInferMeta(const MetaTensor& x, + const MetaTensor& grid, + MetaTensor* out, + MetaConfig config) { + auto x_dims = x.dims(); + auto grid_dims = grid.dims(); + PADDLE_ENFORCE_EQ(x_dims.size(), + 4, + phi::errors::InvalidArgument( + "Input(X) of GridSampleOp should be 4-D Tensor, but " + "received X dimension size(%d)", + x_dims.size())); + PADDLE_ENFORCE_EQ(grid_dims.size(), + 4, + phi::errors::InvalidArgument( + "Input(Grid) of GridSampleOp should be 4-D Tensor, " + "but received X dimension size(%d)", + grid_dims.size())); + if (config.is_runtime || grid_dims[3] > 0) { + PADDLE_ENFORCE_EQ( + grid_dims[3], + 2, + phi::errors::InvalidArgument( + "Input(Grid) dimension[3] should be 2, but received %d", + grid_dims[3])); + } + if (config.is_runtime) { + PADDLE_ENFORCE_EQ( + grid_dims[0], + x_dims[0], + phi::errors::InvalidArgument( + "Input(X) and Input(Grid) dimension[0] should be equal, but " + "received X dimension[0](%d) != Grid dimension[0](%d)", + x_dims[0], + grid_dims[0])); + } + + out->set_dims({x_dims[0], x_dims[1], grid_dims[1], grid_dims[2]}); + out->set_dtype(x.dtype()); + out->share_lod(x); +} + +void HuberLossInferMeta(const MetaTensor& input, + const MetaTensor& label, + float delta, + MetaTensor* out, + MetaTensor* residual, + MetaConfig config) { + auto input_dims = input.dims(); + auto label_dims = label.dims(); + + PADDLE_ENFORCE_EQ(input_dims.size(), + label_dims.size(), + phi::errors::InvalidArgument( + "Input(input) rank and Input(label) rank should be " + "same, but received input rank(%d) != label rank(%d)", + input_dims.size(), + label_dims.size())); + + bool contain_unknown_dim = phi::contain_unknown_dim(input_dims) || + phi::contain_unknown_dim(label_dims); + if (config.is_runtime || !contain_unknown_dim) { + PADDLE_ENFORCE_EQ( + input_dims, + label_dims, + phi::errors::InvalidArgument( + "The Input(input) and Input(label) should have the same " + "shape, but received input shape [%s] != label shape [%s]", + input_dims, + label_dims)); + } + + auto out_dims = label_dims; + residual->set_dims(out_dims); + out->set_dims(out_dims); + out->share_lod(input); +} + +void IndexSampleInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out, + MetaConfig config) { + auto input_dims = x.dims(); + PADDLE_ENFORCE_EQ(input_dims.size(), + 2, + errors::InvalidArgument( + "Inputs(X) shape of IndexSample op should be 2-D, but " + "got X's shape = [%s], please check X shape.", + input_dims)); + + auto index_dims = y.dims(); + PADDLE_ENFORCE_EQ( + index_dims.size(), + 2, + errors::InvalidArgument( + "Inputs(Index) shape of IndexSample op should be 2-D, but " + "got Index's shape [%s] , please check index shape.", + input_dims)); + if (config.is_runtime) { + PADDLE_ENFORCE_EQ(input_dims[0], + index_dims[0], + errors::InvalidArgument( + "Inputs(X)'s value of dimension 0 must same with " + "Inputs(Index)'s value of dimension 0, but " + "got %d of Inputs(X), and got %d of Inputs(Index), " + "please check Inputs shape.", + input_dims[0], + index_dims[0])); + } + out->set_dtype(x.dtype()); + out->set_dims(index_dims); + out->share_lod(y); +} + +void IndexSelectInferMeta(const MetaTensor& x, + const MetaTensor& index, + int dim, + MetaTensor* output) { + auto input_dim = x.dims(); + auto index_dim = index.dims(); + + PADDLE_ENFORCE_EQ( + dim < input_dim.size() && dim >= (0 - input_dim.size()), + true, + phi::errors::OutOfRange( + "Attr(dim) is out of range, It's expected " + "to be in range of 
[-%d, %d]. But received Attr(dim) = %d.", + input_dim.size(), + input_dim.size() - 1, + dim)); + + PADDLE_ENFORCE_EQ( + index_dim.size() == 1 || (index_dim.size() == 2 && index_dim[1] == 1), + true, + phi::errors::InvalidArgument( + "The 'shape' of Input(Index) must be 1-D tensor. " + "But received: the 'shape' of Input(Index) is [%s], " + "the dimension of Input(Index) is [%d].", + index_dim, + index_dim.size())); + + PADDLE_ENFORCE_EQ( + index_dim[0] != 0, + true, + phi::errors::InvalidArgument("The length of Input(Index) can't be 0.")); + + auto output_dim = phi::vectorize(input_dim); + if (dim < 0) { + dim += input_dim.size(); + } + output_dim[dim] = index_dim[0]; + output->set_dims(phi::make_ddim(output_dim)); + output->set_dtype(x.dtype()); + output->set_layout(x.layout()); + output->share_lod(x); } -void GatherTreeMeta(const MetaTensor& ids, - const MetaTensor& parents, - MetaTensor* out) { - auto ids_dims = ids.dims(); - auto parents_dims = parents.dims(); - PADDLE_ENFORCE_EQ(ids_dims == parents_dims, - true, - phi::errors::InvalidArgument( - "The shape of Input(Parents) must be same with the " - "shape of Input(Ids).")); - out->set_dims(ids_dims); +void KronInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { + auto dim_x = x.dims(); + auto dim_y = y.dims(); + auto rank_x = dim_x.size(); + auto rank_y = dim_y.size(); + auto rank = (rank_x > rank_y) ? rank_x : rank_y; + + std::vector dim_out; + dim_out.reserve(rank); + for (int i = 0; i < rank; i++) { + int64_t dim_xi = (i < rank - rank_x) ? 1 : dim_x.at(i - (rank - rank_x)); + int64_t dim_yi = (i < rank - rank_y) ? 1 : dim_y.at(i - (rank - rank_y)); + dim_out.push_back(dim_xi == -1 || dim_yi == -1 ? -1 : dim_xi * dim_yi); + } + out->set_dims(phi::make_ddim(dim_out)); + out->set_dtype(x.dtype()); } void LogLossInferMeta(const MetaTensor& input, @@ -690,6 +972,86 @@ void LogLossInferMeta(const MetaTensor& input, out->share_lod(input); } +void MaskedSelectInferMeta(const MetaTensor& x, + const MetaTensor& mask, + MetaTensor* out) { + out->set_dims({-1}); // can not infer + out->set_dtype(x.dtype()); +} + +void MatmulInferMeta(const MetaTensor& x, + const MetaTensor& y, + bool trans_x, + bool trans_y, + MetaTensor* out) { + std::vector dims_x = phi::vectorize(x.dims()); + std::vector dims_y = phi::vectorize(y.dims()); + auto ndims_x = dims_x.size(); + auto ndims_y = dims_y.size(); + PADDLE_ENFORCE_GT(ndims_x, + 0UL, + phi::errors::InvalidArgument( + "The Input(x) dims size must be greater than 0," + " but received dims size is 0. ")); + PADDLE_ENFORCE_GT(ndims_y, + 0UL, + phi::errors::InvalidArgument( + "The Input(y) dims size must be greater than 0," + " but received dims size is 0. 
")); + + bool x_broadcasted = false, y_broadcasted = false; + if (ndims_x == 1) { + dims_x.insert(dims_x.begin(), 1); + ndims_x = 2; + x_broadcasted = true; + } + + if (ndims_y == 1) { + dims_y.push_back(1); + ndims_y = 2; + y_broadcasted = true; + } + + size_t M, N; + if (trans_x) { + M = dims_x[ndims_x - 1]; + } else { + M = dims_x[ndims_x - 2]; + } + if (trans_y) { + N = dims_y[ndims_y - 2]; + } else { + N = dims_y[ndims_y - 1]; + } + + std::vector new_dims; + if (ndims_x > ndims_y) { + new_dims.assign(dims_x.begin(), dims_x.end() - 2); + } else if (ndims_x < ndims_y) { + new_dims.assign(dims_y.begin(), dims_y.end() - 2); + } else { + new_dims.reserve(ndims_x); + for (size_t i = 0; i < ndims_x - 2; ++i) { + new_dims.push_back(std::max(dims_x[i], dims_y[i])); + } + } + if (!x_broadcasted) { + new_dims.push_back(M); + } + if (!y_broadcasted) { + new_dims.push_back(N); + } + if (x_broadcasted && y_broadcasted) { + new_dims.push_back(1); + } + + auto ddim_out = phi::make_ddim(new_dims); + + out->set_dims(ddim_out); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); +} + void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out) { auto dim_x = x.dims(); auto dim_vec = vec.dims(); @@ -720,6 +1082,176 @@ void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out) { out->share_lod(x); } +void PReluInferMeta(const MetaTensor& x, + const MetaTensor& alpha, + const std::string& mode, + const std::string& data_format, + MetaTensor* out, + MetaConfig config) { + auto x_dim = x.dims(); + if (mode == "all") { + PADDLE_ENFORCE_EQ(phi::product(alpha.dims()), + 1, + phi::errors::InvalidArgument( + "For mode 'all', size of weight Alpha must be one. " + "But recevied alpha's size: %d.", + product(alpha.dims()))); + } else if (mode == "channel") { + auto x_rank = x_dim.size(); + PADDLE_ENFORCE_GE(x_rank, + 2, + phi::errors::InvalidArgument( + "For mode 'channel', rank of input X must be " + "equal or larger than 2. But recevied X's " + "rank: %d", + x_rank)); + PADDLE_ENFORCE_EQ(data_format == "NCHW" || data_format == "NHWC", + true, + phi::errors::InvalidArgument( + "For mode 'channel', data_format must be one of " + "NCHW and NHWC. But recevied data_format: %s", + data_format)); + if (data_format == "NCHW" || config.is_run_mkldnn_kernel) { + PADDLE_ENFORCE_EQ(product(alpha.dims()) == x_dim[1], + true, + phi::errors::InvalidArgument( + "For mode 'channel', size of weight Alpha must be " + "equal to the number of channels of input(x). But " + "recevied alpha's size: %d, x_dim[1]: %d", + product(alpha.dims()), + x_dim[1])); + } else { + PADDLE_ENFORCE_EQ(product(alpha.dims()) == x_dim[x_rank - 1], + true, + phi::errors::InvalidArgument( + "For mode 'channel', size of weight Alpha must be " + "equal to the number of channels of input(x). But " + "recevied alpha's size: %d, x_dim[%d]: %d", + product(alpha.dims()), + x_rank - 1, + x_dim[x_rank - 1])); + } + } else if (mode == "element") { + auto alpha_dim = alpha.dims(); + auto alpha_rank = alpha_dim.size(); + auto x_rank = x_dim.size(); + PADDLE_ENFORCE_GE(x_rank, + 1, + phi::errors::InvalidArgument( + "For mode 'element', rank of input X must be " + "equal or larger than 2. But recevied X's " + "rank: %d", + x_rank)); + PADDLE_ENFORCE_EQ( + alpha_rank, + x_rank, + phi::errors::InvalidArgument( + "For mode 'element', rank of weight Alpha must be ", + "equal to the rank of input(x). 
But received alpha's rank: %d, " + "x's rank: %d.", + alpha_rank, + x_rank)); + size_t x_product = 1; + size_t alpha_product = 1; + for (int64_t i = x_rank - 1; i > 0; i--) { + x_product *= x_dim[i]; + alpha_product *= alpha_dim[i]; + } + PADDLE_ENFORCE_EQ( + alpha_product, + x_product, + phi::errors::InvalidArgument( + "For mode 'element', the size of weight Alpha must be " + "equal to the size of input(x). But received alpha's size: %d, " + "x's size: %d.", + alpha_product, + x_product)); + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "Attr(mode) of prelu must be one of 'all', 'channel', or 'element'. " + "But received " + "mode: '%s'.", + mode)); + } + out->set_dims(x_dim); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); + out->share_lod(x); +} + +void SearchsortedInferMeta(const MetaTensor& sorted_sequence, + const MetaTensor& value, + bool out_int32, + bool right, + MetaTensor* out) { + auto sequences_dims = sorted_sequence.dims(); + auto values_dims = value.dims(); + + bool flag = true; + if (sequences_dims.size() != values_dims.size()) { + flag = false; + } + const auto& sequences_dims_size = sequences_dims.size(); + for (int64_t dim = 0; dim < sequences_dims_size - 1; ++dim) { + if (sequences_dims[dim] != values_dims[dim]) { + flag = false; + break; + } + } + if (sequences_dims.size() != 1) { + PADDLE_ENFORCE_EQ( + flag, + true, + phi::errors::Unavailable( + "The dimensions of sorted_sequence tensor ( %s ) and values " + "tensor ( %s ) can not match. Because the input sorted_sequence " + "tensor must be 1 dimension or the first N-1 dimensions of " + "sorted_sequence tensor and input values tensor must match. " + "Please input appropriate sorted_sequence and values again! ", + sequences_dims, + values_dims)); + } + + if (out_int32) { + PADDLE_ENFORCE_LT( + sequences_dims[sequences_dims.size() - 1], + std::numeric_limits::max(), + phi::errors::Unavailable( + "The size of sorted_sequence %d exceeds the maximum limit %d. " + "Because the size of sorted_sequence should be less than the " + "output maximum value for int32 bit. Please set appropriate " + "sorted_sequence to meet this requirement! ", + sequences_dims[sequences_dims.size() - 1], + std::numeric_limits::max())); + } + + out->set_dims(values_dims); + if (out_int32) { + out->set_dtype(DataType::INT32); + } else { + out->set_dtype(DataType::INT64); + } +} + +void SegmentPoolInferMeta(const MetaTensor& x, + const MetaTensor& segment_ids, + const std::string& pooltype, + MetaTensor* out, + MetaTensor* summed_ids, + MetaConfig config) { + auto dims = x.dims(); + dims[0] = -1; + out->set_dims(dims); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); + + if (pooltype == "MEAN") { + summed_ids->set_dims({-1, 1}); + summed_ids->set_dtype(x.dtype()); + summed_ids->set_layout(x.layout()); + } +} + void SigmoidCrossEntropyWithLogitsInferMeta(const MetaTensor& x, const MetaTensor& label, bool normalize, @@ -761,4 +1293,188 @@ void SigmoidCrossEntropyWithLogitsInferMeta(const MetaTensor& x, out->share_lod(x); } +void TriangularSolveInferMeta(const MetaTensor& x, + const MetaTensor& y, + bool upper, + bool transpose, + bool unitriangular, + MetaTensor* out) { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + + auto x_dims_n = x_dims.size(); + auto y_dims_n = y_dims.size(); + + PADDLE_ENFORCE_GE(x_dims_n, + 2, + phi::errors::InvalidArgument( + "The input tensor X's dimensions of TriangularSolveOp " + "should be >= 2. 
But received X's " + "dimensions = %d, X's shape = [%s]", + x_dims.size(), + x_dims)); + + PADDLE_ENFORCE_GE(y_dims_n, + 2, + phi::errors::InvalidArgument( + "The input tensor Y's dimensions of TriangularSolveOp " + "should be >=2. But received Y's " + "dimensions = %d, Y's shape = [%s]", + y_dims.size(), + y_dims)); + + PADDLE_ENFORCE_EQ(x_dims[x_dims_n - 2], + x_dims[x_dims_n - 1], + phi::errors::InvalidArgument( + "The inner-most 2 dimensions of Input(X) all should " + "be square matrices " + "But received X's shape[-2] = %d and shape[-1] = %d.", + x_dims[x_dims_n - 2], + x_dims[x_dims_n - 1])); + + std::vector x_dims_vec = phi::vectorize(x_dims); + std::vector y_dims_vec = phi::vectorize(y_dims); + + std::vector x_dims_vec_cut(x_dims_vec.begin(), x_dims_vec.end() - 2); + std::vector y_dims_vec_cut(y_dims_vec.begin(), y_dims_vec.end() - 2); + + std::vector expand_batch_portion = + funcs::MatrixGetBroadcastBatchPortion(x_dims_vec_cut, y_dims_vec_cut); + + std::vector y_broadcast_dims({expand_batch_portion}); + y_broadcast_dims.insert(y_broadcast_dims.end(), + {y_dims_vec[y_dims_n - 2], y_dims_vec[y_dims_n - 1]}); + + // dim of 'out' is the same with 'Y' after broadcast + out->set_dims(phi::make_ddim(y_broadcast_dims)); + out->set_dtype(y.dtype()); + out->set_layout(y.layout()); + out->share_lod(y); +} + +void YoloBoxInferMeta(const MetaTensor& x, + const MetaTensor& img_size, + const std::vector& anchors, + int class_num, + float conf_thresh, + int downsample_ratio, + bool clip_bbox, + float scale_x_y, + bool iou_aware, + float iou_aware_factor, + MetaTensor* boxes, + MetaTensor* scores, + MetaConfig config) { + auto dim_x = x.dims(); + auto dim_imgsize = img_size.dims(); + int anchor_num = anchors.size() / 2; + + PADDLE_ENFORCE_EQ( + dim_x.size(), + 4, + phi::errors::InvalidArgument("Input(X) should be a 4-D tensor." + "But received X dimension(%s)", + dim_x.size())); + if (iou_aware) { + PADDLE_ENFORCE_EQ( + dim_x[1], + anchor_num * (6 + class_num), + phi::errors::InvalidArgument( + "Input(X) dim[1] should be equal to (anchor_mask_number * (6 " + "+ class_num)) while iou_aware is true." + "But received dim[1](%s) != (anchor_mask_number * " + "(6+class_num)(%s).", + dim_x[1], + anchor_num * (6 + class_num))); + PADDLE_ENFORCE_GE( + iou_aware_factor, + 0, + phi::errors::InvalidArgument( + "Attr(iou_aware_factor) should greater than or equal to 0." + "But received iou_aware_factor (%s)", + iou_aware_factor)); + PADDLE_ENFORCE_LE( + iou_aware_factor, + 1, + phi::errors::InvalidArgument( + "Attr(iou_aware_factor) should less than or equal to 1." + "But received iou_aware_factor (%s)", + iou_aware_factor)); + } else { + PADDLE_ENFORCE_EQ( + dim_x[1], + anchor_num * (5 + class_num), + phi::errors::InvalidArgument( + "Input(X) dim[1] should be equal to (anchor_mask_number * (5 " + "+ class_num))." + "But received dim[1](%s) != (anchor_mask_number * " + "(5+class_num)(%s).", + dim_x[1], + anchor_num * (5 + class_num))); + } + PADDLE_ENFORCE_EQ( + dim_imgsize.size(), + 2, + phi::errors::InvalidArgument("Input(ImgSize) should be a 2-D tensor." + "But received Imgsize size(%s)", + dim_imgsize.size())); + if ((dim_imgsize[0] > 0 && dim_x[0] > 0) || config.is_runtime) { + PADDLE_ENFORCE_EQ( + dim_imgsize[0], + dim_x[0], + phi::errors::InvalidArgument( + "Input(ImgSize) dim[0] and Input(X) dim[0] should be same.")); + } + PADDLE_ENFORCE_EQ( + dim_imgsize[1], + 2, + phi::errors::InvalidArgument("Input(ImgSize) dim[1] should be 2." 
+ "But received imgsize dim[1](%s).", + dim_imgsize[1])); + PADDLE_ENFORCE_GT(anchors.size(), + 0, + phi::errors::InvalidArgument( + "Attr(anchors) length should be greater than 0." + "But received anchors length(%s).", + anchors.size())); + PADDLE_ENFORCE_EQ(anchors.size() % 2, + 0, + phi::errors::InvalidArgument( + "Attr(anchors) length should be even integer." + "But received anchors length (%s)", + anchors.size())); + PADDLE_ENFORCE_GT(class_num, + 0, + phi::errors::InvalidArgument( + "Attr(class_num) should be an integer greater than 0." + "But received class_num (%s)", + class_num)); + + int box_num; + if ((dim_x[2] > 0 && dim_x[3] > 0) || config.is_runtime) { + box_num = dim_x[2] * dim_x[3] * anchor_num; + } else { + box_num = -1; + } + std::vector dim_boxes({dim_x[0], box_num, 4}); + boxes->set_dims(phi::make_ddim(dim_boxes)); + boxes->set_dtype(x.dtype()); + + std::vector dim_scores({dim_x[0], box_num, class_num}); + scores->set_dims(phi::make_ddim(dim_scores)); +} + +void ValueCompareInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out, + MetaConfig config) { + detail::BinarySameInputDimsCheck(x, y, config); + + out->set_dims(x.dims()); + out->set_dtype(DataType::BOOL); +} + } // namespace phi + +PD_REGISTER_INFER_META_FN(add_raw, phi::ElementwiseRawInferMeta); +PD_REGISTER_INFER_META_FN(conv2d, phi::ConvInferMeta); diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index d2b16e557b06dc94107788995f0c26f1e27e1761..d770a096de7c922c674b7edda55ae8cb531a6d00 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include "paddle/phi/common/scalar.h" #include "paddle/phi/core/meta_tensor.h" namespace phi { @@ -28,23 +29,71 @@ namespace phi { // NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good. // Because functions in this file not only can infer shape, but also need // infer lod or other useful data. +// +// The InferMeta Functions in this file are arranged in alphabetic order. 
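// ---------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of this patch): every function
// declared below follows the same contract -- validate the input MetaTensors,
// then fill in the output's dims/dtype/layout/lod, optionally consulting
// MetaConfig to decide whether runtime-only checks apply. A minimal,
// hypothetical binary InferMeta written against that contract could look like
// the following; the name ExampleSameShapeInferMeta is invented purely for
// illustration and does not exist in the codebase:
//
//   void ExampleSameShapeInferMeta(const MetaTensor& x,
//                                  const MetaTensor& y,
//                                  MetaTensor* out,
//                                  MetaConfig config = MetaConfig()) {
//     if (config.is_runtime) {
//       // Shapes may still contain -1 at compile time, so only enforce
//       // equality when the kernel is actually being run.
//       PADDLE_ENFORCE_EQ(x.dims(),
//                         y.dims(),
//                         phi::errors::InvalidArgument(
//                             "Input(X) and Input(Y) should have the same "
//                             "shape, but received [%s] and [%s].",
//                             x.dims(),
//                             y.dims()));
//     }
//     // Propagate meta information from X to the output.
//     out->set_dims(x.dims());
//     out->set_dtype(x.dtype());
//     out->set_layout(x.layout());
//     out->share_lod(x);
//   }
// ---------------------------------------------------------------------------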
+ +void AllValueCompareInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out, + MetaConfig config = MetaConfig()); + +void KLDivInferMeta(const MetaTensor& x, + const MetaTensor& label, + const std::string& reduction, + MetaTensor* out, + MetaConfig config = MetaConfig()); + +void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out); + +void BCELossInferMeta(const MetaTensor& input, + const MetaTensor& label, + MetaTensor* out, + MetaConfig config = MetaConfig()); + +void BincountInferMeta(const MetaTensor& x, + const paddle::optional weights, + int minlength, + MetaTensor* out); + +void CholeskySolveInferMeta(const MetaTensor& x, + const MetaTensor& y, + bool upper, + MetaTensor* out); + +void CompareAllInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out); void CompareInferMeta(const MetaTensor& x, const MetaTensor& y, int axis, MetaTensor* out); -void CompareAllInferMeta(const MetaTensor& x, - const MetaTensor& y, - MetaTensor* out); +void ConvInferMeta(const MetaTensor& input, + const MetaTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + MetaTensor* out, + MetaConfig config = MetaConfig()); -void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out); +void CrossInferMeta(const MetaTensor& x, + const MetaTensor& y, + int axis, + MetaTensor* out); -void MatmulInferMeta(const MetaTensor& x, - const MetaTensor& y, - bool trans_x, - bool trans_y, - MetaTensor* out); +void DistInferMeta(const MetaTensor& x, + const MetaTensor& y, + float p, + MetaTensor* out); + +void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out); void ElementwiseInferMeta(const MetaTensor& x, const MetaTensor& y, @@ -55,6 +104,29 @@ void ElementwiseRawInferMeta(const MetaTensor& x_meta, int axis, MetaTensor* out); +void ExpandAsInferMeta(const MetaTensor& x, + paddle::optional y, + const std::vector& target_shape, + MetaTensor* out); + +void GatherInferMeta(const MetaTensor& x, + const MetaTensor& index, + const Scalar& axis, + MetaTensor* out); + +void GatherNdInferMeta(const MetaTensor& x, + const MetaTensor& index, + MetaTensor* out); + +void GatherTreeMeta(const MetaTensor& ids, + const MetaTensor& parents, + MetaTensor* out); + +void GridSampleBaseInferMeta(const MetaTensor& x, + const MetaTensor& grid, + MetaTensor* out, + MetaConfig config = MetaConfig()); + void HuberLossInferMeta(const MetaTensor& input_meta, const MetaTensor& label_meta, float delta, @@ -62,67 +134,55 @@ void HuberLossInferMeta(const MetaTensor& input_meta, MetaTensor* residual, MetaConfig config = MetaConfig()); -void CholeskySolveInferMeta(const MetaTensor& x, - const MetaTensor& y, - bool upper, - MetaTensor* out); - -void TriangularSolveInferMeta(const MetaTensor& x, - const MetaTensor& y, - bool upper, - bool transpose, - bool unitriangular, - MetaTensor* out); - void IndexSampleInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out, MetaConfig config = MetaConfig()); -void CrossInferMeta(const MetaTensor& x, - const MetaTensor& y, - int axis, - MetaTensor* out); - -void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out); +void IndexSelectInferMeta(const MetaTensor& x, + const MetaTensor& index, + int dim, + MetaTensor* output); -void SegmentPoolInferMeta(const MetaTensor& x, - const 
MetaTensor& segment_ids, - const std::string& pooltype, - MetaTensor* out, - MetaTensor* summed_ids, - MetaConfig config = MetaConfig()); +void KronInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out); -void BCELossInferMeta(const MetaTensor& input, +void LogLossInferMeta(const MetaTensor& input, const MetaTensor& label, + float epsilon, MetaTensor* out, MetaConfig config = MetaConfig()); -void BincountInferMeta(const MetaTensor& x, - const paddle::optional weights, - int minlength, - MetaTensor* out); +void MaskedSelectInferMeta(const MetaTensor& x, + const MetaTensor& mask, + MetaTensor* out); -void DistInferMeta(const MetaTensor& x, - const MetaTensor& y, - float p, - MetaTensor* out); +void MatmulInferMeta(const MetaTensor& x, + const MetaTensor& y, + bool trans_x, + bool trans_y, + MetaTensor* out); -void GatherNdInferMeta(const MetaTensor& x, - const MetaTensor& index, - MetaTensor* out); +void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out); -void GatherTreeMeta(const MetaTensor& ids, - const MetaTensor& parents, - MetaTensor* out); +void PReluInferMeta(const MetaTensor& x, + const MetaTensor& alpha, + const std::string& mode, + const std::string& data_format, + MetaTensor* out, + MetaConfig config); -void LogLossInferMeta(const MetaTensor& input, - const MetaTensor& label, - float epsilon, - MetaTensor* out, - MetaConfig config = MetaConfig()); +void SearchsortedInferMeta(const MetaTensor& sorted_sequence, + const MetaTensor& value, + bool out_int32, + bool right, + MetaTensor* out); -void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out); +void SegmentPoolInferMeta(const MetaTensor& x, + const MetaTensor& segment_ids, + const std::string& pooltype, + MetaTensor* out, + MetaTensor* summed_ids, + MetaConfig config = MetaConfig()); void SigmoidCrossEntropyWithLogitsInferMeta(const MetaTensor& x, const MetaTensor& label, @@ -131,4 +191,30 @@ void SigmoidCrossEntropyWithLogitsInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void TriangularSolveInferMeta(const MetaTensor& x, + const MetaTensor& y, + bool upper, + bool transpose, + bool unitriangular, + MetaTensor* out); + +void YoloBoxInferMeta(const MetaTensor& x, + const MetaTensor& img_size, + const std::vector& anchors, + int class_num, + float conf_thresh, + int downsample_ratio, + bool clip_bbox, + float scale_x_y, + bool iou_aware, + float iou_aware_factor, + MetaTensor* boxes, + MetaTensor* scores, + MetaConfig config = MetaConfig()); + +void ValueCompareInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out, + MetaConfig config = MetaConfig()); + } // namespace phi diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index acce40713b82159e9e6fd902a30c8b269c6c4e52..3e9da9a217a0a8837d7edadc70401fdad04b4869 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -14,7 +14,9 @@ limitations under the License. 
*/ #include "paddle/phi/infermeta/multiary.h" #include +#include "paddle/phi/common/layout.h" #include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/core/meta_tensor.h" #include "paddle/phi/kernels/funcs/concat_funcs.h" namespace phi { @@ -28,6 +30,98 @@ std::vector GetMetaTensorsDim(const std::vector& tensors) { return dims; } +void AdadeltaInferMeta(const MetaTensor& param, + const MetaTensor& grad, + const MetaTensor& avg_squared_grad, + const MetaTensor& avg_squared_update, + float rho, + float epsilon, + MetaTensor* param_out, + MetaTensor* avg_squared_grad_out, + MetaTensor* avg_squared_update_out) { + auto param_dims = param.dims(); + PADDLE_ENFORCE_EQ( + param_dims, + grad.dims(), + errors::InvalidArgument( + "Param and grad input of AdadeltaOp should have same dimension.")); + PADDLE_ENFORCE_EQ( + param_dims, + avg_squared_grad.dims(), + errors::InvalidArgument("Param and AvgSquaredGrad input of AdadeltaOp " + "should have same dimension")); + PADDLE_ENFORCE_EQ( + param_dims, + avg_squared_update.dims(), + errors::InvalidArgument("Param and AvgSquaredUpdate input of AdadeltaOp " + "should have same dimension")); + + param_out->set_dims(param_dims); + param_out->set_dtype(param.dtype()); + + avg_squared_grad_out->set_dims(param_dims); + avg_squared_grad_out->set_dtype(avg_squared_grad.dtype()); + + avg_squared_update_out->set_dims(param_dims); + avg_squared_update_out->set_dtype(avg_squared_update.dtype()); +} + +void AdamaxInferMeta(const MetaTensor& param, + const MetaTensor& grad, + const MetaTensor& learning_rate, + const MetaTensor& moment, + const MetaTensor& inf_norm, + const MetaTensor& beta1_pow, + float beta1, + float beta2, + float epsilon, + MetaTensor* param_out, + MetaTensor* moment_out, + MetaTensor* inf_norm_out) { + auto lr_dims = learning_rate.dims(); + PADDLE_ENFORCE_NE( + product(lr_dims), + 0, + errors::InvalidArgument("Maybe the Input variable LearningRate has not " + "been initialized. 
You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function.")); + PADDLE_ENFORCE_EQ( + product(lr_dims), + 1, + errors::InvalidArgument("Learning rate should have 1 dimension")); + auto beta1_pow_dims = beta1_pow.dims(); + PADDLE_ENFORCE_EQ(product(beta1_pow_dims), + 1, + errors::InvalidArgument( + "Beta1 power accumulator should have 1 dimension")); + auto param_dims = param.dims(); + PADDLE_ENFORCE_EQ( + param_dims, + grad.dims(), + errors::InvalidArgument( + "Param and Grad input of AdamaxOp should have same dimension")); + PADDLE_ENFORCE_EQ( + param_dims, + moment.dims(), + errors::InvalidArgument( + "Param and Moment input of AdamaxOp should have same dimension")); + PADDLE_ENFORCE_EQ( + param_dims, + inf_norm.dims(), + errors::InvalidArgument( + "Param and InfNorm input of AdamaxOp should have same dimension")); + + param_out->set_dims(param_dims); + param_out->set_dtype(param.dtype()); + + moment_out->set_dims(param_dims); + moment_out->set_dtype(moment.dtype()); + + inf_norm_out->set_dims(param_dims); + inf_norm_out->set_dtype(inf_norm.dtype()); +} + void AucInferMeta(const MetaTensor& input, const MetaTensor& label, const MetaTensor& stat_pos, @@ -108,96 +202,112 @@ void AucInferMeta(const MetaTensor& input, } } -void AdamaxInferMeta(const MetaTensor& param, - const MetaTensor& grad, - const MetaTensor& learning_rate, - const MetaTensor& moment, - const MetaTensor& inf_norm, - const MetaTensor& beta1_pow, - float beta1, - float beta2, - float epsilon, - MetaTensor* param_out, - MetaTensor* moment_out, - MetaTensor* inf_norm_out) { - auto lr_dims = learning_rate.dims(); - PADDLE_ENFORCE_NE( - product(lr_dims), - 0, - errors::InvalidArgument("Maybe the Input variable LearningRate has not " - "been initialized. You may need to confirm " - "if you put exe.run(startup_program) " - "after optimizer.minimize function.")); - PADDLE_ENFORCE_EQ( - product(lr_dims), - 1, - errors::InvalidArgument("Learning rate should have 1 dimension")); - auto beta1_pow_dims = beta1_pow.dims(); - PADDLE_ENFORCE_EQ(product(beta1_pow_dims), - 1, - errors::InvalidArgument( - "Beta1 power accumulator should have 1 dimension")); - auto param_dims = param.dims(); - PADDLE_ENFORCE_EQ( - param_dims, - grad.dims(), - errors::InvalidArgument( - "Param and Grad input of AdamaxOp should have same dimension")); - PADDLE_ENFORCE_EQ( - param_dims, - moment.dims(), - errors::InvalidArgument( - "Param and Moment input of AdamaxOp should have same dimension")); - PADDLE_ENFORCE_EQ( - param_dims, - inf_norm.dims(), - errors::InvalidArgument( - "Param and InfNorm input of AdamaxOp should have same dimension")); +void BatchNormInferMeta(const MetaTensor& x, + const MetaTensor& scale, + const MetaTensor& bias, + const MetaTensor& mean, + const MetaTensor& variance, + float momentum, + float epsilon, + const std::string& data_layout_str, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + MetaTensor* y, + MetaTensor* mean_out, + MetaTensor* variance_out, + MetaTensor* saved_mean, + MetaTensor* saved_variance, + MetaTensor* reserve_space, + MetaConfig config) { + const auto x_dims = x.dims(); + for (int i = 0; i < x_dims.size(); i++) { + PADDLE_ENFORCE_EQ( + (x_dims[i] == -1) || (x_dims[i] > 0), + true, + phi::errors::InvalidArgument( + "Each dimension of input tensor is expected to be -1 or a " + "positive number, but recieved %d. 
Input's shape is [%s].", + x_dims[i], + x_dims)); + } - param_out->set_dims(param_dims); - param_out->set_dtype(param.dtype()); + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); - moment_out->set_dims(param_dims); - moment_out->set_dtype(moment.dtype()); - - inf_norm_out->set_dims(param_dims); - inf_norm_out->set_dtype(inf_norm.dtype()); -} + PADDLE_ENFORCE_GE( + x_dims.size(), + 2, + phi::errors::InvalidArgument( + "ShapeError: the dimension of input " + "X must greater than or equal to 2. But received: the shape of input " + "X = [%s], the dimension of input X =[%d]", + x_dims, + x_dims.size())); + PADDLE_ENFORCE_LE( + x_dims.size(), + 5, + phi::errors::InvalidArgument( + "ShapeError: the dimension of input X " + "must smaller than or equal to 5. But received: the shape of input X " + "= [%s], the dimension of input X = [%d]", + x_dims, + x_dims.size())); + + const int64_t C = ((config.is_run_mkldnn_kernel == true) || + (data_layout == DataLayout::kNCHW) + ? x_dims[1] + : x_dims[x_dims.size() - 1]); + auto scale_dim = scale.dims(); + auto bias_dim = bias.dims(); -void AdadeltaInferMeta(const MetaTensor& param, - const MetaTensor& grad, - const MetaTensor& avg_squared_grad, - const MetaTensor& avg_squared_update, - float rho, - float epsilon, - MetaTensor* param_out, - MetaTensor* avg_squared_grad_out, - MetaTensor* avg_squared_update_out) { - auto param_dims = param.dims(); - PADDLE_ENFORCE_EQ( - param_dims, - grad.dims(), - errors::InvalidArgument( - "Param and grad input of AdadeltaOp should have same dimension.")); - PADDLE_ENFORCE_EQ( - param_dims, - avg_squared_grad.dims(), - errors::InvalidArgument("Param and AvgSquaredGrad input of AdadeltaOp " - "should have same dimension")); PADDLE_ENFORCE_EQ( - param_dims, - avg_squared_update.dims(), - errors::InvalidArgument("Param and AvgSquaredUpdate input of AdadeltaOp " - "should have same dimension")); - - param_out->set_dims(param_dims); - param_out->set_dtype(param.dtype()); - - avg_squared_grad_out->set_dims(param_dims); - avg_squared_grad_out->set_dtype(avg_squared_grad.dtype()); + scale_dim.size(), + 1UL, + phi::errors::InvalidArgument( + "ShapeError: the dimension of scale must equal to 1." + "But received: the shape of scale is [%s], the dimension " + "of scale is [%d]", + scale_dim, + scale_dim.size())); + PADDLE_ENFORCE_EQ(bias_dim.size(), + 1UL, + phi::errors::InvalidArgument( + "ShapeError: the dimension of bias must equal to 1." 
+ "But received: the shape of bias is [%s],the dimension " + "of bias is [%d]", + bias_dim, + bias_dim.size())); + + bool check = true; + if ((!config.is_runtime) && + (phi::product(scale_dim) <= 0 || phi::product(bias_dim) <= 0)) { + check = false; + } - avg_squared_update_out->set_dims(param_dims); - avg_squared_update_out->set_dtype(avg_squared_update.dtype()); + if (check) { + PADDLE_ENFORCE_EQ(scale_dim[0], + C, + phi::errors::InvalidArgument( + "ShapeError: the shape of scale must equal to [%d]" + "But received: the shape of scale is [%d]", + C, + scale_dim[0])); + PADDLE_ENFORCE_EQ(bias_dim[0], + C, + phi::errors::InvalidArgument( + "ShapeError: the shape of bias must equal to [%d]" + "But received: the shape of bias is [%d]", + C, + bias_dim[0])); + } + y->set_dims(x_dims); + mean_out->set_dims({C}); + variance_out->set_dims({C}); + saved_mean->set_dims({C}); + saved_variance->set_dims({C}); + y->share_lod(x); } void BilinearTensorProductInferMeta(const MetaTensor& x, @@ -369,6 +479,188 @@ void ConcatInferMeta(const std::vector& x, out->share_lod(*x.at(0)); } +void HierarchicalSigmoidInferMeta(const MetaTensor& x, + const MetaTensor& w, + const MetaTensor& label, + paddle::optional path, + paddle::optional code, + paddle::optional bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + MetaTensor* out, + MetaTensor* pre_out, + MetaTensor* w_out) { + const int64_t input_dims = x.dims()[0]; + const int64_t label_dims = label.dims()[0]; + PADDLE_ENFORCE_EQ(input_dims, + label_dims, + phi::errors::InvalidArgument( + "The first dimension of " + "input and label is expected to be the same. " + "But received input's first dimension is %d; " + "label's first dimension is %d.", + input_dims, + label_dims)); + + std::vector output_shape({input_dims, 1}); + out->set_dims(phi::make_ddim(output_shape)); + out->share_lod(x); + out->set_dtype(x.dtype()); +} + +void MultiDotInferMeta(const std::vector& x, MetaTensor* out) { + auto inputs_dims = GetMetaTensorsDim(x); + + const size_t inputs_num = inputs_dims.size(); + PADDLE_ENFORCE_GT( + inputs_num, + static_cast(1), + phi::errors::InvalidArgument( + "The number of input tensors in multi_dot op should > 1.")); + + const size_t n = inputs_dims.size(); + auto first_dim = inputs_dims[0]; + + bool is_vector = false; + phi::DDim out_dim; + + PADDLE_ENFORCE_LT( + first_dim.size(), + static_cast(3), + phi::errors::InvalidArgument( + "multi_dot: the first input tensor must be 1D or 2D but got[%d]!", + static_cast(first_dim.size()))); + + // If the first tensor is 1D of size n view it as a row vector (1, n) + if (first_dim.size() == 1) { + first_dim = phi::make_ddim({1, static_cast(first_dim[0])}); + is_vector = true; + } + + auto last_dim = inputs_dims[n - 1]; + PADDLE_ENFORCE_LT( + last_dim.size(), + static_cast(3), + phi::errors::InvalidArgument( + "the last input tensor of multi_dot must be 1D or 2D but got[%d]!", + static_cast(first_dim.size()))); + + // If the last tensor is 1D of size n view it as a column vector (n, 1) + if (last_dim.size() == 1) { + last_dim = phi::make_ddim({static_cast(last_dim[0]), 1}); + out_dim = is_vector ? phi::make_ddim({1}) : phi::make_ddim({first_dim[0]}); + } else { + out_dim = is_vector ? 
phi::make_ddim({last_dim[1]}) + : phi::make_ddim({first_dim[0], last_dim[1]}); + } + + auto width = first_dim[1]; + for (size_t i = 1; i < n - 1; i++) { + PADDLE_ENFORCE_EQ(inputs_dims[i].size(), + static_cast(2), + phi::errors::InvalidArgument( + "the input tensor of multi_dot op must be 2D.")); + + const auto& tmp_dim = inputs_dims[i]; + PADDLE_ENFORCE_EQ( + tmp_dim[0], + width, + phi::errors::InvalidArgument( + "the input matrix does not meet the multiplication requirements.")); + width = tmp_dim[1]; + } + + PADDLE_ENFORCE_EQ( + last_dim[0], + width, + phi::errors::InvalidArgument( + "the input matrix does not meet the multiplication requirements.")); + + out->set_dims(out_dim); + out->set_dtype(x.at(0)->dtype()); + out->share_lod(*x.at(0)); +} + +void PsroiPoolInferMeta(const MetaTensor& x, + const MetaTensor& rois, + paddle::optional rois_num, + int pooled_height, + int pooled_width, + int output_channels, + float spatial_scale, + MetaTensor* out) { + auto input_dims = x.dims(); + auto rois_dims = rois.dims(); + + PADDLE_ENFORCE_EQ( + input_dims.size(), + 4, + errors::InvalidArgument("The format of input tensor is NCHW")); + PADDLE_ENFORCE_EQ(rois_dims.size(), + 2, + errors::InvalidArgument( + "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " + "given as [(x1, y1, x2, y2), ...]")); + PADDLE_ENFORCE_EQ(rois_dims[1], + 4, + errors::InvalidArgument( + "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " + "given as [(x1, y1, x2, y2), ...]")); + if (rois_num.get_ptr()) { + auto rois_num_dims = rois_num->dims(); + PADDLE_ENFORCE_EQ( + rois_num_dims.size(), + 1, + errors::InvalidArgument("The second dimension of RoisNum should " + "be 1, but received dimension is %d", + rois_num_dims.size())); + } + + PADDLE_ENFORCE_EQ( + input_dims[1], + output_channels * pooled_height * pooled_width, + errors::InvalidArgument( + "the channel of X(%d) " + "should be equal to the product of " + "output_channels(%d), pooled_height(%d) and pooled_width(%d)", + input_dims[1], + output_channels, + pooled_height, + pooled_width)); + + PADDLE_ENFORCE_GT(pooled_height, + 0, + errors::InvalidArgument( + "The pooled output height must be greater than 0")); + PADDLE_ENFORCE_GT(pooled_width, + 0, + errors::InvalidArgument( + "The pooled output width must be greater than 0")); + PADDLE_ENFORCE_GT(output_channels, + 1, + errors::InvalidArgument( + "The pooled output channels must greater than 1")); + PADDLE_ENFORCE_GT( + spatial_scale, + 0.0f, + errors::InvalidArgument("The spatial scale must greater than 0.")); + + auto out_dims = input_dims; + out_dims[0] = rois_dims[0]; + out_dims[1] = + output_channels; // input_dims[1] / (pooled_height * pooled_width); + out_dims[2] = pooled_height; + out_dims[3] = pooled_width; + + out->set_dims(out_dims); + out->set_dtype(x.dtype()); +} + void WhereInferMeta(const MetaTensor& condition, const MetaTensor& x, const MetaTensor& y, @@ -395,3 +687,5 @@ void WhereInferMeta(const MetaTensor& condition, } } // namespace phi + +PD_REGISTER_INFER_META_FN(batch_norm, phi::BatchNormInferMeta); diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 26bdc62302f18ad011fd3ab74f4b2dd708d4c1ef..068766c0e11671c93285c077ab2328ac20134a13 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -18,8 +18,48 @@ limitations under the License. */ #include "paddle/phi/core/meta_tensor.h" namespace phi { +// Common InferMeta Functions for multiary operators, The format like: +// +// 1. 
The number of input MetaTensor is more than 3: +// void [FunctionDesc|OpName]InferMeta(const MetaTensor& x, +// const MetaTensor& y, +// const MetaTensor& z, +// const MetaTensor& w, +// ..., +// MetaTensor* out) {} +// +// 2. There are `const vector&` in params: +// void [FunctionDesc|OpName]InferMeta(const vector& x, +// ..., +// MetaTensor* out) {} +// +// NOTE: The InferMeta Functions in this file are arranged in alphabetic order. + std::vector GetMetaTensorsDim(const std::vector& tensors); +void AdadeltaInferMeta(const MetaTensor& param, + const MetaTensor& grad, + const MetaTensor& avg_squared_grad, + const MetaTensor& avg_squared_update, + float rho, + float epsilon, + MetaTensor* param_out, + MetaTensor* avg_squared_grad_out, + MetaTensor* avg_squared_update_out); + +void AdamaxInferMeta(const MetaTensor& param, + const MetaTensor& grad, + const MetaTensor& learning_rate, + const MetaTensor& moment, + const MetaTensor& inf_norm, + const MetaTensor& beta1_pow, + float beta1, + float beta2, + float epsilon, + MetaTensor* param_out, + MetaTensor* moment_out, + MetaTensor* inf_norm_out); + void AucInferMeta(const MetaTensor& input, const MetaTensor& label, const MetaTensor& stat_pos, @@ -32,6 +72,26 @@ void AucInferMeta(const MetaTensor& input, MetaTensor* stat_neg_out, MetaConfig config = MetaConfig()); +void BatchNormInferMeta(const MetaTensor& x, + const MetaTensor& scale, + const MetaTensor& bias, + const MetaTensor& mean, + const MetaTensor& variance, + float momentum, + float epsilon, + const std::string& data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + MetaTensor* y, + MetaTensor* mean_out, + MetaTensor* variance_out, + MetaTensor* saved_mean, + MetaTensor* saved_variance, + MetaTensor* reserve_space, + MetaConfig config = MetaConfig()); + void BilinearTensorProductInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& weight, @@ -47,32 +107,37 @@ void ConcatInferMeta(const std::vector& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void HierarchicalSigmoidInferMeta(const MetaTensor& x, + const MetaTensor& w, + const MetaTensor& label, + paddle::optional path, + paddle::optional code, + paddle::optional bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + MetaTensor* out, + MetaTensor* pre_out, + MetaTensor* w_out); + +void MultiDotInferMeta(const std::vector& x, MetaTensor* out); + +void PsroiPoolInferMeta(const MetaTensor& x, + const MetaTensor& rois, + paddle::optional rois_num, + int pooled_height, + int pooled_width, + int output_channels, + float spatial_scale, + MetaTensor* out); + void WhereInferMeta(const MetaTensor& condition, const MetaTensor& x, const MetaTensor& y, MetaTensor* out); -void AdamaxInferMeta(const MetaTensor& param, - const MetaTensor& grad, - const MetaTensor& learning_rate, - const MetaTensor& moment, - const MetaTensor& inf_norm, - const MetaTensor& beta1_pow, - float beta1, - float beta2, - float epsilon, - MetaTensor* param_out, - MetaTensor* moment_out, - MetaTensor* inf_norm_out); - -void AdadeltaInferMeta(const MetaTensor& param, - const MetaTensor& grad, - const MetaTensor& avg_squared_grad, - const MetaTensor& avg_squared_update, - float rho, - float epsilon, - MetaTensor* param_out, - MetaTensor* avg_squared_grad_out, - MetaTensor* avg_squared_update_out); - } // namespace phi diff --git a/paddle/phi/infermeta/nullary.cc 
b/paddle/phi/infermeta/nullary.cc index 506d3fd14ea3fd568ce2f77d7ce30408062279e9..081084567e840f287bb113ee567888f4032f5638 100644 --- a/paddle/phi/infermeta/nullary.cc +++ b/paddle/phi/infermeta/nullary.cc @@ -16,6 +16,12 @@ limitations under the License. */ namespace phi { +void CreateInferMeta(const ScalarArray& shape, + DataType dtype, + MetaTensor* out) { + CreateInferMetaBase(shape.GetData(), dtype, DataLayout::NCHW, out); +} + void CreateInferMetaBase(const std::vector& shape, DataType dtype, DataLayout layout, @@ -26,12 +32,6 @@ void CreateInferMetaBase(const std::vector& shape, out->set_layout(layout); } -void CreateInferMeta(const ScalarArray& shape, - DataType dtype, - MetaTensor* out) { - CreateInferMetaBase(shape.GetData(), dtype, DataLayout::NCHW, out); -} - void EyeInferMeta(int64_t num_rows, int64_t num_columns, DataType dtype, @@ -41,18 +41,6 @@ void EyeInferMeta(int64_t num_rows, out->set_dtype(dtype); } -void TruncatedGaussianRandomInferMeta(const std::vector& shape, - float mean, - float std, - int seed, - DataType dtype, - MetaTensor* out) { - auto out_dims = phi::make_ddim(shape); - out->set_dims(out_dims); - out->set_dtype(dtype); - out->set_layout(DataLayout::NCHW); -} - void GaussianRandomInferMeta(const ScalarArray& shape, float mean, float std, @@ -65,4 +53,16 @@ void GaussianRandomInferMeta(const ScalarArray& shape, out->set_layout(DataLayout::NCHW); } +void TruncatedGaussianRandomInferMeta(const std::vector& shape, + float mean, + float std, + int seed, + DataType dtype, + MetaTensor* out) { + auto out_dims = phi::make_ddim(shape); + out->set_dims(out_dims); + out->set_dtype(dtype); + out->set_layout(DataLayout::NCHW); +} + } // namespace phi diff --git a/paddle/phi/infermeta/nullary.h b/paddle/phi/infermeta/nullary.h index bd0567486e4d62a9f6fe9adfa02727bfe79937e1..55e59b27e71cfb1d9b16a659e40d299ed3f2fc54 100644 --- a/paddle/phi/infermeta/nullary.h +++ b/paddle/phi/infermeta/nullary.h @@ -27,26 +27,21 @@ namespace phi { // NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good. // Because functions in this file not only can infer shape, but also need // infer lod or other useful data. +// +// The InferMeta Functions in this file are arranged in alphabetic order. + +void CreateInferMeta(const ScalarArray& shape, DataType dtype, MetaTensor* out); void CreateInferMetaBase(const std::vector& shape, DataType dtype, DataLayout layout, MetaTensor* out); -void CreateInferMeta(const ScalarArray& shape, DataType dtype, MetaTensor* out); - void EyeInferMeta(int64_t num_rows, int64_t num_columns, DataType dtype, MetaTensor* out); -void TruncatedGaussianRandomInferMeta(const std::vector& shape, - float mean, - float std, - int seed, - DataType dtype, - MetaTensor* out); - void GaussianRandomInferMeta(const ScalarArray& shape, float mean, float std, @@ -54,4 +49,11 @@ void GaussianRandomInferMeta(const ScalarArray& shape, DataType dtype, MetaTensor* out); +void TruncatedGaussianRandomInferMeta(const std::vector& shape, + float mean, + float std, + int seed, + DataType dtype, + MetaTensor* out); + } // namespace phi diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index 88ac2cb0f8d1b01ade0e58bc8f1253c67ad05981..556fb874470dd248dcf7c77e8a8ac3510bd6f63e 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -18,6 +18,58 @@ limitations under the License. 
*/ namespace phi { +void AccuracyInferMeta(const MetaTensor& out, + const MetaTensor& indice, + const MetaTensor& label, + MetaTensor* accuracy, + MetaTensor* correct, + MetaTensor* total, + MetaConfig config) { + auto inference_dim = out.dims(); + auto label_dim = label.dims(); + // Assume indices has same shape as inference, because + // it's the output of topk. + PADDLE_ENFORCE_EQ( + label_dim.size(), + 2, + phi::errors::InvalidArgument( + "ShapeError: label's dimensions of AccuracyOp must be 2. " + "But received label's dimensions = %d, label's shape = [%s]", + label_dim.size(), + label_dim)); + if (config.is_runtime) { + PADDLE_ENFORCE_EQ(label_dim[1], + 1, + phi::errors::InvalidArgument( + "ShapeError: label's second dimension of " + "AccuracyOp must be 1. But received label's " + "second dimension is = %d, label's shape = [%s]", + label_dim[1], + label_dim)); + PADDLE_ENFORCE_EQ( + inference_dim[0], + label_dim[0], + phi::errors::InvalidArgument( + "ShapeError: the output's num_rows of AccuracyOp must be" + " the same as label's num_rows. But received output's " + "shape = [%s], label's shape = [%s], output's num_rows = %d, " + "label's " + "num_rows = %d", + inference_dim, + label_dim, + inference_dim[0], + label_dim[0])); + } + + accuracy->set_dims({1}); + accuracy->set_dtype(out.dtype()); + correct->set_dims({1}); + correct->set_dtype(out.dtype()); + total->set_dims({1}); + total->set_dtype(out.dtype()); + accuracy->share_lod(out); +} + void AddmmInferMeta(const MetaTensor& input, const MetaTensor& x, const MetaTensor& y, @@ -89,6 +141,107 @@ void AddmmInferMeta(const MetaTensor& input, out->set_dtype(input.dtype()); } +void GraphSendRecvInferMeta(const MetaTensor& x, + const MetaTensor& src_index, + const MetaTensor& dst_index, + const std::string& pool_type, + MetaTensor* out, + MetaTensor* dst_count) { + auto src_index_dims = src_index.dims(); + if (src_index_dims.size() == 2) { + PADDLE_ENFORCE_EQ(src_index_dims[1], + 1, + phi::errors::InvalidArgument( + "The last dim of Src_index should be 1 when it " + "is 2D, but we get %d", + src_index_dims[1])); + } else { + PADDLE_ENFORCE_EQ( + src_index_dims.size(), + 1, + phi::errors::InvalidArgument( + "The Src_index should be 1D, when it is not 2D, but we get %d", + src_index_dims.size())); + } + + auto dst_index_dims = dst_index.dims(); + if (dst_index_dims.size() == 2) { + PADDLE_ENFORCE_EQ(dst_index_dims[1], + 1, + phi::errors::InvalidArgument( + "The last dim of Dst_index should be 1 when it " + "is 2D, but we get %d", + dst_index_dims[1])); + } else { + PADDLE_ENFORCE_EQ( + dst_index_dims.size(), + 1, + phi::errors::InvalidArgument("The Dst_index should be 1D, " + "when it is not 2D, but we get %d", + dst_index_dims.size())); + } + + PADDLE_ENFORCE_EQ(src_index_dims[0], + dst_index_dims[0], + phi::errors::InvalidArgument( + "Src_index and Dst_index should have the same shape.")); + + auto dims = x.dims(); + out->set_dims(dims); + out->set_dtype(x.dtype()); + + if (pool_type == "MEAN") { + dst_count->set_dims({dims[0]}); + dst_count->set_dtype(DataType::INT32); + } +} + +void LerpInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& weight, + MetaTensor* out) { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + auto w_dims = weight.dims(); + DDim out_dims; + out_dims = funcs::GetOutputDims(x_dims, y_dims); + if (w_dims.size() > 1 || w_dims[0] != 1) { + out_dims = funcs::GetOutputDims(out_dims, w_dims); + } + out->set_dims(out_dims); + out->set_dtype(x.dtype()); + out->share_lod(x); +} + +void 
LinspaceInferMeta(const MetaTensor& start, + const MetaTensor& stop, + const MetaTensor& number, + MetaTensor* out) { + auto s_dims = start.dims(); + PADDLE_ENFORCE_EQ( + (s_dims.size() == 1) && (s_dims[0] == 1), + true, + phi::errors::InvalidArgument("The shape of Input(Start) must be [1]," + "but received input shape is [%s].", + s_dims)); + auto e_dims = stop.dims(); + PADDLE_ENFORCE_EQ( + (e_dims.size() == 1) && (e_dims[0] == 1), + true, + phi::errors::InvalidArgument("The shape of Input(Stop) must be [1]," + "but received input shape is [%s].", + e_dims)); + auto step_dims = number.dims(); + PADDLE_ENFORCE_EQ( + (step_dims.size() == 1) && (step_dims[0] == 1), + true, + phi::errors::InvalidArgument("The shape of Input(Num) must be [1]," + "but received input shape is [%s].", + step_dims)); + out->set_dims(phi::make_ddim({-1})); + out->set_dtype(start.dtype()); +} + void NllLossRawInferMeta(const MetaTensor& input, const MetaTensor& label, paddle::optional weight, @@ -169,6 +322,158 @@ void NllLossRawInferMeta(const MetaTensor& input, total_weight->set_dtype(input.dtype()); } +void RoiAlignInferMeta(const MetaTensor& x, + const MetaTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + int sampling_ratio, + bool aligned, + MetaTensor* out, + MetaConfig config) { + auto input_dims = x.dims(); + auto boxes_dims = boxes.dims(); + + if (boxes_num) { + auto boxes_num_dims = boxes_num->dims(); + PADDLE_ENFORCE_EQ( + boxes_num_dims.size(), + 1, + phi::errors::InvalidArgument("The size of boxes_num should be 1" + ", but received size = %d", + boxes_num_dims.size())); + } + PADDLE_ENFORCE_EQ(input_dims.size(), + 4, + phi::errors::InvalidArgument( + "The format of Input(x) in" + "RoiAlignOp is NCHW. And the rank of input must be 4. " + "But received rank = %d", + input_dims.size())); + PADDLE_ENFORCE_EQ(boxes_dims.size(), + 2, + phi::errors::InvalidArgument("The rank of Input(boxes) " + "in RoiAlignOp should be 2. " + "But the rank of boxes is %d", + boxes_dims.size())); + if (config.is_runtime) { + PADDLE_ENFORCE_EQ(boxes_dims[1], + 4, + phi::errors::InvalidArgument( + "The second dimension " + "of Input(boxes) should be 4. But received the " + "dimension = %d", + boxes_dims[1])); + } + + PADDLE_ENFORCE_GT(pooled_height, + 0, + phi::errors::InvalidArgument( + "The 'pooled_height' attribute in RoiAlignOp is " + "invalid. The height must be greater than 0. But " + "received 'pooled_height' = %d", + pooled_height)); + PADDLE_ENFORCE_GT(pooled_width, + 0, + phi::errors::InvalidArgument( + "The 'pooled_width' attribute in RoiAlignOp is " + "invalid. The width must be greater than 0. But " + "received 'pooled_width' = %d", + pooled_width)); + PADDLE_ENFORCE_GT(spatial_scale, + 0.0f, + phi::errors::InvalidArgument( + "The 'spatial_scale' attribute in RoiAlignOp is " + "invalid. The scale must be greater than 0. 
But " + "received 'spatial_scale' = %f", + spatial_scale)); + + auto out_dims = input_dims; + out_dims[0] = boxes_dims[0]; + out_dims[1] = input_dims[1]; + out_dims[2] = pooled_height; + out_dims[3] = pooled_width; + + out->set_dims(out_dims); + out->set_dtype(x.dtype()); +} + +void RoiPoolInferMeta(const MetaTensor& x, + const MetaTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + MetaTensor* out, + MetaTensor* arg_max) { + auto input_dims = x.dims(); + auto boxes_dims = boxes.dims(); + + if (boxes_num) { + auto boxes_num_dims = boxes_num->dims(); + PADDLE_ENFORCE_EQ( + boxes_num_dims.size(), + 1, + phi::errors::InvalidArgument("The second dimension of boxes_num should " + "be 1, but received dimension is %d", + boxes_num_dims.size())); + } + PADDLE_ENFORCE_EQ(input_dims.size(), + 4, + phi::errors::InvalidArgument( + "The input data should be a four-dimensional " + "tensor with [N,C,H,W], but received input data with " + " %d dimension", + input_dims.size())); + PADDLE_ENFORCE_EQ( + boxes_dims.size(), + 2, + phi::errors::InvalidArgument( + "boxes should be a 2-D LoDTensor with shape (num_boxes, 4)" + "given as [[x1, y1, x2, y2], ...], but received boxes is " + "%d-dimensional LoDTensor", + boxes_dims.size())); + PADDLE_ENFORCE_EQ( + boxes_dims[1], + 4, + phi::errors::InvalidArgument( + "boxes should be a 2-D LoDTensor with shape (num_boxes, 4)" + "given as [[x1, y1, x2, y2], ...]. But the second dimension of " + "the received data is %d", + boxes_dims[1])); + + PADDLE_ENFORCE_GT( + pooled_height, + 0, + phi::errors::OutOfRange("The pooled output height must be greater than 0" + "but received height is %d", + pooled_height)); + PADDLE_ENFORCE_GT( + pooled_width, + 0, + phi::errors::OutOfRange("The pooled output width must be greater than 0" + "but received width is %d", + pooled_width)); + PADDLE_ENFORCE_GT( + spatial_scale, + 0.0f, + phi::errors::OutOfRange("The spatial scale must be greater than 0, " + "but received spatial scale is %f", + spatial_scale)); + + auto out_dims = input_dims; + out_dims[0] = boxes_dims[0]; + out_dims[1] = input_dims[1]; + out_dims[2] = pooled_height; + out_dims[3] = pooled_width; + + out->set_dims(out_dims); + out->set_dtype(x.dtype()); + arg_max->set_dims(out_dims); + arg_max->set_dtype(DataType::INT64); +} + void ScatterInferMeta(const MetaTensor& x, const MetaTensor& index, const MetaTensor& updates, @@ -319,156 +624,4 @@ void ViterbiDecodeInferMeta(const MetaTensor& input, scores->set_dtype(length.dtype()); } -void LerpInferMeta(const MetaTensor& x, - const MetaTensor& y, - const MetaTensor& weight, - MetaTensor* out) { - auto x_dims = x.dims(); - auto y_dims = y.dims(); - auto w_dims = weight.dims(); - DDim out_dims; - out_dims = funcs::GetOutputDims(x_dims, y_dims); - if (w_dims.size() > 1 || w_dims[0] != 1) { - out_dims = funcs::GetOutputDims(out_dims, w_dims); - } - out->set_dims(out_dims); - out->set_dtype(x.dtype()); - out->share_lod(x); -} - -void LinspaceInferMeta(const MetaTensor& start, - const MetaTensor& stop, - const MetaTensor& number, - MetaTensor* out) { - auto s_dims = start.dims(); - PADDLE_ENFORCE_EQ( - (s_dims.size() == 1) && (s_dims[0] == 1), - true, - phi::errors::InvalidArgument("The shape of Input(Start) must be [1]," - "but received input shape is [%s].", - s_dims)); - auto e_dims = stop.dims(); - PADDLE_ENFORCE_EQ( - (e_dims.size() == 1) && (e_dims[0] == 1), - true, - phi::errors::InvalidArgument("The shape of Input(Stop) must be [1]," - "but received input shape is 
[%s].", - e_dims)); - auto step_dims = number.dims(); - PADDLE_ENFORCE_EQ( - (step_dims.size() == 1) && (step_dims[0] == 1), - true, - phi::errors::InvalidArgument("The shape of Input(Num) must be [1]," - "but received input shape is [%s].", - step_dims)); - out->set_dims(phi::make_ddim({-1})); - out->set_dtype(start.dtype()); -} - -void AccuracyInferMeta(const MetaTensor& out, - const MetaTensor& indice, - const MetaTensor& label, - MetaTensor* accuracy, - MetaTensor* correct, - MetaTensor* total, - MetaConfig config) { - auto inference_dim = out.dims(); - auto label_dim = label.dims(); - // Assume indices has same shape as inference, because - // it's the output of topk. - PADDLE_ENFORCE_EQ( - label_dim.size(), - 2, - phi::errors::InvalidArgument( - "ShapeError: label's dimensions of AccuracyOp must be 2. " - "But received label's dimensions = %d, label's shape = [%s]", - label_dim.size(), - label_dim)); - if (config.is_runtime) { - PADDLE_ENFORCE_EQ(label_dim[1], - 1, - phi::errors::InvalidArgument( - "ShapeError: label's second dimension of " - "AccuracyOp must be 1. But received label's " - "second dimension is = %d, label's shape = [%s]", - label_dim[1], - label_dim)); - PADDLE_ENFORCE_EQ( - inference_dim[0], - label_dim[0], - phi::errors::InvalidArgument( - "ShapeError: the output's num_rows of AccuracyOp must be" - " the same as label's num_rows. But received output's " - "shape = [%s], label's shape = [%s], output's num_rows = %d, " - "label's " - "num_rows = %d", - inference_dim, - label_dim, - inference_dim[0], - label_dim[0])); - } - - accuracy->set_dims({1}); - accuracy->set_dtype(out.dtype()); - correct->set_dims({1}); - correct->set_dtype(out.dtype()); - total->set_dims({1}); - total->set_dtype(out.dtype()); - accuracy->share_lod(out); -} - -void GraphSendRecvInferMeta(const MetaTensor& x, - const MetaTensor& src_index, - const MetaTensor& dst_index, - const std::string& pool_type, - MetaTensor* out, - MetaTensor* dst_count) { - auto src_index_dims = src_index.dims(); - if (src_index_dims.size() == 2) { - PADDLE_ENFORCE_EQ(src_index_dims[1], - 1, - phi::errors::InvalidArgument( - "The last dim of Src_index should be 1 when it " - "is 2D, but we get %d", - src_index_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - src_index_dims.size(), - 1, - phi::errors::InvalidArgument( - "The Src_index should be 1D, when it is not 2D, but we get %d", - src_index_dims.size())); - } - - auto dst_index_dims = dst_index.dims(); - if (dst_index_dims.size() == 2) { - PADDLE_ENFORCE_EQ(dst_index_dims[1], - 1, - phi::errors::InvalidArgument( - "The last dim of Dst_index should be 1 when it " - "is 2D, but we get %d", - dst_index_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - dst_index_dims.size(), - 1, - phi::errors::InvalidArgument("The Dst_index should be 1D, " - "when it is not 2D, but we get %d", - dst_index_dims.size())); - } - - PADDLE_ENFORCE_EQ(src_index_dims[0], - dst_index_dims[0], - phi::errors::InvalidArgument( - "Src_index and Dst_index should have the same shape.")); - - auto dims = x.dims(); - out->set_dims(dims); - out->set_dtype(x.dtype()); - - if (pool_type == "MEAN") { - dst_count->set_dims({dims[0]}); - dst_count->set_dtype(DataType::INT32); - } -} } // namespace phi diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index c9a7e78db752f95c7e38857e3f1075a0d672246b..42a0f35dc1d8d6aef13b631d355a4cee951a4ed1 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -30,6 +30,8 @@ namespace phi { // Because functions in this file not 
only can infer shape, but also need // infer lod or other useful data. // +// The InferMeta Functions in this file are arranged in alphabetic order. + void AccuracyInferMeta(const MetaTensor& out, const MetaTensor& indice, const MetaTensor& label, @@ -45,16 +47,22 @@ void AddmmInferMeta(const MetaTensor& input, float beta, MetaTensor* out); -void GatherNdGradInferMeta(const MetaTensor& x, - const MetaTensor& index, - const MetaTensor& out_grad, - MetaTensor* x_grad); +void GraphSendRecvInferMeta(const MetaTensor& x, + const MetaTensor& src_index, + const MetaTensor& dst_index, + const std::string& pool_type, + MetaTensor* out, + MetaTensor* dst_count); -void ScatterInferMeta(const MetaTensor& x, - const MetaTensor& index, - const MetaTensor& updates, - bool overwrite, - MetaTensor* out); +void LerpInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& weight, + MetaTensor* out); + +void LinspaceInferMeta(const MetaTensor& start, + const MetaTensor& stop, + const MetaTensor& number, + MetaTensor* out); void NllLossRawInferMeta(const MetaTensor& input, const MetaTensor& label, @@ -65,6 +73,32 @@ void NllLossRawInferMeta(const MetaTensor& input, MetaTensor* total_weight, MetaConfig config = MetaConfig()); +void RoiAlignInferMeta(const MetaTensor& x, + const MetaTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + int sampling_ratio, + bool aligned, + MetaTensor* out, + MetaConfig config = MetaConfig()); + +void RoiPoolInferMeta(const MetaTensor& x, + const MetaTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + MetaTensor* out, + MetaTensor* arg_max); + +void ScatterInferMeta(const MetaTensor& x, + const MetaTensor& index, + const MetaTensor& updates, + bool overwrite, + MetaTensor* out); + void ScatterNdAddInferMeta(const MetaTensor& x, const MetaTensor& index, const MetaTensor& updates, @@ -78,20 +112,4 @@ void ViterbiDecodeInferMeta(const MetaTensor& input, MetaTensor* path, MetaConfig config = MetaConfig()); -void LerpInferMeta(const MetaTensor& x, - const MetaTensor& y, - const MetaTensor& weight, - MetaTensor* out); - -void LinspaceInferMeta(const MetaTensor& start, - const MetaTensor& stop, - const MetaTensor& number, - MetaTensor* out); - -void GraphSendRecvInferMeta(const MetaTensor& x, - const MetaTensor& src_index, - const MetaTensor& dst_index, - const std::string& pool_type, - MetaTensor* out, - MetaTensor* dst_count); } // namespace phi diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index d6d4efad9fae26e6cbdb914752d8c24ab23d948c..0f51839553158b6dce7ac90006c5c72ee8e3b57b 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -22,10 +22,87 @@ limitations under the License. 
*/ #include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/kernels/funcs/pooling.h" #include "paddle/phi/kernels/funcs/unfold_functor.h" namespace phi { +void ArgMinMaxInferMeta(const MetaTensor& x, + int64_t axis, + bool keepdims, + bool flatten, + int dtype, + MetaTensor* out, + MetaConfig config) { + const auto& x_dims = x.dims(); + + PADDLE_ENFORCE_GE( + axis, + -x_dims.size(), + phi::errors::InvalidArgument("'axis'(%d) must be greater than or equal to" + " -Rank(X)(%d).", + axis, + -x_dims.size())); + PADDLE_ENFORCE_LT(axis, + x_dims.size(), + phi::errors::InvalidArgument( + "'axis'(%d) must be less than Rank(X)(%d) of Input(X).", + axis, + x_dims.size())); + + PADDLE_ENFORCE_EQ( + (dtype < 0 || dtype == 2 || dtype == 3), + true, + phi::errors::InvalidArgument( + "The attribute of dtype in argmin/argmax must be [%s] or [%s], but " + "received [%s]", + paddle::framework::DataTypeToString( + paddle::framework::proto::VarType::INT32), + paddle::framework::DataTypeToString( + paddle::framework::proto::VarType::INT64), + paddle::framework::DataTypeToString( + static_cast(dtype)))); + + auto x_rank = x_dims.size(); + if (axis < 0) axis += x_rank; + if (config.is_runtime) { + if (dtype == paddle::framework::proto::VarType::INT32) { + int64_t all_element_num = 0; + if (flatten) { + all_element_num = phi::product(x_dims); + + } else { + all_element_num = x_dims[axis]; + } + PADDLE_ENFORCE_LE( + all_element_num, + INT_MAX, + phi::errors::InvalidArgument( + "The element num of the argmin/argmax input at axis is " + "%d, is larger than int32 maximum value:%d, you must " + "set the dtype of argmin/argmax to 'int64'.", + all_element_num, + INT_MAX)); + } + } + std::vector vec; + if (flatten) { + vec.emplace_back(static_cast(1)); + } else { + for (int64_t i = 0; i < axis; i++) vec.emplace_back(x_dims[i]); + if (keepdims) { + vec.emplace_back(static_cast(1)); + } + for (int64_t i = axis + 1; i < x_rank; i++) vec.emplace_back(x_dims[i]); + } + out->set_dims(phi::make_ddim(vec)); + if (dtype == 2) { + out->set_dtype(DataType::INT32); + } else if (dtype == 3) { + out->set_dtype(DataType::INT64); + } +} + void ArgsortInferMeta(const MetaTensor& input, int axis, bool descending, @@ -54,96 +131,6 @@ void ArgsortInferMeta(const MetaTensor& input, indices->share_lod(input); } -void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out) { - out->share_meta(x); -} - -// meta x -> out without change, check if axis in range [-Rank(x), Rank(x)-1] -void UnchangedInferMetaCheckAxis(const MetaTensor& x, - int axis, - MetaTensor* out) { - auto rank = x.dims().size(); - PADDLE_ENFORCE_GE( - axis, - -rank, - errors::InvalidArgument( - "Attr(axis) value should be in range [-R, R-1], " - "R is the rank of Input(X). But received axis: %d, R: %d.", - axis, - rank)); - PADDLE_ENFORCE_LT( - axis, - rank, - phi::errors::InvalidArgument( - "Attr(axis) value should be in range [-R, R-1], " - "R is the rank of Input(X). 
But received axis: %d, R: %d.", - axis, - rank)); - out->share_meta(x); -} - -void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out) { - out->set_dims(x.dims()); - out->set_dtype(dtype::ToReal(x.dtype())); - out->set_layout(x.layout()); -} - -void FlattenInferMeta(const MetaTensor& x, - int start_axis, - int stop_axis, - MetaTensor* out) { - auto x_dims = x.dims(); - int in_dims_size = x_dims.size(); - if (start_axis < 0) { - start_axis = start_axis + in_dims_size; - } - if (stop_axis < 0) { - stop_axis = stop_axis + in_dims_size; - } - PADDLE_ENFORCE_GE( - stop_axis, - start_axis, - phi::errors::InvalidArgument("The stop_axis should be greater" - "than or equal to start_axis.")); - - int64_t outer = 1; - std::vector out_shape; - out_shape.reserve(in_dims_size - stop_axis + start_axis); - - for (int i = 0; i < start_axis; ++i) { - out_shape.push_back(x_dims[i]); - } - for (int i = start_axis; i <= stop_axis; i++) { - if (x_dims[i] == -1 || outer == -1) { - outer = -1; - } else { - outer *= x_dims[i]; - } - } - out_shape.push_back(outer); - for (int i = stop_axis + 1; i < in_dims_size; i++) { - out_shape.push_back(x_dims[i]); - } - const auto& out_dims = phi::make_ddim(out_shape); - out->set_dims(out_dims); - out->set_dtype(x.dtype()); - out->set_layout(x.layout()); - - if (x_dims[0] == out_dims[0]) { - // Only pass LoD when the first dimension of output and Input(X) - // are the same. - out->share_lod(x); - } -} - -void GumbelSoftmaxInferMeta(const MetaTensor& x, - float temperature, - bool hard, - int axis, - MetaTensor* out) { - UnchangedInferMetaCheckAxis(x, axis, out); -} - void CastInferMeta(const MetaTensor& x, DataType out_dtype, MetaTensor* out) { out->set_dims(x.dims()); out->set_dtype(out_dtype); @@ -203,73 +190,306 @@ void CumsumInferMeta(const MetaTensor& x, out->share_lod(x); } -void IncrementInferMeta(const MetaTensor& x, float value, MetaTensor* out) { - PADDLE_ENFORCE_EQ( - product(x.dims()), - 1UL, - errors::InvalidArgument("The number of elements in Input(X) should be 1." - "Now the number is %d.", - product(x.dims()))); - out->set_dims(x.dims()); - out->share_lod(x); - out->set_dtype(x.dtype()); -} - -static phi::DDim ValidateShape(const std::vector shape, - const phi::DDim& in_dims) { - const int64_t in_size = phi::product(in_dims); - auto in_dims_vec = phi::vectorize(in_dims); - bool all_positive = std::all_of(in_dims_vec.cbegin(), - in_dims_vec.cend(), - [](int64_t i) { return i > 0; }); - // only one dimension can be set to -1, whose size will be automatically - // infered. - const int64_t unk_dim_val = -1; - const int64_t copy_dim_val = 0; +void DiagInferMeta(const MetaTensor& x, + int offset, + float padding_value, + MetaTensor* out) { + auto x_dims = x.dims(); - std::vector output_shape(shape.size(), 0); - int64_t capacity = 1; - int unk_dim_idx = -1; - for (size_t i = 0; i < shape.size(); ++i) { - if (shape[i] == unk_dim_val) { - PADDLE_ENFORCE_EQ( - unk_dim_idx, - -1, - phi::errors::InvalidArgument( - "Only one dimension value of 'shape' in ReshapeOp can " - "be -1. But received shape = [%s], shape[%d] is also -1.", - phi::make_ddim(shape), - i)); - unk_dim_idx = i; - } else if (shape[i] == copy_dim_val) { - PADDLE_ENFORCE_LT( - static_cast(i), - in_dims.size(), - phi::errors::InvalidArgument( - "The index of 0 in `shape` must be less than " - "the input tensor X's dimensions. 
" - "But received shape = [%s], shape[%d] = 0, X's shape = [%s], " - "X's dimensions = %d.", - phi::make_ddim(shape), - i, - in_dims, - in_dims.size())); + if (x_dims.size() == 1UL) { + int64_t size_ = x_dims[0] + std::abs(offset); + out->set_dims({size_, size_}); + out->set_dtype(x.dtype()); + } else if (x_dims.size() == 2UL) { + int64_t size_ = 0; + if (offset >= 0) { + // Note(LutaoChu): Do not use std::min here, otherwise the calculation + // of `size_` will have unexpected result on Windows Python3.8 + if (x_dims[0] < x_dims[1] - offset) { + size_ = x_dims[0]; + } else { + size_ = x_dims[1] - offset; + } } else { - PADDLE_ENFORCE_GT( - shape[i], - 0, - phi::errors::InvalidArgument( - "Each dimension value of 'shape' in ReshapeOp must not " - "be negative except one unknown dimension. " - "But received shape = [%s], shape[%d] = %d.", - phi::make_ddim(shape), - i, - shape[i])); + // Note(LutaoChu): Do not use std::min here, otherwise the calculation + // of `size_` will have unexpected result on Windows Python3.8 + if (x_dims[0] + offset < x_dims[1]) { + size_ = x_dims[0] + offset; + } else { + size_ = x_dims[1]; + } } - - // NOTE all non-zero values will be converted to True (include negative - // value) - capacity *= (shape[i] ? shape[i] : in_dims[i]); + out->set_dims({size_}); + out->set_dtype(x.dtype()); + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "The input tensor X's dimensions of DiagV2Op should be either 1 or " + "2, but received %d.", + x_dims.size())); + } +} + +void DiagonalInferMeta(const MetaTensor& input, + int offset, + int axis1, + int axis2, + MetaTensor* out) { + auto x_dims = input.dims(); + int offset_ = offset; + int axis1_ = axis1 < 0 ? x_dims.size() + axis1 : axis1; + int axis2_ = axis2 < 0 ? x_dims.size() + axis2 : axis2; + + PADDLE_ENFORCE_GE( + x_dims.size(), + 2, + phi::errors::OutOfRange("Input's dim is out of range (expected at " + "least 2 dimensions, but got %ld).", + x_dims.size())); + PADDLE_ENFORCE_LT( + axis1_, + x_dims.size(), + phi::errors::OutOfRange( + "Attr(axis1) is out of range (expected to be in range of [%ld, " + "%ld], but got %ld).", + -(x_dims.size()), + (x_dims.size() - 1), + axis1)); + PADDLE_ENFORCE_LT( + axis2_, + x_dims.size(), + phi::errors::OutOfRange( + "Attr(axis2) is out of range (expected to be in range of [%ld, " + "%ld], but got %ld).", + -(x_dims.size()), + (x_dims.size() - 1), + axis2)); + PADDLE_ENFORCE_NE( + axis1_, + axis2_, + phi::errors::InvalidArgument("The dimensions should not be identical " + "%d vs %d.", + axis1, + axis2)); + + auto out_dims = vectorize(x_dims); + // from out_dims get the dim size of axis1_. + auto axis1_size = out_dims[axis1_]; + auto axis2_size = out_dims[axis2_]; + // delete two dims by attr axis1 and axis2 from out_dims. + /* example: + out_dim = [2, 3, 4]; + axis1 = 0; + axis2 = 1; + according to the attr of axis1 and axis2, we get: + out_dim = [4]. 
+ */ + out_dims.erase(out_dims.begin() + std::max(axis1_, axis2_)); + out_dims.erase(out_dims.begin() + std::min(axis1_, axis2_)); + + if (offset_ == 0) { + out_dims.push_back(std::min(axis1_size, axis2_size)); + } else if (offset_ > 0) { + if ((axis2_size - offset_) > 0) { + out_dims.push_back(std::min(axis1_size, axis2_size - offset_)); + } else { + out_dims.push_back(0); + } + } else { + if ((axis1_size + offset_) > 0) { + out_dims.push_back(std::min(axis1_size + offset_, axis2_size)); + } else { + out_dims.push_back(0); + } + } + out->set_dims(phi::make_ddim(out_dims)); +} + +void DropoutInferMeta(const MetaTensor& x, MetaTensor* out, MetaTensor* mask) { + auto x_dims = x.dims(); + out->set_dims(x_dims); + out->share_lod(x); + out->set_dtype(x.dtype()); + + if (mask != nullptr) { + mask->set_dims(x_dims); + } +} + +void EighInferMeta(const MetaTensor& x, + const std::string& uplo, + MetaTensor* out_w, + MetaTensor* out_v) { + auto input_dim = x.dims(); + auto rank = input_dim.size(); + + PADDLE_ENFORCE_GE(rank, + 2, + phi::errors::InvalidArgument( + "The Input(X) should have at least 2 dimensions." + "But received a %d dimension tensor.", + rank)); + PADDLE_ENFORCE_EQ( + input_dim[rank - 2], + input_dim[rank - 1], + phi::errors::InvalidArgument( + "Eigh op is designed for square matrix, consequently" + "inner-most 2 dimensions of Input(X) should be symmetric." + "But received X's shape[-2] = %d and shape[-1] = %d.", + input_dim[rank - 2], + input_dim[rank - 1])); + + std::vector values_dim; + + for (auto i = 0; i < rank - 1; i++) { + values_dim.emplace_back(input_dim[i]); + } + out_w->set_dims(phi::make_ddim(values_dim)); + out_v->set_dims(input_dim); +} + +void FlattenInferMeta(const MetaTensor& x, + int start_axis, + int stop_axis, + MetaTensor* out) { + auto x_dims = x.dims(); + int in_dims_size = x_dims.size(); + if (start_axis < 0) { + start_axis = start_axis + in_dims_size; + } + if (stop_axis < 0) { + stop_axis = stop_axis + in_dims_size; + } + PADDLE_ENFORCE_GE( + stop_axis, + start_axis, + phi::errors::InvalidArgument("The stop_axis should be greater" + "than or equal to start_axis.")); + + int64_t outer = 1; + std::vector out_shape; + out_shape.reserve(in_dims_size - stop_axis + start_axis); + + for (int i = 0; i < start_axis; ++i) { + out_shape.push_back(x_dims[i]); + } + for (int i = start_axis; i <= stop_axis; i++) { + if (x_dims[i] == -1 || outer == -1) { + outer = -1; + } else { + outer *= x_dims[i]; + } + } + out_shape.push_back(outer); + for (int i = stop_axis + 1; i < in_dims_size; i++) { + out_shape.push_back(x_dims[i]); + } + const auto& out_dims = phi::make_ddim(out_shape); + out->set_dims(out_dims); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); + + if (x_dims[0] == out_dims[0]) { + // Only pass LoD when the first dimension of output and Input(X) + // are the same. + out->share_lod(x); + } +} + +void GumbelSoftmaxInferMeta(const MetaTensor& x, + float temperature, + bool hard, + int axis, + MetaTensor* out) { + UnchangedInferMetaCheckAxis(x, axis, out); +} + +void HistogramInferMeta( + const MetaTensor& input, int64_t bins, int min, int max, MetaTensor* out) { + PADDLE_ENFORCE_GE(bins, + 1, + phi::errors::InvalidArgument( + "The bins should be greater than or equal to 1." + "But received nbins is %d", + bins)); + PADDLE_ENFORCE_GE( + max, + min, + phi::errors::InvalidArgument("max must be larger or equal to min." 
+ "But received max is %d, min is %d", + max, + min)); + + out->set_dims({bins}); + out->share_lod(input); +} + +void IncrementInferMeta(const MetaTensor& x, float value, MetaTensor* out) { + PADDLE_ENFORCE_EQ( + product(x.dims()), + 1UL, + errors::InvalidArgument("The number of elements in Input(X) should be 1." + "Now the number is %d.", + product(x.dims()))); + out->set_dims(x.dims()); + out->share_lod(x); + out->set_dtype(x.dtype()); +} + +static phi::DDim ValidateShape(const std::vector shape, + const phi::DDim& in_dims) { + const int64_t in_size = phi::product(in_dims); + auto in_dims_vec = phi::vectorize(in_dims); + bool all_positive = std::all_of(in_dims_vec.cbegin(), + in_dims_vec.cend(), + [](int64_t i) { return i > 0; }); + // only one dimension can be set to -1, whose size will be automatically + // infered. + const int64_t unk_dim_val = -1; + const int64_t copy_dim_val = 0; + + std::vector output_shape(shape.size(), 0); + int64_t capacity = 1; + int unk_dim_idx = -1; + for (size_t i = 0; i < shape.size(); ++i) { + if (shape[i] == unk_dim_val) { + PADDLE_ENFORCE_EQ( + unk_dim_idx, + -1, + phi::errors::InvalidArgument( + "Only one dimension value of 'shape' in ReshapeOp can " + "be -1. But received shape = [%s], shape[%d] is also -1.", + phi::make_ddim(shape), + i)); + unk_dim_idx = i; + } else if (shape[i] == copy_dim_val) { + PADDLE_ENFORCE_LT( + static_cast(i), + in_dims.size(), + phi::errors::InvalidArgument( + "The index of 0 in `shape` must be less than " + "the input tensor X's dimensions. " + "But received shape = [%s], shape[%d] = 0, X's shape = [%s], " + "X's dimensions = %d.", + phi::make_ddim(shape), + i, + in_dims, + in_dims.size())); + } else { + PADDLE_ENFORCE_GT( + shape[i], + 0, + phi::errors::InvalidArgument( + "Each dimension value of 'shape' in ReshapeOp must not " + "be negative except one unknown dimension. " + "But received shape = [%s], shape[%d] = %d.", + phi::make_ddim(shape), + i, + shape[i])); + } + + // NOTE all non-zero values will be converted to True (include negative + // value) + capacity *= (shape[i] ? shape[i] : in_dims[i]); output_shape[i] = (shape[i] ? 
static_cast(shape[i]) : in_dims[i]); } @@ -360,10 +580,213 @@ void IsEmptyInferMeta(const MetaTensor& x, MetaTensor* out) { out->set_dtype(DataType::BOOL); } -void MultinomialInferMeta(const MetaTensor& x, - int num_samples, - bool replacement, - MetaTensor* out) { +void IsfiniteInferMeta(const MetaTensor& x, MetaTensor* out) { + out->set_dims(x.dims()); + out->set_dtype(DataType::BOOL); +} + +void KthvalueInferMeta(const MetaTensor& x, + int k, + int axis, + bool keepdim, + MetaTensor* out, + MetaTensor* indices, + MetaConfig config) { + auto input_dims = x.dims(); + const int& dim_size = input_dims.size(); + PADDLE_ENFORCE_LT(axis, + dim_size, + phi::errors::InvalidArgument( + "the axis must be [-%d, %d), but received %d .", + dim_size, + dim_size, + axis)); + PADDLE_ENFORCE_GE(axis, + -dim_size, + phi::errors::InvalidArgument( + "the axis must be [-%d, %d), but received %d .", + dim_size, + dim_size, + axis)); + if (axis < 0) axis += dim_size; + PADDLE_ENFORCE_GE( + k, + 1, + phi::errors::InvalidArgument( + "the k in the kthvalue must >= 1, but received %d .", k)); + PADDLE_ENFORCE_GE( + input_dims.size(), + 1, + phi::errors::InvalidArgument("input of kthvalue must have >= 1d shape")); + if (config.is_runtime) { + PADDLE_ENFORCE_GE( + input_dims[axis], + k, + phi::errors::InvalidArgument( + "input of kthvalue must have >= %d columns in axis of %d", + k, + axis)); + } + std::vector dimvec; + for (int64_t i = 0; i < axis; i++) { + dimvec.emplace_back(input_dims[i]); + } + if (keepdim) { + dimvec.emplace_back(static_cast(1)); + } + for (int64_t i = axis + 1; i < dim_size; i++) { + dimvec.emplace_back(input_dims[i]); + } + DDim dims = phi::make_ddim(dimvec); + out->set_dims(dims); + out->share_lod(x); + out->set_dtype(x.dtype()); + indices->set_dims(dims); + indices->share_lod(x); + indices->set_dtype(x.dtype()); +} + +void MatrixPowerInferMeta(const MetaTensor& x, int n, MetaTensor* out) { + auto dims = x.dims(); + auto n_dim = dims.size(); + PADDLE_ENFORCE_GE(n_dim, + 2, + phi::errors::InvalidArgument( + "The Input(X) should have at least 2 dimensions. 
But " + "received a %d dimension tensor.", + n_dim)); + PADDLE_ENFORCE_EQ(dims[n_dim - 2], + dims[n_dim - 1], + phi::errors::InvalidArgument( + "The inner-most 2 dimensions of Input(X) all should " + "be square matrices " + "But received X's shape[-2] = %d and shape[-1] = %d.", + dims[n_dim - 2], + dims[n_dim - 1])); + out->set_dims(dims); + out->share_lod(x); + out->set_dtype(x.dtype()); +} + +void MaxPoolWithIndexInferMeta(const MetaTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + MetaTensor* out, + MetaTensor* mask, + MetaConfig config) { + std::vector paddings_ = paddings; + std::vector kernel_size_ = kernel_size; + + auto x_dims = x.dims(); + + PADDLE_ENFORCE( + x_dims.size() == 4 || x_dims.size() == 5, + errors::InvalidArgument( + "Pooling intput should be 4-D or 5-D tensor but received %dD-Tensor", + x_dims.size())); + + if (global_pooling) { + kernel_size_.resize(static_cast(x_dims.size()) - 2); + for (size_t i = 0; i < kernel_size_.size(); ++i) { + paddings_[i] = 0; + kernel_size_[i] = static_cast(x_dims[i + 2]); + } + } + + PADDLE_ENFORCE_EQ( + x_dims.size() - kernel_size_.size(), + 2U, + errors::InvalidArgument( + "The input size %d minus the kernel size %d should equal to 2.", + x_dims.size(), + kernel_size_.size())); + PADDLE_ENFORCE_EQ( + kernel_size_.size(), + strides.size(), + errors::InvalidArgument( + "Strides size %d and pooling size %d should be the same.", + strides.size(), + kernel_size_.size())); + PADDLE_ENFORCE_EQ( + kernel_size_.size(), + paddings_.size(), + errors::InvalidArgument( + "Paddings size %d and pooling size %d should be the same.", + paddings_.size(), + kernel_size_.size())); + + std::vector output_shape({x_dims[0], x_dims[1]}); + if (adaptive) { + output_shape.insert( + output_shape.end(), kernel_size_.begin(), kernel_size_.end()); + } else { + for (size_t i = 0; i < kernel_size_.size(); ++i) { + if ((!config.is_runtime) && (x_dims[i + 2] < 0)) { + output_shape.push_back(x_dims[i + 2]); + } else { + output_shape.push_back(funcs::MaxPoolOutputSize( + x_dims[i + 2], kernel_size_[i], paddings_[i], strides[i])); + } + } + } + + out->set_dims(make_ddim(output_shape)); + out->set_dtype(x.dtype()); + + mask->set_dims(make_ddim(output_shape)); + mask->set_dtype(paddle::experimental::CppTypeToDataType::Type()); +} + +void ModeInferMeta(const MetaTensor& x, + int axis, + bool keepdim, + MetaTensor* out, + MetaTensor* indices) { + auto input_dims = x.dims(); + const int& dim_size = input_dims.size(); + PADDLE_ENFORCE_EQ( + (axis < dim_size) && (axis >= (-1 * dim_size)), + true, + errors::InvalidArgument( + "the axis of ModeOp must be [-%d, %d), but you set axis is %d", + dim_size, + dim_size, + axis)); + PADDLE_ENFORCE_GE( + input_dims.size(), + 1, + errors::InvalidArgument("input of ModeOp must have >= 1d shape")); + if (axis < 0) axis += dim_size; + std::vector dimvec; + for (int64_t i = 0; i < axis; i++) { + dimvec.emplace_back(input_dims[i]); + } + if (keepdim) { + dimvec.emplace_back(static_cast(1)); + } + for (int64_t i = axis + 1; i < dim_size; i++) { + dimvec.emplace_back(input_dims[i]); + } + DDim dims = phi::make_ddim(dimvec); + PADDLE_ENFORCE_GE(input_dims.size(), + 1, + errors::InvalidArgument("input shape should >= 1d")); + out->set_dims(dims); + out->share_lod(x); + out->set_dtype(x.dtype()); + + indices->set_dims(dims); + indices->share_lod(x); + indices->set_dtype(x.dtype()); +} + +void MultinomialInferMeta(const MetaTensor& x, + int num_samples, + 
bool replacement, + MetaTensor* out) { auto x_dim = x.dims(); int64_t x_rank = x_dim.size(); PADDLE_ENFORCE_GT(x_rank, @@ -395,124 +818,227 @@ void MultinomialInferMeta(const MetaTensor& x, out->set_dtype(DataType::INT64); } -void TileInferMeta(const MetaTensor& x, - const ScalarArray& repeat_times, +void NormInferMeta(const MetaTensor& x, + int axis, + float epsilon, + bool is_test, MetaTensor* out, - MetaConfig config) { -#define MAX_RANK_SUPPORTED 6 + MetaTensor* norm) { + auto xdim = x.dims(); + out->set_dims(x.dims()); + out->set_dtype(x.dtype()); - auto repeat_times_data = repeat_times.GetData(); - auto x_dims = x.dims(); - if (repeat_times_data.size() == 0) { - repeat_times_data = std::vector(x_dims.size(), -1); + if (is_test == false) { + if (axis < 0) axis = xdim.size() + axis; + xdim[axis] = 1; + norm->set_dims(xdim); + norm->set_dtype(x.dtype()); } +} - PADDLE_ENFORCE_LE( - x_dims.size(), - MAX_RANK_SUPPORTED, - errors::InvalidArgument( - "The rank of the input 'x' for tile op " - "must not be greater than %d, but the value received is %d.", - MAX_RANK_SUPPORTED, - x_dims.size())); - PADDLE_ENFORCE_LE( - repeat_times_data.size(), - MAX_RANK_SUPPORTED, - errors::InvalidArgument( - "The size of the shape of input 'repeat_times' for tile op " - "must not be greater than %d, but the value received is %d.", - MAX_RANK_SUPPORTED, - repeat_times_data.size())); - PADDLE_ENFORCE_GE( - repeat_times_data.size(), - 1, - errors::InvalidArgument( - "The size of the shape of input 'repeat_times' for tile op " - "must be positive integers, but the value received is %d.", - repeat_times_data.size())); - - auto out_rank = - std::max(static_cast(x_dims.size()), repeat_times_data.size()); - std::vector out_shape(out_rank); - auto x_dim_vec = phi::vectorize(x_dims); - if (x_dim_vec.size() > repeat_times_data.size()) { - auto diff = x_dim_vec.size() - repeat_times_data.size(); - repeat_times_data.insert(repeat_times_data.begin(), diff, -1); - } else { - auto diff = repeat_times_data.size() - x_dim_vec.size(); - x_dim_vec.insert(x_dim_vec.begin(), diff, -1); +void PadInferMeta(const MetaTensor& input, + const std::vector& paddings, + float pad_value, + MetaTensor* out, + MetaConfig config) { + auto x_dim = input.dims(); + PADDLE_ENFORCE_EQ( + static_cast(paddings.size()), + x_dim.size() * 2, + phi::errors::InvalidArgument( + "Size of 'paddings' dimension should be equal to 2 * size of " + "Input(X)'s dimension, but received (size of 'paddings' dimension " + "is) %d vs (2 * size of Input(X)'s dimension is) %d.", + static_cast(paddings.size()), + x_dim.size() * 2)); + for (size_t i = 0; i < paddings.size(); ++i) { + PADDLE_ENFORCE_GE(paddings[i], + 0, + phi::errors::InvalidArgument( + "The element of 'paddings' should >= 0, but " + "received %d for index %d.", + paddings[i], + static_cast(i))); } - for (size_t i = 0; i < repeat_times_data.size(); ++i) { - if (x_dim_vec[i] == -1 || repeat_times_data[i] == -1) { - out_shape[i] = -1; + std::vector out_dims(x_dim.size()); + for (int i = 0; i < x_dim.size(); ++i) { + if ((!config.is_runtime) && (x_dim[i] == -1)) { + out_dims[i] = -1; } else { - PADDLE_ENFORCE_GT( - repeat_times_data[i], - 0, - errors::InvalidArgument( - "Every element of the input 'repeat_times' for tile op must be " - "greater than 0, but the value given is %d.", - repeat_times_data[i])); - out_shape[i] = x_dim_vec[i] * repeat_times_data[i]; + out_dims[i] = x_dim[i] + paddings[i * 2] + paddings[i * 2 + 1]; } } - - out->set_dims(phi::make_ddim(out_shape)); - if (out_shape[0] == x_dims[0]) { 
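// [Editor's illustrative sketch -- not part of this diff] The tile shape rule
// that TileInferMeta implements (the function is being moved further down in
// this file) can be exercised in isolation. This is a minimal standalone
// sketch assuming, as in the surrounding code, that -1 marks an unknown
// dimension; the helper name TileOutShape is hypothetical and the validation
// of repeat values performed by the real function is omitted.
#include <cstdint>
#include <vector>

std::vector<int64_t> TileOutShape(std::vector<int64_t> x,
                                  std::vector<int64_t> repeats) {
  // Right-align the two shapes by prepending -1 (unknown) entries to the
  // shorter one.
  if (x.size() < repeats.size()) {
    x.insert(x.begin(), repeats.size() - x.size(), -1);
  } else {
    repeats.insert(repeats.begin(), x.size() - repeats.size(), -1);
  }
  std::vector<int64_t> out(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    // An unknown dimension on either side keeps the output dimension unknown.
    out[i] = (x[i] == -1 || repeats[i] == -1) ? -1 : x[i] * repeats[i];
  }
  return out;  // e.g. x = {2, 3}, repeats = {2, 1, 4} -> {-1, 2, 12}
}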
- out->share_lod(x); + out->set_dims(phi::make_ddim(out_dims)); + if (out_dims[0] == x_dim[0]) { + // Only pass LoD when the first dimension is equal between + // output and input. + out->share_lod(input); } + out->set_dtype(input.dtype()); } -void ReshapeInferMeta(const MetaTensor& x, - const ScalarArray& shape, - MetaTensor* out, - MetaConfig config) { - auto& shape_data = shape.GetData(); - PADDLE_ENFORCE_NOT_NULL(out, - phi::errors::InvalidArgument( - "Output(Out) of ReshapeOp should not be null.")); - if (!config.is_runtime && shape.FromTensor()) { - out->set_dims(phi::make_ddim(shape_data)); - out->share_lod(x); - return; - } - PADDLE_ENFORCE_GT(shape_data.size(), - 0, +void PixelShuffleInferMeta(const MetaTensor& x, + int upscale_factor, + const std::string& data_format, + MetaTensor* out) { + auto input_dims = x.dims(); + PADDLE_ENFORCE_EQ(input_dims.size(), + 4, phi::errors::InvalidArgument( - "The shape's size in ReshapeOp can't be zero.")); - InferMetaFromVecValue(x, shape_data, out); -} + "Input should be a 4-D tensor of format [N, C, H, W] " + "or [N, H, W, C], but got %u.", + input_dims.size())); -void ReshapeWithXShapeInferMeta(const MetaTensor& x, - const ScalarArray& shape, - MetaTensor* xshape, - MetaTensor* out, - MetaConfig config) { - PADDLE_ENFORCE_NOT_NULL( - xshape, - phi::errors::InvalidArgument( - "Output(XShape) of ReshapeOp should not be null.")); - const auto& x_dims = x.dims(); - std::vector xshape_dims(x_dims.size() + 1); - xshape_dims[0] = 0; - for (int i = 0; i < x_dims.size(); ++i) { - xshape_dims[i + 1] = x_dims[i]; + const bool channel_last = (data_format == "NHWC"); + + if (!channel_last) { + PADDLE_ENFORCE_EQ(input_dims[1] % (upscale_factor * upscale_factor), + 0, + phi::errors::InvalidArgument( + "The square of upscale_factor[%u] should divide the " + "number of channel[%u]", + upscale_factor * upscale_factor, + input_dims[1])); + } else { + PADDLE_ENFORCE_EQ(input_dims[3] % (upscale_factor * upscale_factor), + 0, + phi::errors::InvalidArgument( + "The square of upscale_factor[%u] should divide the " + "number of channel[%u]", + upscale_factor * upscale_factor, + input_dims[3])); } - xshape->set_dims(phi::make_ddim(xshape_dims)); - xshape->share_lod(x); - ReshapeInferMeta(x, shape, out, config); + auto output_dims = input_dims; + output_dims[0] = input_dims[0]; + if (!channel_last) { + output_dims[1] = input_dims[1] / (upscale_factor * upscale_factor); + output_dims[2] = input_dims[2] * upscale_factor; + output_dims[3] = input_dims[3] * upscale_factor; + } else { + output_dims[1] = input_dims[1] * upscale_factor; + output_dims[2] = input_dims[2] * upscale_factor; + output_dims[3] = input_dims[3] / (upscale_factor * upscale_factor); + } + out->set_dtype(x.dtype()); + out->set_dims(output_dims); } -/* Why not use SumRawInferMeta directly? 
- Because we need make InferMetaFunction's args follow the design of api.yaml -*/ -void SumInferMeta(const MetaTensor& x, - const std::vector& axis, - DataType dtype, - bool keep_dim, - MetaTensor* out) { - bool reduce_all = false; - SumRawInferMeta(x, axis, keep_dim, reduce_all, dtype, out); +void PoolInferMeta(const MetaTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + MetaTensor* out, + MetaConfig config) { + std::vector paddings_ = paddings; + std::vector kernel_size_ = kernel_size; + + auto x_dims = x.dims(); + PADDLE_ENFORCE_EQ( + x_dims.size() == 4 || x_dims.size() == 5, + true, + errors::InvalidArgument( + "the input of Op(pool) should be 4-D or 5-D Tensor. But " + "received: %u-D Tensor and it's shape is [%s].", + x_dims.size(), + x_dims)); + + PADDLE_ENFORCE_EQ(x_dims.size() - kernel_size_.size(), + 2U, + errors::InvalidArgument( + "the dimension of input minus the size of " + "Attr(kernel_size_) must be euqal to 2 in Op(pool). " + "But received: the dimension of input minus the size " + "of Attr(kernel_size_) is %d, the " + "input's dimension is %d, the shape of input " + "is [%s], the Attr(kernel_size_)'s size is %d, the " + "Attr(kernel_size_) is [%s].", + x_dims.size() - kernel_size_.size(), + x_dims.size(), + x_dims, + kernel_size_.size(), + make_ddim(kernel_size_))); + + PADDLE_ENFORCE_EQ( + kernel_size_.size(), + strides.size(), + errors::InvalidArgument( + "the size of Attr(kernel_size_) and Attr(strides) in " + "Op(pool) must be equal. " + "But received: Attr(kernel_size_)'s size is %d, Attr(strides)'s " + "size is %d, Attr(kernel_size_) is [%s], Attr(strides)is [%s].", + kernel_size_.size(), + strides.size(), + make_ddim(kernel_size_), + make_ddim(strides))); + + // MKL-DNN Kernels are using NCHW order of dims description + // so we ignore data_format consideration for MKL-DNN kernel + const bool channel_last = (config.is_run_mkldnn_kernel == false) && + (data_format == "NHWC" || data_format == "NDHWC"); + + // update paddings if "SAME" or global_pooling + DDim data_dims; + if (channel_last) { + data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1); + } else { + data_dims = slice_ddim(x_dims, 2, x_dims.size()); + } + funcs::UpdatePadding(&paddings_, + global_pooling, + adaptive, + padding_algorithm, + data_dims, + strides, + kernel_size_); + + if (global_pooling) { + funcs::UpdateKernelSize(&kernel_size_, data_dims); + } + + std::vector output_shape; + if (adaptive) { + output_shape.insert( + output_shape.end(), kernel_size_.begin(), kernel_size_.end()); + } else { + for (int i = 0; i < data_dims.size(); ++i) { + if ((!config.is_runtime) && (data_dims[i] < 0)) { + output_shape.push_back(data_dims[i]); + } else { + output_shape.push_back(funcs::PoolOutputSize(data_dims[i], + kernel_size_[i], + paddings_[2 * i], + paddings_[2 * i + 1], + strides[i], + ceil_mode)); + } + } + } + + // output_N = input_N + output_shape.insert(output_shape.begin(), x_dims[0]); + // output_C = input_C + if (channel_last) { + output_shape.push_back(x_dims[x_dims.size() - 1]); + } else { + output_shape.insert(output_shape.begin() + 1, x_dims[1]); + } + + out->set_dims(make_ddim(output_shape)); + out->share_lod(x); + out->set_dtype(x.dtype()); +} + +void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out) { + out->set_dims(x.dims()); + 
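// [Editor's illustrative sketch -- not part of this diff] PoolInferMeta above
// delegates the per-dimension arithmetic to funcs::PoolOutputSize. The helper
// below is a standalone rendition of the conventional pooling output-size
// formula only; the name PoolOutSizeSketch is hypothetical, and corner cases
// handled by the real funcs::PoolOutputSize (such as adaptive pooling) are
// deliberately left out.
#include <cstdint>

int64_t PoolOutSizeSketch(int64_t input_size, int64_t kernel,
                          int64_t pad_before, int64_t pad_after,
                          int64_t stride, bool ceil_mode) {
  int64_t numerator = input_size + pad_before + pad_after - kernel;
  // ceil_mode rounds the division up, so a trailing, partially covered window
  // still contributes an output element.
  if (ceil_mode) {
    numerator += stride - 1;
  }
  // e.g. input_size = 7, kernel = 3, pads = 0, stride = 2 -> 3
  return numerator / stride + 1;
}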
out->set_dtype(dtype::ToReal(x.dtype())); + out->set_layout(x.layout()); } DDim ReduceInferDim(const MetaTensor& x, @@ -584,29 +1110,12 @@ DDim ReduceInferDim(const MetaTensor& x, return out_dim; } -void SumRawInferMeta(const MetaTensor& x, +void ReduceInferMeta(const MetaTensor& x, const std::vector& axis, bool keep_dim, - bool reduce_all, - DataType dtype, MetaTensor* out) { - DDim out_dim = ReduceInferDim(x, axis, keep_dim, reduce_all); - - DataType out_dtype; - if (dtype != DataType::UNDEFINED) { - out_dtype = dtype; - } else { - if (x.dtype() == DataType::BOOL || x.dtype() == DataType::INT32 || - x.dtype() == DataType::INT64) { - out_dtype = DataType::INT64; - } else { - out_dtype = x.dtype(); - } - } - - out->set_dims(out_dim); - out->set_dtype(out_dtype); - out->set_layout(x.layout()); + bool reduce_all = false; + ReduceInferMetaBase(x, axis, keep_dim, reduce_all, out); } void ReduceInferMetaBase(const MetaTensor& x, @@ -620,20 +1129,143 @@ void ReduceInferMetaBase(const MetaTensor& x, out->set_layout(x.layout()); } -void ReduceInferMeta(const MetaTensor& x, - const std::vector& axis, - bool keep_dim, - MetaTensor* out) { - bool reduce_all = false; - ReduceInferMetaBase(x, axis, keep_dim, reduce_all, out); +void ReshapeInferMeta(const MetaTensor& x, + const ScalarArray& shape, + MetaTensor* out, + MetaConfig config) { + auto& shape_data = shape.GetData(); + PADDLE_ENFORCE_NOT_NULL(out, + phi::errors::InvalidArgument( + "Output(Out) of ReshapeOp should not be null.")); + if (!config.is_runtime && shape.FromTensor()) { + out->set_dims(phi::make_ddim(shape_data)); + out->share_lod(x); + return; + } + PADDLE_ENFORCE_GT(shape_data.size(), + 0, + phi::errors::InvalidArgument( + "The shape's size in ReshapeOp can't be zero.")); + InferMetaFromVecValue(x, shape_data, out); } -void TransferLayoutInferMeta(const MetaTensor& x, - DataLayout layout, - MetaTensor* out) { +void ReshapeWithXShapeInferMeta(const MetaTensor& x, + const ScalarArray& shape, + MetaTensor* xshape, + MetaTensor* out, + MetaConfig config) { + PADDLE_ENFORCE_NOT_NULL( + xshape, + phi::errors::InvalidArgument( + "Output(XShape) of ReshapeOp should not be null.")); + const auto& x_dims = x.dims(); + std::vector xshape_dims(x_dims.size() + 1); + xshape_dims[0] = 0; + for (int i = 0; i < x_dims.size(); ++i) { + xshape_dims[i + 1] = x_dims[i]; + } + xshape->set_dims(phi::make_ddim(xshape_dims)); + xshape->share_lod(x); + ReshapeInferMeta(x, shape, out, config); +} + +void RollInferMeta(const MetaTensor& x, + const ScalarArray& shifts, + const std::vector& axis, + MetaTensor* out) { + auto shifts_data = shifts.GetData(); + + if (axis.size() != 0) { + PADDLE_ENFORCE_EQ( + axis.size(), + shifts_data.size(), + phi::errors::InvalidArgument("When dims.size() != 0, dims.size() " + "should be equal to " + "shifts.size(). 
But received " + "dims.size() = %d, shifts.size() = %d", + axis.size(), + shifts_data.size())); + } else { + PADDLE_ENFORCE_EQ( + shifts_data.size(), + 1, + phi::errors::InvalidArgument("When dims.size() == 0, shifts.size() " + "should be equal to 1, But received " + "shifts.size() = %d", + shifts_data.size())); + } + out->set_dims(x.dims()); + out->share_lod(x); out->set_dtype(x.dtype()); - out->set_layout(layout); +} + +void SetValueInferMeta(const MetaTensor& x, MetaTensor* out) { + auto in_dims = x.dims(); + PADDLE_ENFORCE_LT( + in_dims.size(), + 7, + phi::errors::InvalidArgument( + "The rank of input should be less than 7, but received %d.", + in_dims.size())); +} + +void ShapeInferMeta(const MetaTensor& input, MetaTensor* out) { + auto in_dim = input.dims(); + out->set_dims(phi::make_ddim({in_dim.size()})); + out->set_dtype(DataType::INT32); +} + +void ShardIndexInferMeta(const MetaTensor& in, + int index_num, + int nshards, + int shard_id, + int ignore_value, + MetaTensor* out, + MetaConfig config) { + auto x_dims = in.dims(); + PADDLE_ENFORCE_GE( + x_dims.size(), + 2, + phi::errors::InvalidArgument("Rank of Input(X) should be at least 2, " + "but the value given is %d.", + x_dims.size())); + if (config.is_runtime || x_dims[x_dims.size() - 1] > 0) { + PADDLE_ENFORCE_EQ(x_dims[x_dims.size() - 1], + 1U, + phi::errors::InvalidArgument( + "The last dimension of Input(X) should be 1, " + "but the value given is %d.", + x_dims[x_dims.size() - 1])); + } + + out->set_dims(x_dims); + out->share_lod(in); + out->set_dtype(in.dtype()); +} + +void SizeInferMeta(const MetaTensor& input, MetaTensor* out) { + out->set_dtype(DataType::INT64); + out->set_dims({1}); +} + +void SoftmaxInferMeta(const MetaTensor& x, int axis, MetaTensor* out) { + auto dim_x = x.dims(); + auto rank_x = dim_x.size(); + PADDLE_ENFORCE_GE(axis, + -rank_x, + phi::errors::InvalidArgument( + "Attr(axis) value should be in range [-R, R-1], " + "R is the rank of Input(X).")); + PADDLE_ENFORCE_LT(axis, + rank_x, + phi::errors::InvalidArgument( + "Attr(axis) value should be in range [-R, R-1], " + "R is the rank of Input(X).")); + + out->set_dims(x.dims()); + out->set_dtype(x.dtype()); + out->share_lod(x); } void SplitInferMeta(const MetaTensor& x, @@ -767,52 +1399,187 @@ void SplitInferMeta(const MetaTensor& x, } } -void UnbindInferMeta(const MetaTensor& x, - int axis, - std::vector* outs) { - auto in_dims = x.dims(); - std::vector out_dim; - axis = axis < 0 ? in_dims.size() + axis : axis; - for (int i = 0; i < in_dims.size(); ++i) { - if (i != axis) out_dim.push_back(in_dims[i]); - } - auto out_dims = phi::make_ddim(out_dim); +/* Why not use SumRawInferMeta directly? 
+ Because we need make InferMetaFunction's args follow the design of api.yaml +*/ +void SumInferMeta(const MetaTensor& x, + const std::vector& axis, + DataType dtype, + bool keep_dim, + MetaTensor* out) { + bool reduce_all = false; + SumRawInferMeta(x, axis, keep_dim, reduce_all, dtype, out); +} - for (size_t i = 0; i < outs->size(); ++i) { - (*outs)[i].set_dtype(x.dtype()); - (*outs)[i].set_dims(out_dims); - (*outs)[i].set_layout(x.layout()); - (*outs)[i].share_lod(x); +void SumRawInferMeta(const MetaTensor& x, + const std::vector& axis, + bool keep_dim, + bool reduce_all, + DataType dtype, + MetaTensor* out) { + DDim out_dim = ReduceInferDim(x, axis, keep_dim, reduce_all); + + DataType out_dtype; + if (dtype != DataType::UNDEFINED) { + out_dtype = dtype; + } else { + if (x.dtype() == DataType::BOOL || x.dtype() == DataType::INT32 || + x.dtype() == DataType::INT64) { + out_dtype = DataType::INT64; + } else { + out_dtype = x.dtype(); + } } + + out->set_dims(out_dim); + out->set_dtype(out_dtype); + out->set_layout(x.layout()); } -void TraceInferMeta( - const MetaTensor& x, int offset, int axis1, int axis2, MetaTensor* out) { - int dim1 = axis1; - int dim2 = axis2; +void TileInferMeta(const MetaTensor& x, + const ScalarArray& repeat_times, + MetaTensor* out, + MetaConfig config) { +#define MAX_RANK_SUPPORTED 6 + auto repeat_times_data = repeat_times.GetData(); auto x_dims = x.dims(); + if (repeat_times_data.size() == 0) { + repeat_times_data = std::vector(x_dims.size(), -1); + } - int dim1_ = dim1 < 0 ? x_dims.size() + dim1 : dim1; - int dim2_ = dim2 < 0 ? x_dims.size() + dim2 : dim2; - - PADDLE_ENFORCE_GE( + PADDLE_ENFORCE_LE( x_dims.size(), - 2, - phi::errors::OutOfRange( - "Input's dim is out of range (expected at least 2, but got %ld).", + MAX_RANK_SUPPORTED, + errors::InvalidArgument( + "The rank of the input 'x' for tile op " + "must not be greater than %d, but the value received is %d.", + MAX_RANK_SUPPORTED, x_dims.size())); - PADDLE_ENFORCE_LT( - dim1_, - x_dims.size(), - phi::errors::OutOfRange( - "Attr(dim1) is out of range (expected to be in range of [%ld, " - "%ld], but got %ld).", - -(x_dims.size()), - (x_dims.size() - 1), - dim1)); - PADDLE_ENFORCE_LT( - dim2_, + PADDLE_ENFORCE_LE( + repeat_times_data.size(), + MAX_RANK_SUPPORTED, + errors::InvalidArgument( + "The size of the shape of input 'repeat_times' for tile op " + "must not be greater than %d, but the value received is %d.", + MAX_RANK_SUPPORTED, + repeat_times_data.size())); + PADDLE_ENFORCE_GE( + repeat_times_data.size(), + 1, + errors::InvalidArgument( + "The size of the shape of input 'repeat_times' for tile op " + "must be positive integers, but the value received is %d.", + repeat_times_data.size())); + + auto out_rank = + std::max(static_cast(x_dims.size()), repeat_times_data.size()); + std::vector out_shape(out_rank); + auto x_dim_vec = phi::vectorize(x_dims); + if (x_dim_vec.size() > repeat_times_data.size()) { + auto diff = x_dim_vec.size() - repeat_times_data.size(); + repeat_times_data.insert(repeat_times_data.begin(), diff, -1); + } else { + auto diff = repeat_times_data.size() - x_dim_vec.size(); + x_dim_vec.insert(x_dim_vec.begin(), diff, -1); + } + for (size_t i = 0; i < repeat_times_data.size(); ++i) { + if (x_dim_vec[i] == -1 || repeat_times_data[i] == -1) { + out_shape[i] = -1; + } else { + PADDLE_ENFORCE_GT( + repeat_times_data[i], + 0, + errors::InvalidArgument( + "Every element of the input 'repeat_times' for tile op must be " + "greater than 0, but the value given is %d.", + 
repeat_times_data[i])); + out_shape[i] = x_dim_vec[i] * repeat_times_data[i]; + } + } + + out->set_dims(phi::make_ddim(out_shape)); + if (out_shape[0] == x_dims[0]) { + out->share_lod(x); + } +} + +void TopKInferMeta(const MetaTensor& x, + const Scalar& k_scalar, + int axis, + bool largest, + bool sorted, + MetaTensor* out, + MetaTensor* indices, + MetaConfig config) { + auto input_dims = x.dims(); + const int& dim_size = input_dims.size(); + PADDLE_ENFORCE_EQ( + (axis < dim_size) && (axis >= (-1 * dim_size)), + true, + phi::errors::InvalidArgument( + "the axis of topk must be [-%d, %d), but you set axis is %d", + dim_size, + dim_size, + axis)); + + if (axis < 0) axis += dim_size; + + int k = k_scalar.to(); + if (k_scalar.FromTensor()) { + k = -1; + } else { + PADDLE_ENFORCE_EQ(k >= 1, + true, + phi::errors::InvalidArgument( + "the attribute of k in the topk must >= 1 or be a " + "Tensor, but received %d .", + k)); + } + + PADDLE_ENFORCE_GE( + input_dims.size(), + 1, + phi::errors::InvalidArgument("input of topk must have >= 1d shape")); + + phi::DDim dims = input_dims; + + dims[axis] = k; + out->set_dims(dims); + out->share_lod(x); + out->set_dtype(x.dtype()); + indices->set_dims(dims); + indices->share_lod(x); + indices->set_dtype(DataType::INT64); +} + +void TraceInferMeta( + const MetaTensor& x, int offset, int axis1, int axis2, MetaTensor* out) { + int dim1 = axis1; + int dim2 = axis2; + + auto x_dims = x.dims(); + + int dim1_ = dim1 < 0 ? x_dims.size() + dim1 : dim1; + int dim2_ = dim2 < 0 ? x_dims.size() + dim2 : dim2; + + PADDLE_ENFORCE_GE( + x_dims.size(), + 2, + phi::errors::OutOfRange( + "Input's dim is out of range (expected at least 2, but got %ld).", + x_dims.size())); + PADDLE_ENFORCE_LT( + dim1_, + x_dims.size(), + phi::errors::OutOfRange( + "Attr(dim1) is out of range (expected to be in range of [%ld, " + "%ld], but got %ld).", + -(x_dims.size()), + (x_dims.size() - 1), + dim1)); + PADDLE_ENFORCE_LT( + dim2_, x_dims.size(), phi::errors::OutOfRange( "Attr(dim2) is out of range (expected to be in range of [%ld, " @@ -837,116 +1604,150 @@ void TraceInferMeta( sizes.erase(sizes.begin() + std::min(dim1_, dim2_)); } out->set_dims(phi::make_ddim(sizes)); + out->set_dtype(x.dtype()); } -void DiagonalInferMeta(const MetaTensor& input, - int offset, - int axis1, - int axis2, - MetaTensor* out) { - auto x_dims = input.dims(); - int offset_ = offset; - int axis1_ = axis1 < 0 ? x_dims.size() + axis1 : axis1; - int axis2_ = axis2 < 0 ? 
x_dims.size() + axis2 : axis2; +void TransferLayoutInferMeta(const MetaTensor& x, + DataLayout layout, + MetaTensor* out) { + out->set_dims(x.dims()); + out->set_dtype(x.dtype()); + out->set_layout(layout); +} - PADDLE_ENFORCE_GE( - x_dims.size(), - 2, - phi::errors::OutOfRange("Input's dim is out of range (expected at " - "least 2 dimensions, but got %ld).", - x_dims.size())); - PADDLE_ENFORCE_LT( - axis1_, - x_dims.size(), - phi::errors::OutOfRange( - "Attr(axis1) is out of range (expected to be in range of [%ld, " - "%ld], but got %ld).", - -(x_dims.size()), - (x_dims.size() - 1), - axis1)); - PADDLE_ENFORCE_LT( - axis2_, - x_dims.size(), - phi::errors::OutOfRange( - "Attr(axis2) is out of range (expected to be in range of [%ld, " - "%ld], but got %ld).", - -(x_dims.size()), - (x_dims.size() - 1), - axis2)); - PADDLE_ENFORCE_NE( - axis1_, - axis2_, - phi::errors::InvalidArgument("The dimensions should not be identical " - "%d vs %d.", - axis1, - axis2)); +void TransposeInferMeta(const MetaTensor& x, + const std::vector& axis, + MetaTensor* out) { + auto x_dims = x.dims(); + size_t x_rank = x_dims.size(); + size_t axis_size = axis.size(); - auto out_dims = vectorize(x_dims); - // from out_dims get the dim size of axis1_. - auto axis1_size = out_dims[axis1_]; - auto axis2_size = out_dims[axis2_]; - // delete two dims by attr axis1 and axis2 from out_dims. - /* example: - out_dim = [2, 3, 4]; - axis1 = 0; - axis2 = 1; - according to the attr of axis1 and axis2, we get: - out_dim = [4]. - */ - out_dims.erase(out_dims.begin() + std::max(axis1_, axis2_)); - out_dims.erase(out_dims.begin() + std::min(axis1_, axis2_)); + PADDLE_ENFORCE_EQ( + x_rank, + axis_size, + errors::InvalidArgument("The input tensor's dimension " + "should be equal to the axis's size. " + "But received input tensor's dimension is %d, " + "axis's size is %d", + x_rank, + axis_size)); - if (offset_ == 0) { - out_dims.push_back(std::min(axis1_size, axis2_size)); - } else if (offset_ > 0) { - if ((axis2_size - offset_) > 0) { - out_dims.push_back(std::min(axis1_size, axis2_size - offset_)); - } else { - out_dims.push_back(0); - } - } else { - if ((axis1_size + offset_) > 0) { - out_dims.push_back(std::min(axis1_size + offset_, axis2_size)); - } else { - out_dims.push_back(0); - } + std::vector count(axis_size, 0); + for (size_t i = 0; i < axis_size; i++) { + PADDLE_ENFORCE_GE( + axis[i], + 0, + errors::InvalidArgument("The axis should be greater than or equal to 0." + "But received %d of axis[%d]", + axis[i], + i)); + + PADDLE_ENFORCE_EQ( + axis[i] < static_cast(axis_size) && ++count[axis[i]] == 1, + true, + errors::InvalidArgument( + "Each element of Attribute axis should " + "be a unique value range from 0 to (dims - 1), " + "where the dims is the axis's size, " + "unique value means this axis value can appear only once. 
" + "But received axis[%d] is %d, axis_size is %d, " + "count[axis[%d]] is %d", + i, + axis[i], + axis_size, + i, + count[axis[i]])); } - out->set_dims(phi::make_ddim(out_dims)); + + phi::DDim out_dims(x_dims); + for (size_t i = 0; i < axis_size; ++i) { + out_dims[i] = x_dims[axis[i]]; + } + + out->set_dims(out_dims); + out->set_dtype(x.dtype()); } -void UnfoldInferMeta(const MetaTensor& x, - const std::vector& kernel_sizes, - const std::vector& strides, - const std::vector& paddings, - const std::vector& dilations, - MetaTensor* out, - MetaConfig config) { +void UnbindInferMeta(const MetaTensor& x, + int axis, + std::vector* outs) { auto in_dims = x.dims(); - // Only [N, C, H, W] input supported now - PADDLE_ENFORCE_EQ( - in_dims.size(), - 4, - phi::errors::InvalidArgument( - "Input should be 4-D tensor of format [N, C, H, W], but get %u", - in_dims.size())); - PADDLE_ENFORCE_EQ( - in_dims.size() - kernel_sizes.size(), - 2U, - phi::errors::InvalidArgument( - "The dims of X should be larger than that of kernel_sizes " - "by a number of 2, due to the batch size and input channel dim. " - "But recieved dims(X:%u) - dims(kernel_sizes:%u) != 2", - in_dims.size(), - kernel_sizes.size())); - PADDLE_ENFORCE_EQ( - strides.size(), - kernel_sizes.size(), - phi::errors::InvalidArgument( - "The dims of strides should be the same with that of kernel_sizes. " - "But recieved dims(strides: %u) != dims(kernel_sizes: %u).", - strides.size(), - kernel_sizes.size())); - PADDLE_ENFORCE_EQ( + std::vector out_dim; + axis = axis < 0 ? in_dims.size() + axis : axis; + for (int i = 0; i < in_dims.size(); ++i) { + if (i != axis) out_dim.push_back(in_dims[i]); + } + auto out_dims = phi::make_ddim(out_dim); + + for (size_t i = 0; i < outs->size(); ++i) { + (*outs)[i].set_dtype(x.dtype()); + (*outs)[i].set_dims(out_dims); + (*outs)[i].set_layout(x.layout()); + (*outs)[i].share_lod(x); + } +} + +void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out) { + out->share_meta(x); +} + +// meta x -> out without change, check if axis in range [-Rank(x), Rank(x)-1] +void UnchangedInferMetaCheckAxis(const MetaTensor& x, + int axis, + MetaTensor* out) { + auto rank = x.dims().size(); + PADDLE_ENFORCE_GE( + axis, + -rank, + phi::errors::InvalidArgument( + "Attr(axis) value should be in range [-R, R-1], " + "R is the rank of Input(X). But received axis: %d, R: %d.", + axis, + rank)); + PADDLE_ENFORCE_LT( + axis, + rank, + phi::errors::InvalidArgument( + "Attr(axis) value should be in range [-R, R-1], " + "R is the rank of Input(X). But received axis: %d, R: %d.", + axis, + rank)); + out->share_meta(x); +} + +void UnfoldInferMeta(const MetaTensor& x, + const std::vector& kernel_sizes, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + MetaTensor* out, + MetaConfig config) { + auto in_dims = x.dims(); + // Only [N, C, H, W] input supported now + PADDLE_ENFORCE_EQ( + in_dims.size(), + 4, + phi::errors::InvalidArgument( + "Input should be 4-D tensor of format [N, C, H, W], but get %u", + in_dims.size())); + PADDLE_ENFORCE_EQ( + in_dims.size() - kernel_sizes.size(), + 2U, + phi::errors::InvalidArgument( + "The dims of X should be larger than that of kernel_sizes " + "by a number of 2, due to the batch size and input channel dim. 
" + "But recieved dims(X:%u) - dims(kernel_sizes:%u) != 2", + in_dims.size(), + kernel_sizes.size())); + PADDLE_ENFORCE_EQ( + strides.size(), + kernel_sizes.size(), + phi::errors::InvalidArgument( + "The dims of strides should be the same with that of kernel_sizes. " + "But recieved dims(strides: %u) != dims(kernel_sizes: %u).", + strides.size(), + kernel_sizes.size())); + PADDLE_ENFORCE_EQ( paddings.size(), 2 * strides.size(), phi::errors::InvalidArgument( @@ -1072,301 +1873,41 @@ void UnfoldInferMeta(const MetaTensor& x, out->set_dims(phi::make_ddim(out_dims)); } -void DiagInferMeta(const MetaTensor& x, - int offset, - float padding_value, - MetaTensor* out) { +void OneHotRawInferMeta(const MetaTensor& x, + int32_t depth, + DataType dtype, + bool allow_out_of_range, + MetaTensor* out) { auto x_dims = x.dims(); - - if (x_dims.size() == 1UL) { - int64_t size_ = x_dims[0] + std::abs(offset); - out->set_dims({size_, size_}); - out->set_dtype(x.dtype()); - } else if (x_dims.size() == 2UL) { - int64_t size_ = 0; - if (offset >= 0) { - // Note(LutaoChu): Do not use std::min here, otherwise the calculation - // of `size_` will have unexpected result on Windows Python3.8 - if (x_dims[0] < x_dims[1] - offset) { - size_ = x_dims[0]; - } else { - size_ = x_dims[1] - offset; - } - } else { - // Note(LutaoChu): Do not use std::min here, otherwise the calculation - // of `size_` will have unexpected result on Windows Python3.8 - if (x_dims[0] + offset < x_dims[1]) { - size_ = x_dims[0] + offset; - } else { - size_ = x_dims[1]; - } - } - out->set_dims({size_}); - out->set_dtype(x.dtype()); - } else { - PADDLE_THROW(phi::errors::InvalidArgument( - "The input tensor X's dimensions of DiagV2Op should be either 1 or " - "2, but received %d.", - x_dims.size())); - } -} - -void ArgMinMaxInferMeta(const MetaTensor& x, - int64_t axis, - bool keepdims, - bool flatten, - int dtype, - MetaTensor* out, - MetaConfig config) { - const auto& x_dims = x.dims(); - PADDLE_ENFORCE_GE( - axis, - -x_dims.size(), - phi::errors::InvalidArgument("'axis'(%d) must be greater than or equal to" - " -Rank(X)(%d).", - axis, - -x_dims.size())); - PADDLE_ENFORCE_LT(axis, - x_dims.size(), - phi::errors::InvalidArgument( - "'axis'(%d) must be less than Rank(X)(%d) of Input(X).", - axis, - x_dims.size())); - - PADDLE_ENFORCE_EQ( - (dtype < 0 || dtype == 2 || dtype == 3), - true, - phi::errors::InvalidArgument( - "The attribute of dtype in argmin/argmax must be [%s] or [%s], but " - "received [%s]", - paddle::framework::DataTypeToString( - paddle::framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - paddle::framework::proto::VarType::INT64), - paddle::framework::DataTypeToString( - static_cast(dtype)))); - - auto x_rank = x_dims.size(); - if (axis < 0) axis += x_rank; - if (config.is_runtime) { - if (dtype == paddle::framework::proto::VarType::INT32) { - int64_t all_element_num = 0; - if (flatten) { - all_element_num = phi::product(x_dims); - - } else { - all_element_num = x_dims[axis]; - } - PADDLE_ENFORCE_LE( - all_element_num, - INT_MAX, - phi::errors::InvalidArgument( - "The element num of the argmin/argmax input at axis is " - "%d, is larger than int32 maximum value:%d, you must " - "set the dtype of argmin/argmax to 'int64'.", - all_element_num, - INT_MAX)); - } - } - std::vector vec; - if (flatten) { - vec.emplace_back(static_cast(1)); - } else { - for (int64_t i = 0; i < axis; i++) vec.emplace_back(x_dims[i]); - if (keepdims) { - vec.emplace_back(static_cast(1)); - } - for (int64_t i = axis + 1; i < x_rank; 
i++) vec.emplace_back(x_dims[i]); - } - out->set_dims(phi::make_ddim(vec)); - if (dtype == 2) { - out->set_dtype(DataType::INT32); - } else if (dtype == 3) { - out->set_dtype(DataType::INT64); - } -} - -void SizeInferMeta(const MetaTensor& input, MetaTensor* out) { - out->set_dtype(DataType::INT64); - out->set_dims({1}); -} - -void PadInferMeta(const MetaTensor& input, - const std::vector& paddings, - float pad_value, - MetaTensor* out, - MetaConfig config) { - auto x_dim = input.dims(); - PADDLE_ENFORCE_EQ( - static_cast(paddings.size()), - x_dim.size() * 2, - phi::errors::InvalidArgument( - "Size of 'paddings' dimension should be equal to 2 * size of " - "Input(X)'s dimension, but received (size of 'paddings' dimension " - "is) %d vs (2 * size of Input(X)'s dimension is) %d.", - static_cast(paddings.size()), - x_dim.size() * 2)); - for (size_t i = 0; i < paddings.size(); ++i) { - PADDLE_ENFORCE_GE(paddings[i], - 0, - phi::errors::InvalidArgument( - "The element of 'paddings' should >= 0, but " - "received %d for index %d.", - paddings[i], - static_cast(i))); - } - std::vector out_dims(x_dim.size()); - for (int i = 0; i < x_dim.size(); ++i) { - if ((!config.is_runtime) && (x_dim[i] == -1)) { - out_dims[i] = -1; - } else { - out_dims[i] = x_dim[i] + paddings[i * 2] + paddings[i * 2 + 1]; - } - } - out->set_dims(phi::make_ddim(out_dims)); - if (out_dims[0] == x_dim[0]) { - // Only pass LoD when the first dimension is equal between - // output and input. - out->share_lod(input); - } - out->set_dtype(input.dtype()); -} - -void IsfiniteInferMeta(const MetaTensor& x, MetaTensor* out) { - out->set_dims(x.dims()); - out->set_dtype(DataType::BOOL); -} - -void PixelShuffleInferMeta(const MetaTensor& x, - int upscale_factor, - const std::string& data_format, - MetaTensor* out) { - auto input_dims = x.dims(); - PADDLE_ENFORCE_EQ(input_dims.size(), - 4, - phi::errors::InvalidArgument( - "Input should be a 4-D tensor of format [N, C, H, W] " - "or [N, H, W, C], but got %u.", - input_dims.size())); - - const bool channel_last = (data_format == "NHWC"); + x_dims.size(), + 1, + phi::errors::InvalidArgument("Rank of Input(X) should be at least 1.")); - if (!channel_last) { - PADDLE_ENFORCE_EQ(input_dims[1] % (upscale_factor * upscale_factor), - 0, - phi::errors::InvalidArgument( - "The square of upscale_factor[%u] should divide the " - "number of channel[%u]", - upscale_factor * upscale_factor, - input_dims[1])); - } else { - PADDLE_ENFORCE_EQ(input_dims[3] % (upscale_factor * upscale_factor), - 0, - phi::errors::InvalidArgument( - "The square of upscale_factor[%u] should divide the " - "number of channel[%u]", - upscale_factor * upscale_factor, - input_dims[3])); - } - auto output_dims = input_dims; - output_dims[0] = input_dims[0]; - if (!channel_last) { - output_dims[1] = input_dims[1] / (upscale_factor * upscale_factor); - output_dims[2] = input_dims[2] * upscale_factor; - output_dims[3] = input_dims[3] * upscale_factor; - } else { - output_dims[1] = input_dims[1] * upscale_factor; - output_dims[2] = input_dims[2] * upscale_factor; - output_dims[3] = input_dims[3] / (upscale_factor * upscale_factor); - } - out->set_dtype(x.dtype()); - out->set_dims(output_dims); + auto out_dims_vec = phi::vectorize(x_dims); + out_dims_vec.push_back(depth); + auto out_dims = phi::make_ddim(out_dims_vec); + out->set_dims(out_dims); + out->share_lod(x); + out->set_dtype(dtype); } -void TransposeInferMeta(const MetaTensor& x, - const std::vector& axis, - MetaTensor* out) { +void OneHotInferMeta(const MetaTensor& x, + const 
Scalar& depth_t, + MetaTensor* out) { auto x_dims = x.dims(); - size_t x_rank = x_dims.size(); - size_t axis_size = axis.size(); - - PADDLE_ENFORCE_EQ( - x_rank, - axis_size, - errors::InvalidArgument("The input tensor's dimension " - "should be equal to the axis's size. " - "But received input tensor's dimension is %d, " - "axis's size is %d", - x_rank, - axis_size)); - - std::vector count(axis_size, 0); - for (size_t i = 0; i < axis_size; i++) { - PADDLE_ENFORCE_GE( - axis[i], - 0, - errors::InvalidArgument("The axis should be greater than or equal to 0." - "But received %d of axis[%d]", - axis[i], - i)); - - PADDLE_ENFORCE_EQ( - axis[i] < static_cast(axis_size) && ++count[axis[i]] == 1, - true, - errors::InvalidArgument( - "Each element of Attribute axis should " - "be a unique value range from 0 to (dims - 1), " - "where the dims is the axis's size, " - "unique value means this axis value can appear only once. " - "But received axis[%d] is %d, axis_size is %d, " - "count[axis[%d]] is %d", - i, - axis[i], - axis_size, - i, - count[axis[i]])); - } - - phi::DDim out_dims(x_dims); - for (size_t i = 0; i < axis_size; ++i) { - out_dims[i] = x_dims[axis[i]]; - } + PADDLE_ENFORCE_GE( + x_dims.size(), + 1, + phi::errors::InvalidArgument("Rank of Input(X) should be at least 1.")); + int depth = depth_t.to(); + auto out_dims_vec = phi::vectorize(x_dims); + out_dims_vec.push_back(depth); + auto out_dims = phi::make_ddim(out_dims_vec); out->set_dims(out_dims); - out->set_dtype(x.dtype()); -} - -void EighInferMeta(const MetaTensor& x, - const std::string& uplo, - MetaTensor* out_w, - MetaTensor* out_v) { - auto input_dim = x.dims(); - auto rank = input_dim.size(); - - PADDLE_ENFORCE_GE(rank, - 2, - phi::errors::InvalidArgument( - "The Input(X) should have at least 2 dimensions." - "But received a %d dimension tensor.", - rank)); - PADDLE_ENFORCE_EQ( - input_dim[rank - 2], - input_dim[rank - 1], - phi::errors::InvalidArgument( - "Eigh op is designed for square matrix, consequently" - "inner-most 2 dimensions of Input(X) should be symmetric." 
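// Standalone sketch of the shape rule shared by OneHotInferMeta above and
// OneHotRawInferMeta earlier in this hunk (std::vector stands in for
// phi::DDim): the output keeps the input shape and appends `depth` as a
// trailing dimension.
#include <cstdint>
#include <vector>

std::vector<int64_t> OneHotOutShape(std::vector<int64_t> dims, int depth) {
  dims.push_back(depth);  // [d0, d1, ...] -> [d0, d1, ..., depth]
  return dims;            // e.g. labels of shape {8, 16}, depth = 10 -> {8, 16, 10}
}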
- "But received X's shape[-2] = %d and shape[-1] = %d.", - input_dim[rank - 2], - input_dim[rank - 1])); - - std::vector values_dim; - - for (auto i = 0; i < rank - 1; i++) { - values_dim.emplace_back(input_dim[i]); - } - out_w->set_dims(phi::make_ddim(values_dim)); - out_v->set_dims(input_dim); + out->share_lod(x); + out->set_dtype(phi::DataType::FLOAT32); } void WhereIndexInferMeta(const MetaTensor& condition, MetaTensor* out) { @@ -1380,34 +1921,6 @@ void WhereIndexInferMeta(const MetaTensor& condition, MetaTensor* out) { out->set_dtype(DataType::INT64); } -void ShardIndexInferMeta(const MetaTensor& in, - int index_num, - int nshards, - int shard_id, - int ignore_value, - MetaTensor* out, - MetaConfig config) { - auto x_dims = in.dims(); - PADDLE_ENFORCE_GE( - x_dims.size(), - 2, - phi::errors::InvalidArgument("Rank of Input(X) should be at least 2, " - "but the value given is %d.", - x_dims.size())); - if (config.is_runtime || x_dims[x_dims.size() - 1] > 0) { - PADDLE_ENFORCE_EQ(x_dims[x_dims.size() - 1], - 1U, - phi::errors::InvalidArgument( - "The last dimension of Input(X) should be 1, " - "but the value given is %d.", - x_dims[x_dims.size() - 1])); - } - - out->set_dims(x_dims); - out->share_lod(in); - out->set_dtype(in.dtype()); -} - } // namespace phi PD_REGISTER_INFER_META_FN(copy_to, phi::CopyToInferMeta); diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index e8be73e943e09c9794376945cc904fe6f2a3d324..2d51bac995d5142871873dd4a12c22b4bf2de55e 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -31,6 +31,16 @@ class MetaConfig; // NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good. // Because functions in this file not only can infer shape, but also need // infer lod or other useful data. +// +// The InferMeta Functions in this file are arranged in alphabetic order. 
+ +void ArgMinMaxInferMeta(const MetaTensor& x, + int64_t axis, + bool keepdims, + bool flatten, + int dtype, + MetaTensor* out, + MetaConfig config = MetaConfig()); void ArgsortInferMeta(const MetaTensor& input, int axis, @@ -38,26 +48,6 @@ void ArgsortInferMeta(const MetaTensor& input, MetaTensor* output, MetaTensor* indices); -void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out); - -// meta x -> out without change, check if axis in range [-Rank(x), Rank(x)-1] -void UnchangedInferMetaCheckAxis(const MetaTensor& x, - int axis, - MetaTensor* out); - -void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out); - -void FlattenInferMeta(const MetaTensor& x, - int start_axis, - int stop_axis, - MetaTensor* out); - -void GumbelSoftmaxInferMeta(const MetaTensor& x, - float temperature, - bool hard, - int axis, - MetaTensor* out); - void CastInferMeta(const MetaTensor& x, DataType out_dtype, MetaTensor* out); void CholeskyInferMeta(const MetaTensor& x, bool upper, MetaTensor* out); @@ -76,6 +66,34 @@ void CumsumInferMeta(const MetaTensor& x, bool reverse, MetaTensor* out); +void DiagInferMeta(const MetaTensor& x, + int offset, + float padding_value, + MetaTensor* out); + +void DiagonalInferMeta( + const MetaTensor& input, int offset, int axis1, int axis2, MetaTensor* out); + +void DropoutInferMeta(const MetaTensor& x, MetaTensor* out, MetaTensor* mask); + +void EighInferMeta(const MetaTensor& x, + const std::string& uplo, + MetaTensor* out_w, + MetaTensor* out_v); + +void FlattenInferMeta(const MetaTensor& x, + int start_axis, + int stop_axis, + MetaTensor* out); + +void GumbelSoftmaxInferMeta(const MetaTensor& x, + float temperature, + bool hard, + int axis, + MetaTensor* out); +void HistogramInferMeta( + const MetaTensor& input, int64_t bins, int min, int max, MetaTensor* out); + void IncrementInferMeta(const MetaTensor& x, float value, MetaTensor* out); void InferMetaFromVecValue(const MetaTensor& x, @@ -84,32 +102,75 @@ void InferMetaFromVecValue(const MetaTensor& x, void IsEmptyInferMeta(const MetaTensor& x, MetaTensor* out); +void IsfiniteInferMeta(const MetaTensor& input, MetaTensor* out); + +void KthvalueInferMeta(const MetaTensor& x, + int k, + int axis, + bool keepdim, + MetaTensor* out, + MetaTensor* indices, + MetaConfig = MetaConfig()); + +void MatrixPowerInferMeta(const MetaTensor& x, int n, MetaTensor* out); + +void MaxPoolWithIndexInferMeta(const MetaTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + MetaTensor* out, + MetaTensor* mask, + MetaConfig config = MetaConfig()); + +void ModeInferMeta(const MetaTensor& x, + int axis, + bool keepdim, + MetaTensor* out, + MetaTensor* indices); + void MultinomialInferMeta(const MetaTensor& x, int num_samples, bool replacement, MetaTensor* out); +void NormInferMeta(const MetaTensor& x, + int axis, + float epsilon, + bool is_test, + MetaTensor* out, + MetaTensor* norm); -void ReshapeInferMeta(const MetaTensor& x, - const ScalarArray& shape, - MetaTensor* out, - MetaConfig config = MetaConfig()); +void PadInferMeta(const MetaTensor& input, + const std::vector& paddings, + float pad_value, + MetaTensor* out, + MetaConfig config = MetaConfig()); -void ReshapeWithXShapeInferMeta(const MetaTensor& x, - const ScalarArray& shape, - MetaTensor* xshape, - MetaTensor* out, - MetaConfig config = MetaConfig()); +void PixelShuffleInferMeta(const MetaTensor& x, + int upscale_factor, + const std::string& data_format, + MetaTensor* out); -void 
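// Standalone sketch of the shape rule behind the PadInferMeta declaration
// above (its body appears earlier in this hunk): each output extent grows by
// the leading and trailing pad of that dimension, and an unknown extent (-1)
// is left unresolved at compile time.
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<int64_t> PadOutShape(const std::vector<int64_t>& dims,
                                 const std::vector<int>& paddings,
                                 bool is_runtime) {
  std::vector<int64_t> out(dims.size());
  for (std::size_t i = 0; i < dims.size(); ++i) {
    if (!is_runtime && dims[i] == -1) {
      out[i] = -1;  // keep unknown dims unknown until runtime
    } else {
      out[i] = dims[i] + paddings[2 * i] + paddings[2 * i + 1];
    }
  }
  return out;  // e.g. {2, 3} with paddings {0, 0, 1, 2} -> {2, 6}
}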
TileInferMeta(const MetaTensor& x, - const ScalarArray& repeat_times, +void PoolInferMeta(const MetaTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, MetaTensor* out, MetaConfig config = MetaConfig()); -void SumRawInferMeta(const MetaTensor& x, +void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out); + +void ReduceInferMeta(const MetaTensor& x, const std::vector& axis, bool keep_dim, - bool reduce_all, - DataType dtype, MetaTensor* out); void ReduceInferMetaBase(const MetaTensor& x, @@ -118,10 +179,43 @@ void ReduceInferMetaBase(const MetaTensor& x, bool reduce_all, MetaTensor* out); -void ReduceInferMeta(const MetaTensor& x, - const std::vector& axis, - bool keep_dim, - MetaTensor* out); +void ReshapeInferMeta(const MetaTensor& x, + const ScalarArray& shape, + MetaTensor* out, + MetaConfig config = MetaConfig()); + +void ReshapeWithXShapeInferMeta(const MetaTensor& x, + const ScalarArray& shape, + MetaTensor* xshape, + MetaTensor* out, + MetaConfig config = MetaConfig()); + +void RollInferMeta(const MetaTensor& x, + const ScalarArray& shifts, + const std::vector& axis, + MetaTensor* out); + +void SetValueInferMeta(const MetaTensor& x, MetaTensor* out); + +void ShapeInferMeta(const MetaTensor& input, MetaTensor* out); + +void ShardIndexInferMeta(const MetaTensor& in, + int index_num, + int nshards, + int shard_id, + int ignore_value, + MetaTensor* out, + MetaConfig config = MetaConfig()); + +void SizeInferMeta(const MetaTensor& input, MetaTensor* out); + +void SoftmaxInferMeta(const MetaTensor& x, int axis, MetaTensor* out); + +void SplitInferMeta(const MetaTensor& x_meta, + const ScalarArray& num_or_sections, + const Scalar& axis, + std::vector out, + MetaConfig config = MetaConfig()); void SumInferMeta(const MetaTensor& x, const std::vector& axis, @@ -129,21 +223,48 @@ void SumInferMeta(const MetaTensor& x, bool keep_dim, MetaTensor* out); +void SumRawInferMeta(const MetaTensor& x, + const std::vector& axis, + bool keep_dim, + bool reduce_all, + DataType dtype, + MetaTensor* out); + +void TileInferMeta(const MetaTensor& x, + const ScalarArray& repeat_times, + MetaTensor* out, + MetaConfig config = MetaConfig()); + +void TopKInferMeta(const MetaTensor& x, + const Scalar& k_scalar, + int axis, + bool largest, + bool sorted, + MetaTensor* out, + MetaTensor* indices, + MetaConfig config = MetaConfig()); + +void TraceInferMeta( + const MetaTensor& x, int offset, int axis1, int axis2, MetaTensor* out); + void TransferLayoutInferMeta(const MetaTensor& x, DataLayout layout, MetaTensor* out); -void SplitInferMeta(const MetaTensor& x_meta, - const ScalarArray& num_or_sections, - const Scalar& axis, - std::vector out, - MetaConfig config = MetaConfig()); +void TransposeInferMeta(const MetaTensor& x, + const std::vector& axis, + MetaTensor* out); void UnbindInferMeta(const MetaTensor& x, int axis, std::vector* outs); -void TraceInferMeta( - const MetaTensor& x, int offset, int axis1, int axis2, MetaTensor* out); + +void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out); + +// meta x -> out without change, check if axis in range [-Rank(x), Rank(x)-1] +void UnchangedInferMetaCheckAxis(const MetaTensor& x, + int axis, + MetaTensor* out); void UnfoldInferMeta(const MetaTensor& x, const std::vector& kernel_sizes, @@ -153,54 +274,14 @@ void 
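// Standalone sketch of the TileInferMeta shape rule declared above, matching
// the elementwise multiply visible at the top of this hunk. Assumption: the
// input shape and repeat_times have already been padded to a common rank and
// dynamic (-1) dims are handled elsewhere; std::vector stands in for phi::DDim.
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<int64_t> TileOutShape(const std::vector<int64_t>& dims,
                                  const std::vector<int>& repeat_times) {
  std::vector<int64_t> out(dims.size());
  for (std::size_t i = 0; i < dims.size(); ++i) {
    out[i] = dims[i] * repeat_times[i];  // out_shape[i] = x_dim_vec[i] * repeat_times[i]
  }
  return out;  // e.g. {2, 3} tiled by {2, 1} -> {4, 3}
}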
UnfoldInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); -void DiagInferMeta(const MetaTensor& x, - int offset, - float padding_value, - MetaTensor* out); - -void ArgMinMaxInferMeta(const MetaTensor& x, - int64_t axis, - bool keepdims, - bool flatten, - int dtype, - MetaTensor* out, - MetaConfig config = MetaConfig()); - -void SizeInferMeta(const MetaTensor& input, MetaTensor* out); - -void PadInferMeta(const MetaTensor& input, - const std::vector& paddings, - float pad_value, - MetaTensor* out, - MetaConfig config = MetaConfig()); - -void DiagonalInferMeta( - const MetaTensor& input, int offset, int axis1, int axis2, MetaTensor* out); - -void PixelShuffleInferMeta(const MetaTensor& x, - int upscale_factor, - const std::string& data_format, - MetaTensor* out); - -void IsfiniteInferMeta(const MetaTensor& input, MetaTensor* out); - -void TransposeInferMeta(const MetaTensor& x, - const std::vector& axis, +void OneHotRawInferMeta(const MetaTensor& x, + int32_t depth, + DataType dtype, + bool allow_out_of_range, MetaTensor* out); -void EighInferMeta(const MetaTensor& x, - const std::string& uplo, - MetaTensor* out_w, - MetaTensor* out_v); +void OneHotInferMeta(const MetaTensor& x, const Scalar& depth, MetaTensor* out); void WhereIndexInferMeta(const MetaTensor& condition, MetaTensor* out); -void ShardIndexInferMeta(const MetaTensor& in, - int index_num, - int nshards, - int shard_id, - int ignore_value, - MetaTensor* out, - MetaConfig config = MetaConfig()); - } // namespace phi diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 093cb6549797d198ccaaff533357243a51188a74..d140912aa783047ba021be171805adff071bf22b 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -11,7 +11,7 @@ set_property(GLOBAL PROPERTY PHI_KERNELS "") # [ 1. Common kernel compilation dependencies ] set(COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils custom_kernel) -set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col vol2col concat_and_split_functor softmax) +set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col vol2col concat_and_split_functor) # remove this dep after removing fluid deps on tensor creation set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} phi_api_utils) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta) @@ -27,22 +27,33 @@ kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel) # Some kernels depend on some targets that are not commonly used. # These targets are not suitable for common dependencies. # In this case, you need to manually generate them here. 
-set(MANUAL_BUILD_KERNELS math_kernel softmax_kernel softmax_grad_kernel triangular_solve_grad_kernel maxout_kernel maxout_grad_kernel put_along_axis_kernel put_along_axis_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel eigh_kernel segment_pool_kernel segment_pool_grad_kernel matrix_power_kernel matrix_power_grad_kernel) -kernel_library(math_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel copy_kernel) -kernel_library(softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) -kernel_library(softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) -kernel_library(triangular_solve_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_reduce) +set(MANUAL_BUILD_KERNELS eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_kernel + hierarchical_sigmoid_kernel hierarchical_sigmoid_grad_kernel + matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel + put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel + softmax_kernel softmax_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel + triangular_solve_grad_kernel determinant_grad_kernel reduce_kernel) +kernel_library(eigh_kernel DEPS ${COMMON_KERNEL_DEPS} lapack_function) +kernel_library(hierarchical_sigmoid_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_bit_code) +kernel_library(hierarchical_sigmoid_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_bit_code) +kernel_library(gumbel_softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) +kernel_library(gumbel_softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) +kernel_library(reduce_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel) +kernel_library(matrix_power_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) +kernel_library(matrix_power_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) kernel_library(maxout_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting) kernel_library(maxout_grad_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting) +kernel_library(pool_kernel DEPS ${COMMON_KERNEL_DEPS} pooling) kernel_library(put_along_axis_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) kernel_library(put_along_axis_grad_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) -kernel_library(take_along_axis_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) -kernel_library(take_along_axis_grad_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) -kernel_library(matrix_power_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) -kernel_library(matrix_power_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) -kernel_library(eigh_kernel DEPS ${COMMON_KERNEL_DEPS} lapack_function) kernel_library(segment_pool_kernel DEPS ${COMMON_KERNEL_DEPS} segment_pooling) kernel_library(segment_pool_grad_kernel DEPS ${COMMON_KERNEL_DEPS} segment_pooling) +kernel_library(softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) +kernel_library(softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) +kernel_library(take_along_axis_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) +kernel_library(take_along_axis_grad_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) +kernel_library(triangular_solve_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_reduce) +kernel_library(determinant_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) # 4. 
auto parse and build kernel targets by cmake register_kernels(EXCLUDES ${COMMON_BAISC_KERNELS} ${MANUAL_BUILD_KERNELS} DEPS ${COMMON_KERNEL_DEPS} ${COMMON_BAISC_KERNELS} ) diff --git a/paddle/phi/kernels/activation_grad_kernel.h b/paddle/phi/kernels/activation_grad_kernel.h index f34e5710ab7294425bacba4e5d5782859ac5f081..241a80d85ead2d7bb6cd63105feb345c62a29a62 100644 --- a/paddle/phi/kernels/activation_grad_kernel.h +++ b/paddle/phi/kernels/activation_grad_kernel.h @@ -19,37 +19,148 @@ limitations under the License. */ namespace phi { -#define DECLARE_ACTIVATION_GRAD_KERNEL_DepX(name) \ +#define DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(name) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& x, \ const DenseTensor& dout, \ DenseTensor* dx); -#define DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(name) \ +#define DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(name, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx); + +#define DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(name, attr1, attr2) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + float attr1, \ + float attr2, \ + DenseTensor* dx); + +#define DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(name) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& out, \ const DenseTensor& dout, \ DenseTensor* dx); +#define DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(name, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx); + +#define DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(name, attr1, attr2) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + float attr1, \ + float attr2, \ + DenseTensor* dx); + template void ReluDoubleGradKernel(const Context& dev_ctx, const DenseTensor& out, const DenseTensor& ddx, DenseTensor* ddout); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cos); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Tan); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acos); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Sin); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Asin); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Atan); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Sinh); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cosh); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Asinh); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acosh); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Atanh); -DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Relu); +template +void TanhDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + const DenseTensor& dout, + DenseTensor* dout_new, + DenseTensor* ddout); + +template +void TanhTripleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + const DenseTensor& dout, + const DenseTensor& d_ddout, + const DenseTensor& d_dout_new, + DenseTensor* d_out_new, + DenseTensor* d_dout, + DenseTensor* d_ddx); + +template +void LeakyReluDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& ddx, + float alpha, + DenseTensor* ddout); + +template +void EluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + float alpha, + DenseTensor* dx); + +template +void EluDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& dout, + const DenseTensor& ddx, 
+ float alpha, + DenseTensor* dx, + DenseTensor* ddout); + +template +void SigmoidDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + const DenseTensor& dout, + DenseTensor* dout_new, + DenseTensor* ddout); + +template +void SigmoidTripleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + const DenseTensor& dout, + const DenseTensor& d_ddout, + const DenseTensor& d_dout_new, + DenseTensor* d_out_new, + DenseTensor* d_dout, + DenseTensor* d_ddx); + +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Cos); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Tan); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Acos); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Sin); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Asin); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Atan); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Sinh); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Cosh); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Asinh); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Acosh); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Atanh); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Silu); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid); + +DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid); + +DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu, alpha); +DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu, threshold); +DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink, lambda); +DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink, threshold); + +DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, t_min, t_max); + +DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid, slope, offset); } // namespace phi diff --git a/paddle/phi/kernels/activation_kernel.h b/paddle/phi/kernels/activation_kernel.h index bdf8f4363598f8c25e6f128b3f38f13f23005828..dbc63a636edb188e4640fdd02895868034f1dd80 100644 --- a/paddle/phi/kernels/activation_kernel.h +++ b/paddle/phi/kernels/activation_kernel.h @@ -24,6 +24,21 @@ namespace phi { void name##Kernel( \ const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); +#define DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(name, attr) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + float attr, \ + DenseTensor* out); + +#define DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(name, attr1, attr2) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + float attr1, \ + float attr2, \ + DenseTensor* out); + DECLARE_ACTIVATION_KERNEL(Cos) DECLARE_ACTIVATION_KERNEL(Tan) DECLARE_ACTIVATION_KERNEL(Acos) @@ -36,5 +51,18 @@ DECLARE_ACTIVATION_KERNEL(Asinh) DECLARE_ACTIVATION_KERNEL(Acosh) DECLARE_ACTIVATION_KERNEL(Atanh) DECLARE_ACTIVATION_KERNEL(Relu) +DECLARE_ACTIVATION_KERNEL(Tanh) +DECLARE_ACTIVATION_KERNEL(TanhShrink) +DECLARE_ACTIVATION_KERNEL(Silu) +DECLARE_ACTIVATION_KERNEL(Sigmoid) +DECLARE_ACTIVATION_KERNEL(LogSigmoid) + +DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(LeakyRelu, alpha) +DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, threshold) +DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(SoftShrink, lambda) +DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(HardShrink, threshold) +DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Elu, alpha) +DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(BRelu, t_min, t_max) +DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(HardSigmoid, slope, offset) } // namespace phi diff --git a/paddle/phi/kernels/allclose_kernel.h b/paddle/phi/kernels/allclose_kernel.h new file mode 100644 index 
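// Self-contained illustration of the declaration-macro pattern introduced in
// the two headers above (placeholder names and raw pointers, not phi symbols):
// one macro stamps out an identical templated prototype per activation, so a
// new op with a single float attribute needs only one DECLARE_* line.
#define DECLARE_UNARY_KERNEL_WITH_ONE_ATTR(name, attr) \
  template <typename T>                                \
  void name##Kernel(const T* x, T* out, int n, float attr);

// Each line below expands to e.g. LeakyReluKernel<T>(const T*, T*, int, float alpha);
DECLARE_UNARY_KERNEL_WITH_ONE_ATTR(LeakyRelu, alpha)
DECLARE_UNARY_KERNEL_WITH_ONE_ATTR(HardShrink, threshold)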
0000000000000000000000000000000000000000..3f24078b86ca1736411cb929754441d737ee6028 --- /dev/null +++ b/paddle/phi/kernels/allclose_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void AllCloseKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const Scalar& rtol, + const Scalar& atol, + bool equal_nan, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/assign_kernel.cc b/paddle/phi/kernels/assign_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..9faaace69176690f64ff81138844567deceef689 --- /dev/null +++ b/paddle/phi/kernels/assign_kernel.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/assign_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void AssignKernel(const Context& dev_ctx, + paddle::optional x, + DenseTensor* out) { + if (!x.is_initialized()) { + return; + } + auto& x_tensor = *x.get_ptr(); + Copy(dev_ctx, x_tensor, x_tensor.place(), false, out); +} + +// Note: use `const paddle::optional&> x` +// as input if needed +template +void AssignArrayKernel(const Context& dev_ctx, + const std::vector& x, + std::vector out) { + for (size_t i = 0; i < x.size(); ++i) { + AssignKernel(dev_ctx, *x[i], out.at(i)); + } +} + +} // namespace phi + +PD_REGISTER_GENERAL_KERNEL( + assign, CPU, ALL_LAYOUT, phi::AssignKernel, ALL_DTYPE) {} +PD_REGISTER_GENERAL_KERNEL(assign_array, + CPU, + ALL_LAYOUT, + phi::AssignArrayKernel, + ALL_DTYPE) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_GENERAL_KERNEL( + assign, GPU, ALL_LAYOUT, phi::AssignKernel, ALL_DTYPE) {} +PD_REGISTER_GENERAL_KERNEL(assign_array, + GPU, + ALL_LAYOUT, + phi::AssignArrayKernel, + ALL_DTYPE) {} +#endif diff --git a/paddle/phi/kernels/assign_kernel.h b/paddle/phi/kernels/assign_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..7cc06818dc007f859a3a3513d211008cd2a153e6 --- /dev/null +++ b/paddle/phi/kernels/assign_kernel.h @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
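// Standalone sketch of the dispensable-input handling in AssignKernel above
// (std::optional in place of paddle::optional, a plain vector in place of
// DenseTensor): when the optional input is absent the kernel returns without
// touching the output, otherwise it performs a straight copy.
#include <optional>
#include <vector>

void AssignSketch(const std::optional<std::vector<float>>& x,
                  std::vector<float>* out) {
  if (!x.has_value()) {
    return;  // dispensable input: leave `out` untouched
  }
  *out = *x;  // otherwise copy, like Copy(dev_ctx, x_tensor, ..., out) above
}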
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +// In order to be compatible with the `AsDispensable` input in the original +// assign op maker, the input parameter here needs to be dispensable, but +// this looks weird +template +void AssignKernel(const Context& dev_ctx, + paddle::optional x, + DenseTensor* out); + +template +void AssignArrayKernel(const Context& dev_ctx, + const std::vector& x, + std::vector out); + +} // namespace phi diff --git a/paddle/phi/kernels/concat_kernel.h b/paddle/phi/kernels/concat_kernel.h index 4e72159aeca671614ccfe483ec1496f70e6b1d6a..cf83ab9aaabe135573a2887a01166f4a7bd0d5e1 100644 --- a/paddle/phi/kernels/concat_kernel.h +++ b/paddle/phi/kernels/concat_kernel.h @@ -40,7 +40,7 @@ DenseTensor Concat(const Context& dev_ctx, DenseTensor dense_out; MetaTensor meta_out(&dense_out); - ConcatInferMeta(meta_x_ptr, axis.to(), &meta_out, /*is_runtime=*/true); + ConcatInferMeta(meta_x_ptr, axis.to(), &meta_out); ConcatKernel(dev_ctx, x, axis, &dense_out); return dense_out; } diff --git a/paddle/phi/kernels/cpu/activation_grad_kernel.cc b/paddle/phi/kernels/cpu/activation_grad_kernel.cc index fe43ebb816077432ca4e7f678be4591e5d31b6f7..c582261596221f4db8bd03599386082cee909096 100644 --- a/paddle/phi/kernels/cpu/activation_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_grad_kernel.cc @@ -21,71 +21,215 @@ limitations under the License. 
*/ namespace phi { -#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(name, functor_class) \ +#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(name, functor_class) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& x, \ const DenseTensor& dout, \ DenseTensor* dx) { \ - functor_class functor; \ - ActivationGradImpl( \ + funcs::functor_class functor; \ + ActivationGradImpl>( \ dev_ctx, &x, nullptr, &dout, dx, functor); \ } -#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(name, functor_class) \ +#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX( \ + name, functor_class, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationGradImpl>( \ + dev_ctx, &x, nullptr, &dout, dx, functor); \ + } + +#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX( \ + name, functor_class, attr1, attr2) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + float attr1, \ + float attr2, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationGradImpl>( \ + dev_ctx, &x, nullptr, &dout, dx, functor); \ + } + +#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(name, functor_class) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& out, \ const DenseTensor& dout, \ DenseTensor* dx) { \ - functor_class functor; \ - ActivationGradImpl( \ + funcs::functor_class functor; \ + ActivationGradImpl>( \ dev_ctx, nullptr, &out, &dout, dx, functor); \ } -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, funcs::CosGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, funcs::TanGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, funcs::AcosGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, funcs::SinGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, funcs::AsinGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, funcs::AtanGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, funcs::SinhGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, funcs::CoshGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, funcs::AsinhGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, funcs::AcoshGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, funcs::AtanhGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, funcs::ReluGradFunctor); +#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT( \ + name, functor_class, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationGradImpl>( \ + dev_ctx, nullptr, &out, &dout, dx, functor); \ + } + +#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT( \ + name, functor_class, attr1, attr2) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + float attr1, \ + float attr2, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationGradImpl>( \ + dev_ctx, nullptr, &out, 
&dout, dx, functor); \ + } + +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Cos, CosGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Tan, TanGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Acos, AcosGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Sin, SinGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Asin, AsinGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Atan, AtanGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Sinh, SinhGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Cosh, CoshGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Asinh, AsinhGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Acosh, AcoshGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, AtanhGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, TanhShrinkGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, SiluGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, LogSigmoidGradFunctor); + +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, ReluGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, TanhGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid, SigmoidGradFunctor); + +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu, + LeakyReluGradFunctor, + alpha); +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu, + ThresholdedReluGradFunctor, + threshold); +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink, + SoftShrinkGradFunctor, + lambda); +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink, + HardShrinkGradFunctor, + threshold); + +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, + BReluGradFunctor, + t_min, + t_max); + +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid, + HardSigmoidGradFunctor, + slope, + offset); + +template +void EluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + float alpha, + DenseTensor* dx) { + dev_ctx.template Alloc(dx); + + auto x_flatten = + EigenVector::Flatten(GET_DATA_SAFELY(&x, "Input", "X", "elu_grad")); + auto out_flatten = EigenVector::Flatten( + GET_DATA_SAFELY(&out, "Input", "Out", "elu_grad")); + auto dout_flatten = EigenVector::Flatten( + GET_DATA_SAFELY(&dout, "Input", "dOut", "elu_grad")); + auto dx_flatten = + EigenVector::Flatten(GET_DATA_SAFELY(dx, "Output", "dX", "elu_grad")); + auto* place = dev_ctx.eigen_device(); + + if (alpha > 0) { + funcs::ELUGradFunctor functor; + functor.alpha = alpha; + functor(*place, x_flatten, out_flatten, dout_flatten, dx_flatten); + } else { + funcs::ELUGradNegativeAlphaFunctor functor; + functor.alpha = alpha; + functor(*place, x_flatten, out_flatten, dout_flatten, dx_flatten); + } +} } // namespace phi -PD_REGISTER_KERNEL( - cos_grad, CPU, ALL_LAYOUT, phi::CosGradKernel, float, double) {} -PD_REGISTER_KERNEL( - tan_grad, CPU, ALL_LAYOUT, phi::TanGradKernel, float, double) {} -PD_REGISTER_KERNEL( - acos_grad, CPU, ALL_LAYOUT, phi::AcosGradKernel, float, double) {} -PD_REGISTER_KERNEL( - sin_grad, CPU, ALL_LAYOUT, phi::SinGradKernel, float, double) {} -PD_REGISTER_KERNEL( - asin_grad, CPU, ALL_LAYOUT, phi::AsinGradKernel, float, double) {} -PD_REGISTER_KERNEL( - atan_grad, CPU, ALL_LAYOUT, phi::AtanGradKernel, float, double) {} -PD_REGISTER_KERNEL( - sinh_grad, CPU, ALL_LAYOUT, phi::SinhGradKernel, float, double) {} -PD_REGISTER_KERNEL( - cosh_grad, CPU, ALL_LAYOUT, phi::CoshGradKernel, float, double) {} -PD_REGISTER_KERNEL( - asinh_grad, CPU, ALL_LAYOUT, phi::AsinhGradKernel, float, double) {} -PD_REGISTER_KERNEL( - acosh_grad, CPU, 
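// Self-contained illustration of the functor-attribute plumbing the DEFINE_*
// macros above rely on: a functor exposes GetAttrs() as (name, float*) pairs,
// and the generated kernel writes the runtime attribute through that pointer
// before applying the functor. Toy types only; this is not the phi functor
// interface verbatim.
#include <string>
#include <utility>
#include <vector>

struct LeakyReluToyFunctor {
  float alpha = 0.02f;
  std::vector<std::pair<std::string, float*>> GetAttrs() {
    return {{"alpha", &alpha}};
  }
  float operator()(float x) const { return x > 0.f ? x : alpha * x; }
};

float ApplyWithAttr(float x, float alpha) {
  LeakyReluToyFunctor functor;
  auto attrs = functor.GetAttrs();
  *(attrs[0].second) = alpha;  // same "*(attrs[0].second) = attr;" step as the macros
  return functor(x);
}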
ALL_LAYOUT, phi::AcoshGradKernel, float, double) {} -PD_REGISTER_KERNEL( - atanh_grad, CPU, ALL_LAYOUT, phi::AtanhGradKernel, float, double) {} PD_REGISTER_KERNEL( relu_grad, CPU, ALL_LAYOUT, phi::ReluGradKernel, float, double) {} -PD_REGISTER_KERNEL(relu_double_grad, + +#define PD_REGISTER_ACTIVATION_GRAD_KERNEL(name, func) \ + PD_REGISTER_KERNEL(name, CPU, ALL_LAYOUT, phi::func, float, double) {} + +#define PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(name, func) \ + PD_REGISTER_KERNEL( \ + name, CPU, ALL_LAYOUT, phi::func, float, double, phi::dtype::float16) {} + +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sin_grad, SinGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(cos_grad, CosGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(tan_grad, TanGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(acos_grad, AcosGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(asin_grad, AsinGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(atan_grad, AtanGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sinh_grad, SinhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(cosh_grad, CoshGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(asinh_grad, AsinhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(acosh_grad, AcoshGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(atanh_grad, AtanhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_grad, TanhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(brelu_grad, BReluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_grad, LeakyReluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad, + ThresholdedReluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(soft_shrink_grad, SoftShrinkGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_shrink_grad, HardShrinkGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_shrink_grad, TanhShrinkGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(silu_grad, SiluGradKernel) + +PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(relu_double_grad, + ReluDoubleGradKernel) +PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(tanh_double_grad, + TanhDoubleGradKernel) +PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(leaky_relu_double_grad, + LeakyReluDoubleGradKernel) +PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(elu_double_grad, EluDoubleGradKernel) + +PD_REGISTER_KERNEL(tanh_triple_grad, CPU, ALL_LAYOUT, - phi::ReluDoubleGradKernel, + phi::TanhTripleGradKernel, float, double, phi::dtype::float16) {} +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_sigmoid_grad, HardSigmoidGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(logsigmoid_grad, LogSigmoidGradKernel) diff --git a/paddle/phi/kernels/cpu/activation_kernel.cc b/paddle/phi/kernels/cpu/activation_kernel.cc index 51883f25183af7c8013bbfb403404397c8492988..1d7b77ea4445f494105d4c23516f31f349847089 100644 --- a/paddle/phi/kernels/cpu/activation_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_kernel.cc @@ -19,37 +19,102 @@ limitations under the License. 
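// Self-contained illustration of the registration-macro shorthand used above:
// a thin wrapper fixes the common arguments once so each kernel is registered
// with a single line. This is a toy name-to-function registry, not the real
// PD_REGISTER_KERNEL machinery.
#include <functional>
#include <map>
#include <string>

using KernelFn = std::function<void()>;

std::map<std::string, KernelFn>& Registry() {
  static std::map<std::string, KernelFn> r;
  return r;
}

#define REGISTER_CPU_FLOAT_KERNEL(name, fn)  \
  static const bool name##_registered =     \
      (Registry()[#name] = KernelFn(fn), true);

void ReluToy() {}
REGISTER_CPU_FLOAT_KERNEL(relu, ReluToy)  // one line per kernel, boilerplate hidden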
*/ namespace phi { -#define DEFINE_CPU_ACTIVATION_KERNEL(name, functor_class) \ - template \ - void name##Kernel( \ - const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { \ - functor_class functor; \ - ActivationImpl(dev_ctx, x, out, functor); \ +#define DEFINE_CPU_ACTIVATION_KERNEL(name, functor_class) \ + template \ + void name##Kernel( \ + const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { \ + funcs::functor_class functor; \ + ActivationImpl>( \ + dev_ctx, x, out, functor); \ } -DEFINE_CPU_ACTIVATION_KERNEL(Sin, funcs::SinFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Cos, funcs::CosFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Tan, funcs::TanFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Asin, funcs::AsinFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Atan, funcs::AtanFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Acos, funcs::AcosFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Sinh, funcs::SinhFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Cosh, funcs::CoshFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Asinh, funcs::AsinhFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Acosh, funcs::AcoshFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Atanh, funcs::AtanhFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Relu, funcs::ReluCPUFunctor) +#define DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + float attr, \ + DenseTensor* out) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationImpl>( \ + dev_ctx, x, out, functor); \ + } + +#define DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS( \ + name, functor_class, attr1, attr2) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + float attr1, \ + float attr2, \ + DenseTensor* out) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationImpl>( \ + dev_ctx, x, out, functor); \ + } + +DEFINE_CPU_ACTIVATION_KERNEL(Sin, SinFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Cos, CosFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Tan, TanFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Asin, AsinFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Atan, AtanFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Acos, AcosFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Sinh, SinhFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Cosh, CoshFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Asinh, AsinhFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Acosh, AcoshFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Atanh, AtanhFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Relu, ReluCPUFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Tanh, TanhFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(TanhShrink, TanhShrinkFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Silu, SiluFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Sigmoid, SigmoidFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(LogSigmoid, LogSigmoidFunctor) + +DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, LeakyReluFunctor, alpha) +DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, + ThresholdedReluFunctor, + threshold) +DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(HardShrink, HardShrinkFunctor, threshold) +DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, SoftShrinkFunctor, lambda) +DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, ELUFunctor, alpha) + +DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, BReluFunctor, t_min, t_max) +DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid, + HardSigmoidFunctor, + slope, + offset) } // namespace phi -PD_REGISTER_KERNEL(sin, CPU, ALL_LAYOUT, phi::SinKernel, float, double) {} -PD_REGISTER_KERNEL(cos, CPU, ALL_LAYOUT, phi::CosKernel, float, 
double) {} -PD_REGISTER_KERNEL(tan, CPU, ALL_LAYOUT, phi::TanKernel, float, double) {} -PD_REGISTER_KERNEL(acos, CPU, ALL_LAYOUT, phi::AcosKernel, float, double) {} -PD_REGISTER_KERNEL(asin, CPU, ALL_LAYOUT, phi::AsinKernel, float, double) {} -PD_REGISTER_KERNEL(atan, CPU, ALL_LAYOUT, phi::AtanKernel, float, double) {} -PD_REGISTER_KERNEL(sinh, CPU, ALL_LAYOUT, phi::SinhKernel, float, double) {} -PD_REGISTER_KERNEL(cosh, CPU, ALL_LAYOUT, phi::CoshKernel, float, double) {} -PD_REGISTER_KERNEL(asinh, CPU, ALL_LAYOUT, phi::AsinhKernel, float, double) {} -PD_REGISTER_KERNEL(acosh, CPU, ALL_LAYOUT, phi::AcoshKernel, float, double) {} -PD_REGISTER_KERNEL(atanh, CPU, ALL_LAYOUT, phi::AtanhKernel, float, double) {} PD_REGISTER_KERNEL(relu, CPU, ALL_LAYOUT, phi::ReluKernel, float, double) {} + +#define PD_REGISTER_ACTIVATION_KERNEL(name, func) \ + PD_REGISTER_KERNEL(name, CPU, ALL_LAYOUT, phi::func, float, double) {} + +PD_REGISTER_ACTIVATION_KERNEL(sin, SinKernel) +PD_REGISTER_ACTIVATION_KERNEL(cos, CosKernel) +PD_REGISTER_ACTIVATION_KERNEL(tan, TanKernel) +PD_REGISTER_ACTIVATION_KERNEL(acos, AcosKernel) +PD_REGISTER_ACTIVATION_KERNEL(asin, AsinKernel) +PD_REGISTER_ACTIVATION_KERNEL(atan, AtanKernel) +PD_REGISTER_ACTIVATION_KERNEL(sinh, SinhKernel) +PD_REGISTER_ACTIVATION_KERNEL(cosh, CoshKernel) +PD_REGISTER_ACTIVATION_KERNEL(asinh, AsinhKernel) +PD_REGISTER_ACTIVATION_KERNEL(acosh, AcoshKernel) +PD_REGISTER_ACTIVATION_KERNEL(atanh, AtanhKernel) +PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel) +PD_REGISTER_ACTIVATION_KERNEL(brelu, BReluKernel) +PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel) +PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel) +PD_REGISTER_ACTIVATION_KERNEL(hard_shrink, HardShrinkKernel) +PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel) +PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel) +PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel) +PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel) +PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel) +PD_REGISTER_ACTIVATION_KERNEL(logsigmoid, LogSigmoidKernel) +PD_REGISTER_ACTIVATION_KERNEL(hard_sigmoid, HardSigmoidKernel) diff --git a/paddle/phi/kernels/cpu/allclose_kernel.cc b/paddle/phi/kernels/cpu/allclose_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..7ffeadfeed8aaa60c13b651188c5099c949b98ab --- /dev/null +++ b/paddle/phi/kernels/cpu/allclose_kernel.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/allclose_kernel.h" + +#include + +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void AllCloseKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const Scalar& rtol, + const Scalar& atol, + bool equal_nan, + DenseTensor* out) { + PADDLE_ENFORCE_EQ( + rtol.dtype(), + DataType::FLOAT64, + phi::errors::InvalidArgument( + "Input (Rtol) type must be double, but get %s.", rtol.dtype())); + PADDLE_ENFORCE_EQ( + atol.dtype(), + DataType::FLOAT64, + phi::errors::InvalidArgument( + "Input (Atol) type must be double, but get %s.", atol.dtype())); + + auto* in_a = x.data(); + auto* in_b = y.data(); + auto rtol_v = rtol.to(); + auto atol_v = atol.to(); + auto* out_data = dev_ctx.template Alloc(out); + *out_data = true; + + auto num = x.numel(); + for (int64_t i = 0; i < num; ++i) { + const T a = in_a[i], b = in_b[i]; + bool val; + if (std::isnan(a) || std::isnan(b)) { + val = equal_nan && std::isnan(a) == std::isnan(b); + } else { + T left = (a > b ? a - b : b - a); + T right = atol_v + (b > 0 ? rtol_v * b : (-rtol_v) * b); + T diff = (left > right ? left - right : right - left); + val = a == b || left <= right || diff <= 1e-15; + } + *out_data &= val; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + allclose, CPU, ALL_LAYOUT, phi::AllCloseKernel, float, double) { + kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); +} diff --git a/paddle/phi/kernels/cpu/copy_kernel.cc b/paddle/phi/kernels/cpu/copy_kernel.cc index 1af071f23ddc520e6733acdbeec3a0652f4e1d8f..fa11fd05bf1d656a075b996f8688d755b28cc034 100644 --- a/paddle/phi/kernels/cpu/copy_kernel.cc +++ b/paddle/phi/kernels/cpu/copy_kernel.cc @@ -38,7 +38,7 @@ void Copy(const Context& dev_ctx, << src_place; dst->Resize(src.dims()); - auto* dst_ptr = dev_ctx.Alloc(dst, src.dtype()); + auto* dst_ptr = dev_ctx.HostAlloc(dst, src.dtype()); if (src_ptr == dst_ptr) { VLOG(3) << "Skip copy the same data async from " << src_place << " to " diff --git a/paddle/phi/kernels/cpu/cumprod_grad_kernel.cc b/paddle/phi/kernels/cpu/cumprod_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..a25f9650fc50fefa3899da13b55b985c164a394a --- /dev/null +++ b/paddle/phi/kernels/cpu/cumprod_grad_kernel.cc @@ -0,0 +1,113 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/cumprod_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/allocator.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/cumprod.h" +#include "paddle/phi/kernels/funcs/for_range.h" + +// NOTE(@xiongkun): use of IsComplex<> +#include "paddle/fluid/framework/data_type.h" + +namespace phi { +template +void CumprodGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& d_out, + int dim, + DenseTensor* d_x) { + DDim shape = x.dims(); + + auto* d_out_data = d_out.data(); + auto* x_data = x.data(); + auto* out_data = out.data(); + auto* d_x_data = dev_ctx.template Alloc(d_x); + + size_t outer_dim = 1; + size_t mid_dim = 1; + size_t inner_dim = 1; + GetCumprodDimInfo(shape, dim, &outer_dim, &mid_dim, &inner_dim); + size_t numel = outer_dim * mid_dim * inner_dim; + + // deal with complex + const T* x_data_deal; + const T* out_data_deal; + Allocator::AllocationPtr x_conj; + Allocator::AllocationPtr out_conj; + if (paddle::framework::IsComplex::value) { + x_conj = const_cast(dev_ctx.GetAllocator()) + .Allocate(numel * sizeof(T)); + auto* x_data_conj = reinterpret_cast(x_conj->ptr()); + out_conj = const_cast(dev_ctx.GetAllocator()) + .Allocate(numel * sizeof(T)); + auto* out_data_conj = reinterpret_cast(out_conj->ptr()); + + phi::funcs::ForRange for_range_x(dev_ctx, numel); + phi::funcs::ConjFunctor functor_x(x_data, numel, x_data_conj); + for_range_x(functor_x); + + phi::funcs::ForRange for_range_out(dev_ctx, numel); + phi::funcs::ConjFunctor functor_out(out_data, numel, out_data_conj); + for_range_out(functor_out); + + x_data_deal = x_data_conj; + out_data_deal = out_data_conj; + } else { + x_data_deal = x_data; + out_data_deal = out_data; + } + + for (size_t i = 0; i < outer_dim; i++) { + for (size_t k = 0; k < inner_dim; k++) { + for (size_t j = 0; j < mid_dim; j++) { + size_t index = i * mid_dim * inner_dim + j * inner_dim + k; + d_x_data[index] = 0; + for (size_t n = 0; n < mid_dim; n++) { + size_t pos = i * mid_dim * inner_dim + n * inner_dim + k; + T elem; + if (j == 0) { + elem = d_out_data[pos]; + } else { + elem = d_out_data[pos] * out_data_deal[index - inner_dim]; + } + if (pos > index) { + for (size_t m = index + inner_dim; m <= pos; m += inner_dim) { + elem *= x_data_deal[m]; + } + } else if (pos < index) { + elem = static_cast(0); + } + d_x_data[index] += elem; + } + } + } + } +} +} // namespace phi +PD_REGISTER_KERNEL(cumprod_grad, + CPU, + ALL_LAYOUT, + phi::CumprodGradKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/cumprod_kernel.cc b/paddle/phi/kernels/cpu/cumprod_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..aea338027f5bb983788c382982dd2e1ad8db5e9a --- /dev/null +++ b/paddle/phi/kernels/cpu/cumprod_kernel.cc @@ -0,0 +1,65 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/cumprod_kernel.h" + +#include +#include +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/cumprod.h" + +namespace phi { +template +void CumprodKernel(const Context& dev_ctx, + const DenseTensor& input, + int dim, + DenseTensor* out) { + const DenseTensor* x = &input; + auto* x_data = x->data(); + auto* out_data = dev_ctx.template Alloc(out); + DDim shape = x->dims(); + + size_t outer_dim = 1; + size_t mid_dim = 1; + size_t inner_dim = 1; + GetCumprodDimInfo(shape, dim, &outer_dim, &mid_dim, &inner_dim); + + for (size_t i = 0; i < outer_dim; i++) { + for (size_t j = 0; j < mid_dim; j++) { + for (size_t k = 0; k < inner_dim; k++) { + size_t pos = i * mid_dim * inner_dim + j * inner_dim + k; + if (j == 0) { + out_data[pos] = x_data[pos]; + } else { + out_data[pos] = out_data[pos - inner_dim] * x_data[pos]; + } + } + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(cumprod, + CPU, + ALL_LAYOUT, + phi::CumprodKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/deformable_conv_kernel.cc b/paddle/phi/kernels/cpu/deformable_conv_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..0d61f7be68af9cb23363a51065fd06d8b6492bfa --- /dev/null +++ b/paddle/phi/kernels/cpu/deformable_conv_kernel.cc @@ -0,0 +1,146 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
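The forward CumprodKernel above is a running product along the middle axis of the tensor viewed as [outer, mid, inner]; GetCumprodDimInfo is assumed to split the shape into the product of leading dims, the cumprod dim itself, and the product of trailing dims. A minimal standalone sketch of the same indexing:

#include <cstddef>
#include <vector>

// Cumulative product along the "mid" axis of a tensor viewed as
// [outer, mid, inner], mirroring the index arithmetic in CumprodKernel.
void CumprodRef(const std::vector<double>& x, std::vector<double>* out,
                size_t outer, size_t mid, size_t inner) {
  for (size_t i = 0; i < outer; ++i) {
    for (size_t j = 0; j < mid; ++j) {
      for (size_t k = 0; k < inner; ++k) {
        const size_t pos = (i * mid + j) * inner + k;
        (*out)[pos] = (j == 0) ? x[pos] : (*out)[pos - inner] * x[pos];
      }
    }
  }
}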
+ +#include "paddle/phi/kernels/deformable_conv_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/deformable_conv_kernel_impl.h" + +namespace phi { + +template +inline void ModulatedDeformableIm2colCPUKernel( + const int num_kernels, + const T* data_im, + const T* data_offset, + const T* data_mask, + const int height, + const int width, + const int kernel_h, + const int kernel_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, + const int num_channels, + const int deformable_group, + const int height_col, + const int width_col, + T* data_col) { + for (int i = 0; i < num_kernels; i++) { + const int w_col = i % width_col; + const int h_col = (i / width_col) % height_col; + const int b_col = (i / width_col) / height_col % batch_size; + const int c_im = (i / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + + T* data_col_ptr = + data_col + + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + const T* data_im_ptr = + data_im + (b_col * num_channels + c_im) * height * width; + const T* data_offset_ptr = + data_offset + + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * + kernel_w * height_col * width_col; + const T* data_mask_ptr = + data_mask + + (b_col * deformable_group + deformable_group_index) * kernel_h * + kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + + w_col; + const int data_mask_hw_ptr = + ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; + + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + const T mask = data_mask_ptr[data_mask_hw_ptr]; + T val = static_cast(0); + const T h_im = h_in + i * dilation_h + offset_h; + const T w_im = w_in + j * dilation_w + offset_w; + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { + val = + DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val * mask; + data_col_ptr += batch_size * height_col * width_col; + } + } + } +} + +template +void ModulatedDeformableIm2col(const Context& dev_ctx, + const T* data_im, + const T* data_offset, + const T* data_mask, + const std::vector& im_shape, + const std::vector& col_shape, + const std::vector& filter_shape, + const std::vector& paddings, + const std::vector& strides, + const std::vector& dilations, + const int deformable_groups, + T* data_col) { + int channel_per_deformable_group = im_shape[0] / deformable_groups; + int num_kernels = im_shape[0] * col_shape[1] * col_shape[2] * col_shape[3]; + + // get outputs of im2col with offset by bilinear interpolation + ModulatedDeformableIm2colCPUKernel(num_kernels, + data_im, + data_offset, + data_mask, + im_shape[1], + im_shape[2], + filter_shape[2], + filter_shape[3], + paddings[0], + paddings[1], + strides[0], + strides[1], + dilations[0], + dilations[1], + channel_per_deformable_group, + 
col_shape[1], + im_shape[0], + deformable_groups, + col_shape[2], + col_shape[3], + data_col); +} + +} // namespace phi + +PD_REGISTER_KERNEL(deformable_conv, + CPU, + ALL_LAYOUT, + phi::DeformableConvKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/determinant_grad_kernel.cc b/paddle/phi/kernels/cpu/determinant_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..e57d7263f88bfc7b15910f05ec9fa6b45c213e7e --- /dev/null +++ b/paddle/phi/kernels/cpu/determinant_grad_kernel.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/determinant_grad_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/determinant_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(determinant_grad, + CPU, + ALL_LAYOUT, + phi::DeterminantGradKernel, + float, + double) {} diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu b/paddle/phi/kernels/cpu/determinant_kernel.cc similarity index 63% rename from paddle/fluid/operators/reduce_ops/reduce_all_op.cu rename to paddle/phi/kernels/cpu/determinant_kernel.cc index a1f1a228aeb3a20807059a306a2fbff22d4a0bb8..5810e88e92527fac2c643b239661c50df32cd6f9 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu +++ b/paddle/phi/kernels/cpu/determinant_kernel.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/reduce_ops/reduce_all_op.h" +#include "paddle/phi/kernels/determinant_kernel.h" -REGISTER_OP_CUDA_KERNEL( - reduce_all, - ops::ReduceCudaKernel); +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/determinant_kernel_impl.h" + +PD_REGISTER_KERNEL( + determinant, CPU, ALL_LAYOUT, phi::DeterminantKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/diag_grad_kernel.cc b/paddle/phi/kernels/cpu/diag_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..c56b225e2a753f963651f5e3f0a5cf711f5bb8a6 --- /dev/null +++ b/paddle/phi/kernels/cpu/diag_grad_kernel.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
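DmcnIm2colBilinear, called in the im2col loop above, samples the input feature map at the fractional, offset-shifted location (h_im, w_im); its definition lives in the shared deformable_conv impl header and is not part of this hunk. An illustrative bilinear-interpolation sketch of what such a helper computes (name and boundary handling are assumptions):

#include <cmath>

// Bilinear sample at fractional (h, w) from an H x W plane stored row-major;
// out-of-range corners contribute zero.
template <typename T>
T BilinearSampleRef(const T* data, int height, int width, T h, T w) {
  const int h_low = static_cast<int>(std::floor(h));
  const int w_low = static_cast<int>(std::floor(w));
  const T lh = h - h_low, lw = w - w_low;
  const T hh = 1 - lh, hw = 1 - lw;
  auto at = [&](int y, int x) -> T {
    return (y >= 0 && y < height && x >= 0 && x < width)
               ? data[y * width + x]
               : static_cast<T>(0);
  };
  return hh * hw * at(h_low, w_low) + hh * lw * at(h_low, w_low + 1) +
         lh * hw * at(h_low + 1, w_low) + lh * lw * at(h_low + 1, w_low + 1);
}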
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/diag_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/diag_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void DiagGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + int offset, + DenseTensor* x_grad) { + T* dx_data = dev_ctx.template Alloc(x_grad); + const T* dout_data = out_grad.data(); + auto dx_dims = x_grad->dims(); + auto dout_dims = out_grad.dims(); + + if (dx_dims.size() == 1) { + auto dx_length = dx_dims[0]; + int dx_stride = phi::funcs::ComputeStride(0, dx_dims); + + auto dout_stride_0 = phi::funcs::ComputeStride(0, dout_dims); + auto dout_stride_1 = phi::funcs::ComputeStride(1, dout_dims); + dout_data += + (offset >= 0 ? offset * dout_stride_1 : -offset * dout_stride_0); + + for (int i = 0; i < dx_length; i++) { + dx_data[i * dx_stride] = dout_data[i * (dout_stride_0 + dout_stride_1)]; + } + } else { + phi::funcs::SetConstant set_padding_value; + set_padding_value(dev_ctx, x_grad, static_cast(0)); + + int dx_stride_0 = phi::funcs::ComputeStride(0, dx_dims); + int dx_stride_1 = phi::funcs::ComputeStride(1, dx_dims); + auto dout_stride_0 = phi::funcs::ComputeStride(0, dout_dims); + dx_data += (offset >= 0 ? offset * dx_stride_1 : -offset * dx_stride_0); + + auto dout_length = dout_dims[0]; + for (int i = 0; i < dout_length; i++) { + dx_data[i * (dx_stride_0 + dx_stride_1)] = dout_data[i * dout_stride_0]; + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(diag_grad, + CPU, + ALL_LAYOUT, + phi::DiagGradKernel, + phi::dtype::float16, + int, + int64_t, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/diag_kernel.cc b/paddle/phi/kernels/cpu/diag_kernel.cc index d1e0b8e31e78fd74e6a15722546971a3cb72807a..4b060f0372a5bf50d9378239dae635e5723d0c7a 100644 --- a/paddle/phi/kernels/cpu/diag_kernel.cc +++ b/paddle/phi/kernels/cpu/diag_kernel.cc @@ -62,5 +62,12 @@ void DiagKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_KERNEL( - diag, CPU, ALL_LAYOUT, phi::DiagKernel, int, float, double, int64_t) {} +PD_REGISTER_KERNEL(diag, + CPU, + ALL_LAYOUT, + phi::DiagKernel, + phi::dtype::float16, + int, + float, + double, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc index cd513e809fd84ace9b01b50aed537204b2be1684..bf6ec012b24443e877b235e17488725dc0d14151 100644 --- a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc @@ -259,3 +259,20 @@ PD_REGISTER_KERNEL(multiply_triple_grad, phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} +PD_REGISTER_KERNEL(elementwise_fmax_grad, + CPU, + ALL_LAYOUT, + phi::ElementwiseFMaxGradKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL(elementwise_fmin_grad, + CPU, + ALL_LAYOUT, + phi::ElementwiseFMinGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/math_kernel.cc b/paddle/phi/kernels/cpu/elementwise_kernel.cc similarity index 75% rename from paddle/phi/kernels/cpu/math_kernel.cc rename to paddle/phi/kernels/cpu/elementwise_kernel.cc index 250f656926c0536f71e5724eb9df779c1502a673..095d11720ce26622c31e517286d6f656869e62ff 100644 --- a/paddle/phi/kernels/cpu/math_kernel.cc +++ 
b/paddle/phi/kernels/cpu/elementwise_kernel.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,22 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/math_kernel.h" - +#include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/common/scalar.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/cpu/elementwise.h" -#include "paddle/phi/kernels/cpu/reduce.h" -#include "paddle/phi/kernels/funcs/elementwise_base.h" -#include "paddle/phi/kernels/funcs/elementwise_functor.h" -#include "paddle/phi/kernels/funcs/reduce_functor.h" - -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/framework/eigen.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/complex.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" namespace phi { @@ -55,30 +46,6 @@ namespace phi { } \ } -template -void MeanRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -template -void SumRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType out_dtype, - DenseTensor* out) { - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - template void DivideRawKernel(const Context& dev_ctx, const DenseTensor& x, @@ -119,6 +86,25 @@ using complex128 = ::phi::dtype::complex; // NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 // using bfloat16 = ::phi::dtype::bfloat16; + +PD_REGISTER_KERNEL(elementwise_fmax, + CPU, + ALL_LAYOUT, + phi::ElementwiseFMaxKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL(elementwise_fmin, + CPU, + ALL_LAYOUT, + phi::ElementwiseFMinKernel, + float, + double, + int, + int64_t) {} + PD_REGISTER_KERNEL(add_raw, CPU, ALL_LAYOUT, @@ -164,20 +150,3 @@ PD_REGISTER_KERNEL(multiply_raw, complex64, complex128, phi::dtype::bfloat16) {} -PD_REGISTER_KERNEL(sum_raw, - CPU, - ALL_LAYOUT, - phi::SumRawKernel, - bool, - float, - double, - phi::dtype::float16, - int16_t, - int, - int64_t, - complex64, - complex128) { - kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); -} -PD_REGISTER_KERNEL( - mean_raw, CPU, ALL_LAYOUT, phi::MeanRawKernel, float, double, bool) {} diff --git a/paddle/phi/kernels/cpu/gather_grad_kernel.cc b/paddle/phi/kernels/cpu/gather_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..f0a6948018afce277725c50e3cbb0e17ab495a83 --- /dev/null +++ b/paddle/phi/kernels/cpu/gather_grad_kernel.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/gather_grad_kernel.h" + +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/gather.h" +#include "paddle/phi/kernels/funcs/scatter.h" + +namespace phi { + +template +void GatherGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& out_grad, + const Scalar& axis, + bool overwrite, + DenseTensor* x_grad) { + const auto& index_type = index.dtype(); + auto axis_v = axis.to(); + + if (axis_v != 0) { + if (index_type == phi::DataType::INT32) { + phi::funcs::GatherV2GradFunction( + dev_ctx, &out_grad, &index, axis_v, x_grad); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GatherV2GradFunction( + dev_ctx, &out_grad, &index, axis_v, x_grad); + } + return; + } + + dev_ctx.template Alloc(x_grad); + + auto dxt = EigenVector::Flatten(*x_grad); + auto& place = *dev_ctx.eigen_device(); + dxt.device(place) = dxt.constant(static_cast(0)); + if (x_grad->numel() == 0) return; + + if (index_type == phi::DataType::INT32) { + if (overwrite) { + phi::funcs::ScatterAssign(dev_ctx, out_grad, index, x_grad); + } else { + phi::funcs::ScatterAssignAdd( + dev_ctx, out_grad, index, x_grad); + } + } else if (index_type == phi::DataType::INT64) { + if (overwrite) { + phi::funcs::ScatterAssign(dev_ctx, out_grad, index, x_grad); + } else { + phi::funcs::ScatterAssignAdd( + dev_ctx, out_grad, index, x_grad); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(gather_grad, + CPU, + ALL_LAYOUT, + phi::GatherGradKernel, + float, + double, + int, + uint8_t, + int64_t, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/gather_kernel.cc b/paddle/phi/kernels/cpu/gather_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..9207a05b9dcce1daed95a1dbdb99db3c23c5c90d --- /dev/null +++ b/paddle/phi/kernels/cpu/gather_kernel.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
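GatherGradKernel above scatters out_grad back into x_grad: with overwrite the destination rows are overwritten (ScatterAssign), otherwise contributions at duplicate indices accumulate (ScatterAssignAdd). A small standalone sketch of the accumulate case for axis 0 (illustrative; ScatterAddRowsRef is a made-up name):

#include <cstdint>
#include <vector>

// Scatter-add rows of `src` into `dst` at row positions given by `index`;
// duplicate indices accumulate, mirroring the non-overwrite branch above.
void ScatterAddRowsRef(const std::vector<float>& src,
                       const std::vector<int64_t>& index, int64_t row_width,
                       std::vector<float>* dst) {
  for (size_t r = 0; r < index.size(); ++r) {
    for (int64_t c = 0; c < row_width; ++c) {
      (*dst)[index[r] * row_width + c] += src[r * row_width + c];
    }
  }
}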
+ +#include "paddle/phi/kernels/gather_kernel.h" + +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/gather.h" + +namespace phi { + +template +void GatherKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + const Scalar& axis, + DenseTensor* out) { + const auto& index_type = index.dtype(); + auto axis_v = axis.to(); + if (axis_v != 0) { + if (index_type == phi::DataType::INT32) { + phi::funcs::GatherV2Function( + dev_ctx, &x, &index, axis_v, out); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GatherV2Function( + dev_ctx, &x, &index, axis_v, out); + } + return; + } + + dev_ctx.template Alloc(out); + + if (x.numel() == 0) { + return; + } + + if (index_type == phi::DataType::INT32) { + phi::funcs::CPUGather(dev_ctx, x, index, out); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::CPUGather(dev_ctx, x, index, out); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(gather, + CPU, + ALL_LAYOUT, + phi::GatherKernel, + float, + double, + int, + uint8_t, + int64_t, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/gelu_grad_kernel.cc b/paddle/phi/kernels/cpu/gelu_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..254c4ea5716d19c65da6a46748a43db8dbddd52b --- /dev/null +++ b/paddle/phi/kernels/cpu/gelu_grad_kernel.cc @@ -0,0 +1,146 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/gelu_grad_kernel.h" + +#include +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/blas/blas_impl.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/gelu_kernel.h" + +namespace phi { + +template +struct GeluGradFunctor { + template + void operator()(Device d, X x, dOut dout, dX dx, bool approximate) const { + if (approximate) { + if (std::is_same::value) { + VLOG(4) << "cast from float16 to float before computing"; + auto casted_x = x.template cast(); + auto casted_dout = dout.template cast(); + + const float kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); + const float kBeta = + kAlpha * static_cast(GELU_CONSTANT) * static_cast(3); + const auto y = + (kAlpha * + ((static_cast(GELU_CONSTANT) * casted_x.cube()) + casted_x)) + .tanh(); + dx.device(d) = (static_cast(0.5) * casted_dout * + (static_cast(1) + y + + (casted_x - casted_x * y.square()) * + (kAlpha + kBeta * casted_x.square()))) + .template cast(); + } else { + const T kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); + const T kBeta = + kAlpha * static_cast(GELU_CONSTANT) * static_cast(3); + const auto y = + (kAlpha * ((static_cast(GELU_CONSTANT) * x.cube()) + x)).tanh(); + dx.device(d) = static_cast(0.5) * dout * + (static_cast(1) + y + + (x - x * y.square()) * (kAlpha + kBeta * x.square())); + } + } else { +#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ + !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) && \ + !defined(PADDLE_WITH_HIP) + auto x_data = x.data(); + auto dx_data = dx.data(); + auto dout_data = dout.data(); + int n = std::min(x.size(), dx.size()); + + auto first = static_cast(std::malloc(n * sizeof(T))); + std::memset(first, 0, n * sizeof(T)); + auto second = static_cast(std::malloc(n * sizeof(T))); + std::memset(second, 0, n * sizeof(T)); + + // first = (0.5 * (1 + erf(x / sqrt(2)))) + phi::funcs::CBlas::AXPY( + n, static_cast(M_SQRT1_2), x_data, 1, first, 1); + phi::funcs::CBlas::VMERF(n, first, first, VML_LA); + for (int i = 0; i < n; i++) { + first[i] += static_cast(1); + } + phi::funcs::CBlas::SCAL(n, static_cast(0.5), first, 1); + + // second = (0.5 * 2/sqrt(pi) * 1/sqrt(2) * x * exp(-0.5 * x^2)) + phi::funcs::CBlas::VSQUARE(n, x_data, second); + phi::funcs::CBlas::SCAL(n, -static_cast(0.5), second, 1); + phi::funcs::CBlas::VEXP(n, second, second); + phi::funcs::CBlas::VMUL(n, x_data, second, second); + phi::funcs::CBlas::SCAL( + n, static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2), second, 1); + + // dx = dout * (first + second); + phi::funcs::CBlas::VADD(n, first, second, first); + phi::funcs::CBlas::VMUL(n, dout_data, first, dx_data); + + std::free(first); + std::free(second); +#else + // gelu_grad(x) = dout * 0.5 * (1 + erf(x / sqrt(2)) + x * sqrt(2 / pi) * + // exp(- x^2 / 2) + if (std::is_same::value) { + VLOG(4) << "cast from float16 to float before computing"; + auto casted_x = x.template cast(); + auto casted_dout = dout.template cast(); + auto first = static_cast(0.5) * + (static_cast(1) + + ((casted_x * static_cast(M_SQRT1_2)).erf())); + auto second = static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2) * + casted_x * + (-static_cast(0.5) * casted_x.square()).exp(); + dx.device(d) = (casted_dout * (first + second)).template cast(); + } else { + auto first = + static_cast(0.5) * + (static_cast(1) + ((x * 
static_cast(M_SQRT1_2)).erf())); + + auto second = static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2) * x * + (-static_cast(0.5) * x.square()).exp(); + dx.device(d) = dout * (first + second); + } +#endif + } + } +}; + +template +void GeluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + bool approximate, + DenseTensor* x_grad) { + dev_ctx.template Alloc(x_grad); + auto eigen_x = EigenVector::Flatten(x); + auto eigen_out_grad = EigenVector::Flatten(out_grad); + auto eigen_x_grad = EigenVector::Flatten(*x_grad); + auto& dev = *dev_ctx.eigen_device(); + + GeluGradFunctor functor; + functor(dev, eigen_x, eigen_out_grad, eigen_x_grad, approximate); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + gelu_grad, CPU, ALL_LAYOUT, phi::GeluGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/gelu_kernel.cc b/paddle/phi/kernels/cpu/gelu_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..d7af220574565ea96706c2a87aec6751c9203af4 --- /dev/null +++ b/paddle/phi/kernels/cpu/gelu_kernel.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/gelu_kernel.h" +#include +#include +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/blas/blas_impl.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" + +namespace phi { + +template +struct GeluFunctor { + template + void operator()(Device d, X x, Out out, bool approximate) const { + if (approximate) { + // gelu(x) = 0.5 * x * (1 + tanh(sqrt(2 / \pi) * (x + 0.044715 * x^{3}))) + if (std::is_same::value) { + VLOG(4) << "cast from float16 to float before computing"; + auto casted_x = x.template cast(); + auto temp = + (static_cast(M_2_SQRTPI * M_SQRT1_2) * + (casted_x + static_cast(GELU_CONSTANT) * casted_x.cube())) + .tanh(); + out.device(d) = (casted_x * static_cast(0.5) * + (static_cast(1) + temp)) + .template cast(); + } else { + auto temp = (static_cast(M_2_SQRTPI * M_SQRT1_2) * + (x + static_cast(GELU_CONSTANT) * x.cube())) + .tanh(); + out.device(d) = x * static_cast(0.5) * (static_cast(1) + temp); + } + } else { +#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ + !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) && \ + !defined(PADDLE_WITH_HIP) + auto x_data = x.data(); + auto out_data = out.data(); + int n = std::min(x.size(), out.size()); + + std::memset(out_data, 0, n * sizeof(T)); + phi::funcs::CBlas::AXPY( + n, static_cast(M_SQRT1_2), x_data, 1, out_data, 1); + phi::funcs::CBlas::VMERF(n, out_data, out_data, VML_LA); + for (int i = 0; i < n; i++) { + out_data[i] += static_cast(1); + } + phi::funcs::CBlas::VMUL(n, x_data, out_data, out_data); + for (int i = 0; i < n; i++) { + out_data[i] *= static_cast(0.5); + } +#else + // gelu(x) = 0.5 * x * (1 + 
erf(x / sqrt(2))) + if (std::is_same::value) { + VLOG(4) << "cast from float16 to float before computing"; + auto casted_x = x.template cast(); + auto temp = (casted_x * static_cast(M_SQRT1_2)).erf(); + out.device(d) = (casted_x * static_cast(0.5) * + (static_cast(1) + temp)) + .template cast(); + } else { + auto temp = (x * static_cast(M_SQRT1_2)).erf(); + out.device(d) = x * static_cast(0.5) * (static_cast(1) + temp); + } +#endif + } + } +}; + +template +void GeluKernel(const Context& dev_ctx, + const DenseTensor& x, + bool approximate, + DenseTensor* out) { + dev_ctx.template Alloc(out); + auto eigen_out = EigenVector::Flatten(*out); + auto eigen_x = EigenVector::Flatten(x); + auto& dev = *dev_ctx.eigen_device(); + + GeluFunctor functor; + functor(dev, eigen_x, eigen_out, approximate); +} + +} // namespace phi + +PD_REGISTER_KERNEL(gelu, CPU, ALL_LAYOUT, phi::GeluKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/grid_sample_grad_kernel.cc b/paddle/phi/kernels/cpu/grid_sample_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..923cb8424115e00f07274f959ffe34adaa9a0327 --- /dev/null +++ b/paddle/phi/kernels/cpu/grid_sample_grad_kernel.cc @@ -0,0 +1,357 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
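Both branches of GeluFunctor above (the MKL path and the Eigen path) compute the same two variants of GELU; as a plain scalar reference, assuming GELU_CONSTANT is the usual 0.044715:

#include <cmath>

// Exact form: 0.5 * x * (1 + erf(x / sqrt(2)))
double GeluExactRef(double x) {
  return 0.5 * x * (1.0 + std::erf(x * M_SQRT1_2));
}

// Tanh approximation: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
double GeluApproxRef(double x) {
  const double k_alpha = std::sqrt(2.0 / M_PI);
  return 0.5 * x * (1.0 + std::tanh(k_alpha * (x + 0.044715 * x * x * x)));
}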
+ +#include "paddle/phi/kernels/grid_sample_grad_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/grid_sample_utils.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +static inline void ClipWithMask(const CPUContext& ctx, + const int max_val, // height-1 or width-1 + bool align_corners, + std::string padding_mode, + DenseTensor* grid_slice, + DenseTensor* grid_scale) { + auto& place = *ctx.eigen_device(); + grid_scale->Resize(grid_slice->dims()); + ctx.Alloc(grid_scale); + + auto grid_slice_t = EigenTensor::From(*grid_slice); + auto factor = static_cast(max_val * 0.5); + if (!align_corners) { + factor = static_cast((max_val + 1) * 0.5); + } + auto grid_scale_t = EigenTensor::From(*grid_scale).setConstant(factor); + + if (padding_mode == "border") { + // auto bounded_lo = grid_slice_t.cwiseMax(static_cast(0)); + auto res = grid_slice_t.cwiseMax(static_cast(0)) + .cwiseMin(static_cast(max_val)); + + auto in_bound = (res == grid_slice_t); + grid_scale_t.device(place) = grid_scale_t * in_bound.template cast(); + grid_slice_t.device(place) = res; + } else if (padding_mode == "reflection") { + if (align_corners) { + auto double_range = static_cast(max_val * 2); + auto is_neg = (grid_slice_t < static_cast(0)); + auto grid_abs = grid_slice_t.abs(); + auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; + auto one_more_flip = (extra > (double_range - extra)); + grid_scale_t.device(place) = + grid_scale_t * ((is_neg == one_more_flip).template cast() - + (is_neg != one_more_flip).template cast()); + grid_slice_t.device(place) = extra.cwiseMin(double_range - extra); + if (max_val == 0) { + grid_slice_t.device(place) = grid_slice_t.constant(static_cast(0)); + } + } else { + auto double_range = static_cast((max_val + 1) * 2); + auto grid_abs = (grid_slice_t + static_cast(0.5)).abs(); + auto is_neg = ((grid_slice_t + static_cast(0.5)) < static_cast(0)); + auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; + auto one_more_flip = (extra > (double_range - extra)); + auto reflected = + extra.cwiseMin(double_range - extra) - static_cast(0.5); + auto clipped = reflected.cwiseMax(static_cast(0)) + .cwiseMin(static_cast(max_val)); + auto in_bound = (clipped == reflected).template cast(); + grid_scale_t.device(place) = + grid_scale_t * ((is_neg == one_more_flip).template cast() - + (is_neg != one_more_flip).template cast()) * + in_bound; + grid_slice_t.device(place) = clipped; + } + } +} + +template +static void CalcGridLocationsWithGrad(const CPUContext& ctx, + const DenseTensor& grid, + const int in_h, + const int in_w, + bool align_corners, + std::string padding_mode, + DenseTensor* grid_x, + DenseTensor* grid_y, + DenseTensor* grid_x_scale, + DenseTensor* grid_y_scale) { + const int n = grid.dims()[0]; + const int out_h = grid.dims()[1]; + const int out_w = grid.dims()[2]; + + // split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim + grid_x->Resize({n, out_h, out_w}); + grid_y->Resize({n, out_h, out_w}); + T* grid_x_data = ctx.Alloc(grid_x); + T* grid_y_data = ctx.Alloc(grid_y); + + const T* grid_data = grid.data(); + for (int i = 0; i < n * out_h * out_w; i++) { + grid_x_data[i] = grid_data[2 * i]; + grid_y_data[i] = grid_data[(2 * i) + 1]; + } + + Unnormalize(ctx, grid_x, in_w - 1, align_corners); + Unnormalize(ctx, grid_y, in_h - 1, align_corners); + + ClipWithMask( + ctx, in_w - 1, align_corners, padding_mode, grid_x, 
grid_x_scale); + ClipWithMask( + ctx, in_h - 1, align_corners, padding_mode, grid_y, grid_y_scale); +} + +template +static void GatherOutputGradToInputGrad(const DenseTensor& output_grad, + DenseTensor* input_grad, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& d1, + const DenseTensor& d2) { + const int n = output_grad.dims()[0]; + const int c = output_grad.dims()[1]; + const int out_h = output_grad.dims()[2]; + const int out_w = output_grad.dims()[3]; + const int in_h = input_grad->dims()[2]; + const int in_w = input_grad->dims()[3]; + auto x_t = EigenTensor::From(x); + auto y_t = EigenTensor::From(y); + auto d1_t = EigenTensor::From(d1); + auto d2_t = EigenTensor::From(d2); + auto input_grad_t = EigenTensor::From(*input_grad); + auto output_grad_t = EigenTensor::From(output_grad); + + for (int i = 0; i < n; i++) { + for (int k = 0; k < out_h; k++) { + for (int l = 0; l < out_w; l++) { + if (IsInBound( + x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), (T)(in_h - 1))) { + for (int j = 0; j < c; j++) { + input_grad_t(i, + j, + static_cast(round(y_t(i, k, l))), + static_cast(round(x_t(i, k, l)))) += + output_grad_t(i, j, k, l) * d1_t(i, k, l) * d2_t(i, k, l); + } + } + } + } + } +} + +template +static void GatherBilinearGrad(const CPUContext& ctx, + const DenseTensor& input, + const DenseTensor& output_grad, + DenseTensor* grid_x, + DenseTensor* grid_y, + DenseTensor* grid_x_scale, + DenseTensor* grid_y_scale, + DenseTensor* input_grad, + DenseTensor* grid_grad) { + const int n = grid_x->dims()[0]; + const int out_h = grid_x->dims()[1]; + const int out_w = grid_x->dims()[2]; + const int c = input.dims()[1]; + + DenseTensor x_w, x_e, y_n, y_s; + DenseTensor d_w, d_e, d_n, d_s; + DenseTensor v_wn, v_en, v_ws, v_es; + + AllNeigbors(ctx, + input, + grid_x, // grid_x + grid_y, // grid_y + &x_w, + &x_e, + &y_n, + &y_s, + &d_w, + &d_e, + &d_n, + &d_s, + &v_wn, + &v_en, + &v_ws, + &v_es); + + // gather output grad value to input grad by corner point coords and weight + GatherOutputGradToInputGrad(output_grad, input_grad, x_w, y_n, d_e, d_s); + GatherOutputGradToInputGrad(output_grad, input_grad, x_w, y_s, d_e, d_n); + GatherOutputGradToInputGrad(output_grad, input_grad, x_e, y_n, d_w, d_s); + GatherOutputGradToInputGrad(output_grad, input_grad, x_e, y_s, d_w, d_n); + + auto v_wn_t = EigenTensor::From(v_wn); + auto v_en_t = EigenTensor::From(v_en); + auto v_ws_t = EigenTensor::From(v_ws); + auto v_es_t = EigenTensor::From(v_es); + + auto d_w_t = EigenTensor::From(d_w); + auto d_e_t = EigenTensor::From(d_e); + auto d_n_t = EigenTensor::From(d_n); + auto d_s_t = EigenTensor::From(d_s); + + auto output_grad_t = EigenTensor::From(output_grad); + + if (grid_grad != nullptr) { + DenseTensor grid_grad_x, grid_grad_y; + grid_grad_x.Resize({n, out_h, out_w}); + grid_grad_y.Resize({n, out_h, out_w}); + ctx.Alloc(&grid_grad_x); + ctx.Alloc(&grid_grad_y); + auto grid_grad_x_t = + EigenTensor::From(grid_grad_x).setConstant(static_cast(0.0)); + auto grid_grad_y_t = + EigenTensor::From(grid_grad_y).setConstant(static_cast(0.0)); + for (int i = 0; i < n; i++) { + for (int j = 0; j < c; j++) { + for (int k = 0; k < out_h; k++) { + for (int l = 0; l < out_w; l++) { + grid_grad_x_t(i, k, l) += + ((v_en_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_s_t(i, k, l) + + (v_es_t(i, j, k, l) - v_ws_t(i, j, k, l)) * d_n_t(i, k, l)) * + output_grad_t(i, j, k, l); + grid_grad_y_t(i, k, l) += + ((v_ws_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_e_t(i, k, l) + + (v_es_t(i, j, k, l) - v_en_t(i, j, k, l)) * d_w_t(i, k, l)) * + 
output_grad_t(i, j, k, l); + } + } + } + } + + // const T x_max = static_cast(in_w - 1); + // const T y_max = static_cast(in_h - 1); + + auto grid_x_scale_t = EigenTensor::From(*grid_x_scale); + auto grid_y_scale_t = EigenTensor::From(*grid_y_scale); + grid_grad_x_t = grid_grad_x_t * grid_x_scale_t; + grid_grad_y_t = grid_grad_y_t * grid_y_scale_t; + + // gather grid_grad [x, y] in 3rd Dim + T* grid_grad_data = grid_grad->data(); + T* grid_grad_x_data = grid_grad_x.data(); + T* grid_grad_y_data = grid_grad_y.data(); + for (int i = 0; i < n * out_h * out_w; i++) { + grid_grad_data[2 * i] = grid_grad_x_data[i]; + grid_grad_data[2 * i + 1] = grid_grad_y_data[i]; + } + } +} + +template +static void GatherOutputGradToInputGrad(const DenseTensor& output_grad, + DenseTensor* input_grad, + const DenseTensor& x, + const DenseTensor& y) { + const int n = output_grad.dims()[0]; + const int c = output_grad.dims()[1]; + const int out_h = output_grad.dims()[2]; + const int out_w = output_grad.dims()[3]; + const int in_h = input_grad->dims()[2]; + const int in_w = input_grad->dims()[3]; + auto x_t = EigenTensor::From(x); + auto y_t = EigenTensor::From(y); + auto input_grad_t = EigenTensor::From(*input_grad); + auto output_grad_t = EigenTensor::From(output_grad); + for (int i = 0; i < n; i++) { + for (int k = 0; k < out_h; k++) { + for (int l = 0; l < out_w; l++) { + if (IsInBound( + x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), (T)(in_h - 1))) { + for (int j = 0; j < c; j++) { + input_grad_t(i, + j, + static_cast(round(y_t(i, k, l))), + static_cast(round(x_t(i, k, l)))) += + output_grad_t(i, j, k, l); + } + } + } + } + } +} + +template +void GridSampleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& grid, + const DenseTensor& out_grid, + const std::string& mode, + const std::string& padding_mode, + bool align_corners, + DenseTensor* x_grad, + DenseTensor* grid_grad) { + const int n = grid.dims()[0]; + const int out_h = grid.dims()[1]; + const int out_w = grid.dims()[2]; + const int c = x.dims()[1]; + const int in_h = x.dims()[2]; + const int in_w = x.dims()[3]; + + x_grad->Resize({n, c, in_h, in_w}); + dev_ctx.template Alloc(x_grad); + phi::funcs::SetConstant()(dev_ctx, x_grad, static_cast(0)); + + if (grid_grad != nullptr) { + grid_grad->Resize({n, out_h, out_w, 2}); + dev_ctx.template Alloc(grid_grad); + phi::funcs::SetConstant()( + dev_ctx, grid_grad, static_cast(0)); + } + + DenseTensor grid_x, grid_y; + DenseTensor grid_x_scale, grid_y_scale; + CalcGridLocationsWithGrad(dev_ctx, + grid, + in_h, + in_w, + align_corners, + padding_mode, + &grid_x, + &grid_y, + &grid_x_scale, + &grid_y_scale); + if (mode == "bilinear") { + GatherBilinearGrad(dev_ctx, + x, + out_grid, + &grid_x, + &grid_y, + &grid_x_scale, + &grid_y_scale, + x_grad, + grid_grad); + } else { + auto grid_x_t = EigenTensor::From(grid_x); + auto grid_y_t = EigenTensor::From(grid_y); + grid_x_t = grid_x_t.round(); + grid_y_t = grid_y_t.round(); + GatherOutputGradToInputGrad(out_grid, x_grad, grid_x, grid_y); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(grid_sample_grad, + CPU, + ALL_LAYOUT, + phi::GridSampleGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/grid_sample_kernel.cc b/paddle/phi/kernels/cpu/grid_sample_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..92a528cdda96a191cf73115feb3cf3dd3656305d --- /dev/null +++ b/paddle/phi/kernels/cpu/grid_sample_kernel.cc @@ -0,0 +1,184 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
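In formula form, the nested loops in GatherBilinearGrad above differentiate the bilinear value
v = v_wn * d_e * d_s + v_en * d_w * d_s + v_ws * d_e * d_n + v_es * d_w * d_n
with respect to the sampling coordinates:

\[
\frac{\partial v}{\partial x} = (v_{en} - v_{wn})\, d_s + (v_{es} - v_{ws})\, d_n,
\qquad
\frac{\partial v}{\partial y} = (v_{ws} - v_{wn})\, d_e + (v_{es} - v_{en})\, d_w,
\]

and each term is weighted by output_grad and then multiplied by grid_x_scale / grid_y_scale, the chain-rule factors coming from Unnormalize and ClipWithMask, before being interleaved into the last dimension of grid_grad.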
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/grid_sample_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/grid_sample_utils.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +using Array4 = Eigen::DSizes; + +template +static inline void Clip(const CPUContext& ctx, + DenseTensor* grid_slice, + const int max_val, // height-1 or width-1 + bool align_corners, + std::string padding_mode) { + auto& place = *ctx.eigen_device(); + auto grid_slice_t = EigenTensor::From(*grid_slice); + if (padding_mode == "border") { + grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast(0)) + .cwiseMin(static_cast(max_val)); + } else if (padding_mode == "reflection") { + if (align_corners) { + auto double_range = static_cast(max_val * 2); + auto grid_abs = grid_slice_t.abs(); + auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; + grid_slice_t.device(place) = extra.cwiseMin(double_range - extra); + if (max_val == 0) { + grid_slice_t.device(place) = grid_slice_t.constant(static_cast(0)); + } + } else { + auto double_range = static_cast((max_val + 1) * 2); + auto grid_abs = (grid_slice_t + static_cast(0.5)).abs(); + auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; + grid_slice_t.device(place) = + extra.cwiseMin(double_range - extra) - static_cast(0.5); + grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast(0)) + .cwiseMin(static_cast(max_val)); + } + } +} + +template +static void CalcGridLocations(const CPUContext& ctx, + const DenseTensor& grid, + const int in_h, + const int in_w, + bool align_corners, + std::string padding_mode, + DenseTensor* grid_x, + DenseTensor* grid_y) { + const int n = grid.dims()[0]; + const int out_h = grid.dims()[1]; + const int out_w = grid.dims()[2]; + + // split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim + grid_x->Resize({n, out_h, out_w}); + grid_y->Resize({n, out_h, out_w}); + T* grid_x_data = ctx.Alloc(grid_x); + T* grid_y_data = ctx.Alloc(grid_y); + const T* grid_data = grid.data(); + for (int i = 0; i < n * out_h * out_w; i++) { + grid_x_data[i] = grid_data[2 * i]; + grid_y_data[i] = grid_data[(2 * i) + 1]; + } + + Unnormalize(ctx, grid_x, in_w - 1, align_corners); + Unnormalize(ctx, grid_y, in_h - 1, align_corners); + + Clip(ctx, grid_x, in_w - 1, align_corners, padding_mode); + Clip(ctx, grid_y, in_h - 1, align_corners, padding_mode); +} + +template +static void BilinearInter(const CPUContext& ctx, + const DenseTensor& input, + DenseTensor* grid_x, + DenseTensor* grid_y, + DenseTensor* out) { + auto& place = *ctx.eigen_device(); + const int n = grid_x->dims()[0]; + const int out_h = grid_x->dims()[1]; + const int out_w = grid_x->dims()[2]; + const int c = input.dims()[1]; + + DenseTensor x_w, x_e, y_n, y_s; + DenseTensor d_w, d_e, d_n, d_s; + DenseTensor v_wn, v_en, v_ws, v_es; + + AllNeigbors(ctx, + input, + grid_x, + grid_y, + &x_w, + 
&x_e, + &y_n, + &y_s, + &d_w, + &d_e, + &d_n, + &d_s, + &v_wn, + &v_en, + &v_ws, + &v_es); + + auto d_w_t = EigenTensor::From(d_w); + auto d_e_t = EigenTensor::From(d_e); + auto d_n_t = EigenTensor::From(d_n); + auto d_s_t = EigenTensor::From(d_s); + + auto d_w_scaled_t = + d_w_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); + auto d_e_scaled_t = + d_e_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); + auto d_n_scaled_t = + d_n_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); + auto d_s_scaled_t = + d_s_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); + auto v_wn_t = EigenTensor::From(v_wn); + auto v_en_t = EigenTensor::From(v_en); + auto v_ws_t = EigenTensor::From(v_ws); + auto v_es_t = EigenTensor::From(v_es); + auto output_t = EigenTensor::From(*out); + // bilinear interpolaetion by 4 corner points + output_t.device(place) = v_wn_t * d_e_scaled_t * d_s_scaled_t + + v_en_t * d_w_scaled_t * d_s_scaled_t + + v_ws_t * d_e_scaled_t * d_n_scaled_t + + v_es_t * d_w_scaled_t * d_n_scaled_t; +} + +template +void GridSampleKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& grid, + const std::string& mode, + const std::string& padding_mode, + bool align_corners, + DenseTensor* out) { + const int n = grid.dims()[0]; + const int out_h = grid.dims()[1]; + const int out_w = grid.dims()[2]; + const int c = x.dims()[1]; + const int in_h = x.dims()[2]; + const int in_w = x.dims()[3]; + + out->Resize(phi::make_ddim({n, c, out_h, out_w})); + dev_ctx.template Alloc(out); + phi::funcs::SetConstant()(dev_ctx, out, static_cast(0)); + + DenseTensor grid_x, grid_y; + CalcGridLocations( + dev_ctx, grid, in_h, in_w, align_corners, padding_mode, &grid_x, &grid_y); + + if (mode == "bilinear") { + BilinearInter(dev_ctx, x, &grid_x, &grid_y, out); + } else if (mode == "nearest") { + auto grid_x_t = EigenTensor::From(grid_x); + auto grid_y_t = EigenTensor::From(grid_y); + grid_x_t = grid_x_t.round(); + grid_y_t = grid_y_t.round(); + GetGridPointValue(x, out, grid_x, grid_y); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + grid_sample, CPU, ALL_LAYOUT, phi::GridSampleKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/grid_sample_utils.h b/paddle/phi/kernels/cpu/grid_sample_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..53a16446d7e8c65b3d2d63835e6a2b86c1f96795 --- /dev/null +++ b/paddle/phi/kernels/cpu/grid_sample_utils.h @@ -0,0 +1,160 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +void Unnormalize(const CPUContext& ctx, + DenseTensor* grid_slice, + const int max_val, // height-1 or width-1 + bool align_corners) { + auto& place = *ctx.eigen_device(); + auto grid_slice_t = EigenTensor::From(*grid_slice); + + if (!align_corners) { + auto factor = static_cast((max_val + 1) * 0.5); + grid_slice_t.device(place) = + (grid_slice_t + static_cast(1)) * factor - static_cast(0.5); + } else { + auto factor = static_cast(max_val * 0.5); + grid_slice_t.device(place) = (grid_slice_t + static_cast(1)) * factor; + } +} + +template +inline bool IsInBound(T x, T y, T x_max, T y_max) { + if (x < 0 || x > x_max || y < 0 || y > y_max) { + return false; + } + return true; +} + +template +void GetGridPointValue(const DenseTensor& input, + DenseTensor* output, + const DenseTensor& x, + const DenseTensor& y) { + const int n = input.dims()[0]; + const int c = input.dims()[1]; + const int in_h = input.dims()[2]; + const int in_w = input.dims()[3]; + const int out_h = x.dims()[1]; + const int out_w = x.dims()[2]; + auto x_t = EigenTensor::From(x); + auto y_t = EigenTensor::From(y); + auto output_t = EigenTensor::From(*output).setConstant((T)0); + auto input_t = EigenTensor::From(input); + + for (int i = 0; i < n; i++) { + for (int k = 0; k < out_h; k++) { + for (int l = 0; l < out_w; l++) { + if (IsInBound( + x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), (T)(in_h - 1))) { + for (int j = 0; j < c; j++) { + output_t(i, j, k, l) = + input_t(i, + j, + static_cast(round(y_t(i, k, l))), + static_cast(round(x_t(i, k, l)))); + } + } + } + } + } +} + +template +void AllNeigbors(const CPUContext& ctx, + const DenseTensor& input, + DenseTensor* grid_x, + DenseTensor* grid_y, + DenseTensor* x_w, + DenseTensor* x_e, + DenseTensor* y_n, + DenseTensor* y_s, // positions + DenseTensor* d_w, + DenseTensor* d_e, + DenseTensor* d_n, + DenseTensor* d_s, // distance + DenseTensor* v_wn, + DenseTensor* v_en, + DenseTensor* v_ws, + DenseTensor* v_es) { // values + auto& place = *ctx.eigen_device(); + + const int c = input.dims()[1]; + const int n = grid_x->dims()[0]; + const int out_h = grid_x->dims()[1]; + const int out_w = grid_x->dims()[2]; + // calculate coords of 4 corner points + x_w->Resize({n, out_h, out_w}); + x_e->Resize({n, out_h, out_w}); + y_n->Resize({n, out_h, out_w}); + y_s->Resize({n, out_h, out_w}); + ctx.Alloc(x_w); + ctx.Alloc(x_e); + ctx.Alloc(y_n); + ctx.Alloc(y_s); + auto x_w_t = EigenTensor::From(*x_w); + auto x_e_t = EigenTensor::From(*x_e); + auto y_n_t = EigenTensor::From(*y_n); + auto y_s_t = EigenTensor::From(*y_s); + + auto grid_x_t = EigenTensor::From(*grid_x); + auto grid_y_t = EigenTensor::From(*grid_y); + + x_w_t.device(place) = grid_x_t.floor(); + x_e_t.device(place) = x_w_t + static_cast(1); + y_n_t.device(place) = grid_y_t.floor(); + y_s_t.device(place) = y_n_t + static_cast(1); + + // calculate distances to 4 sides + d_w->Resize({n, out_h, out_w}); + d_e->Resize({n, out_h, out_w}); + d_n->Resize({n, out_h, out_w}); + d_s->Resize({n, out_h, out_w}); + ctx.Alloc(d_w); + ctx.Alloc(d_e); + ctx.Alloc(d_n); + ctx.Alloc(d_s); + auto d_w_t = EigenTensor::From(*d_w); + auto d_e_t = EigenTensor::From(*d_e); + auto d_n_t = EigenTensor::From(*d_n); + auto d_s_t = EigenTensor::From(*d_s); + d_w_t.device(place) = grid_x_t - x_w_t; + d_e_t.device(place) = x_e_t - grid_x_t; + d_n_t.device(place) = grid_y_t - y_n_t; + d_s_t.device(place) = y_s_t - 
grid_y_t; + + // calc 4 corner points value + v_wn->Resize({n, c, out_h, out_w}); + v_en->Resize({n, c, out_h, out_w}); + v_ws->Resize({n, c, out_h, out_w}); + v_es->Resize({n, c, out_h, out_w}); + ctx.Alloc(v_wn); + ctx.Alloc(v_en); + ctx.Alloc(v_ws); + ctx.Alloc(v_es); + GetGridPointValue(input, v_wn, *x_w, *y_n); + GetGridPointValue(input, v_en, *x_e, *y_n); + GetGridPointValue(input, v_ws, *x_w, *y_s); + GetGridPointValue(input, v_es, *x_e, *y_s); +} + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h b/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h new file mode 100644 index 0000000000000000000000000000000000000000..b79aab96c0fc2251f35fe93b525a03676e01fdb1 --- /dev/null +++ b/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h @@ -0,0 +1,110 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/operators/math/matrix_bit_code.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/selected_rows.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +namespace math = paddle::operators::math; + +template +void HierarchicalSigmoidGradKernelImpl( + const Context& ctx, + const DenseTensor& x, + const DenseTensor& w, + const DenseTensor& label, + const DenseTensor& pre_out, + const DenseTensor& out_grad, + paddle::optional path, + paddle::optional code, + paddle::optional bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + DenseTensor* x_grad, + DenseTensor* w_grad, + DenseTensor* bias_grad, + SelectedRows* w_grad_sr = nullptr) { + funcs::SetConstant zero; + DenseTensor pre_out_grad; + + pre_out_grad.Resize(pre_out.dims()); + ctx.template Alloc(&pre_out_grad); + ctx.template Alloc(x_grad); + zero(ctx, x_grad, static_cast(0.0)); + + bool is_custom = false; + if (path.get_ptr()) { + is_custom = true; + } + + std::unique_ptr> bit_code; + if (!is_custom) { + bit_code.reset(new math::MatrixBitCodeFunctor( + num_classes, label.template data())); + } else { + bit_code.reset(new math::MatrixBitCodeFunctor( + *(path.get_ptr()), *(code.get_ptr()), label.template data())); + } + + // softrelu derivative + + auto blas = funcs::GetBlas(ctx); + + auto* pre_out_grad_data = pre_out_grad.data(); + auto* pre_out_data = pre_out.template data(); + auto n = pre_out.numel(); + blas.VEXP(n, pre_out_data, pre_out_grad_data); + blas.VINV(n, pre_out_grad_data, pre_out_grad_data); + for (int64_t i = 0; i < n; ++i) { + pre_out_grad_data[i] = 1.0 - pre_out_grad_data[i]; + } + bit_code->Sub(&pre_out_grad); // the gradient of clip(w * x + b) + auto* out_grad_data = out_grad.template data(); + + int64_t dim0 = pre_out_grad.dims()[0]; + int64_t dim1 = pre_out_grad.dims()[1]; + for (int64_t i = 0; 
i < dim0; ++i) { + T tmp = out_grad_data[i]; + blas.SCAL(dim1, tmp, pre_out_grad_data + i * dim1); + } + // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to + // be consistent with the clipping in forward. + if (bias_grad) { + ctx.template Alloc(bias_grad); + zero(ctx, bias_grad, static_cast(0.0)); + bit_code->AddGrad(pre_out_grad, bias_grad); + } + ctx.template Alloc(w_grad); + zero(ctx, w_grad, static_cast(0.0)); + if (!is_sparse) { + bit_code->MulGradWeight(pre_out_grad, w_grad, x); + } else { + bit_code->MulGradWeight(pre_out_grad, w_grad_sr, x); + } + bit_code->MulGradError(pre_out_grad, w, x_grad); +} + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad_kernel.cc b/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..f64a1a8162a379bdad99c6519ef996a4203544a7 --- /dev/null +++ b/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad_kernel.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/hierarchical_sigmoid_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h" + +namespace phi { + +template +void HierarchicalSigmoidGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& w, + const DenseTensor& label, + const DenseTensor& pre_out, + const DenseTensor& out_grad, + paddle::optional path, + paddle::optional code, + paddle::optional bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + DenseTensor* x_grad, + DenseTensor* w_grad, + DenseTensor* bias_grad) { + HierarchicalSigmoidGradKernelImpl(ctx, + x, + w, + label, + pre_out, + out_grad, + path, + code, + bias, + num_classes, + remote_prefetch, + trainer_id, + height_sections, + epmap, + table_names, + is_sparse, + x_grad, + w_grad, + bias_grad); +} + +} // namespace phi + +PD_REGISTER_KERNEL(hierarchical_sigmoid_grad, + CPU, + ALL_LAYOUT, + phi::HierarchicalSigmoidGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/hierarchical_sigmoid_kernel.cc b/paddle/phi/kernels/cpu/hierarchical_sigmoid_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..096a54f9fb263d3c153ab687d83bb61c63b117d7 --- /dev/null +++ b/paddle/phi/kernels/cpu/hierarchical_sigmoid_kernel.cc @@ -0,0 +1,115 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
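The VEXP / VINV / (1 - x) sequence in the gradient code above recovers the logistic activation from the saved softplus value: if pre_out = log(1 + e^z), then 1 - exp(-pre_out) = sigmoid(z); the subsequent bit_code->Sub, per the surrounding comments, subtracts the target bit from it. A small standalone check of that identity (plain C++, illustrative only, not the phi blas wrappers):

#include <cmath>
#include <cstdio>

double Softplus(double z) { return std::log1p(std::exp(z)); }
double Sigmoid(double z) { return 1.0 / (1.0 + std::exp(-z)); }

int main() {
  const double zs[] = {-3.0, -0.5, 0.0, 1.2, 4.0};
  for (double z : zs) {
    double pre_out = Softplus(z);
    // Mirrors the kernel: exp(pre_out), then reciprocal, then 1 - x.
    double grad = 1.0 - 1.0 / std::exp(pre_out);
    std::printf("z=%5.2f  1-exp(-softplus(z))=%.6f  sigmoid(z)=%.6f\n",
                z, grad, Sigmoid(z));
  }
  return 0;
}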
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/hierarchical_sigmoid_kernel.h" + +#include "paddle/fluid/operators/clip_op.h" +#include "paddle/fluid/operators/math/matrix_bit_code.h" +#include "paddle/fluid/platform/transform.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/math_function_impl.h" + +namespace phi { + +namespace math = paddle::operators::math; + +template +void HierarchicalSigmoidKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& w, + const DenseTensor& label, + paddle::optional path, + paddle::optional code, + paddle::optional bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + DenseTensor* out, + DenseTensor* pre_out, + DenseTensor* w_out) { + size_t num_classes_st = static_cast(num_classes); + // for remote prefetch + + bool is_custom = false; + if (path.get_ptr()) { + is_custom = true; + } + int64_t code_length = path.get_ptr() ? path.get_ptr()->dims()[1] + : math::FindLastSet(num_classes_st - 1); + int64_t batch_size = x.dims()[0]; + DenseTensor sum; + pre_out->Resize(phi::make_ddim({batch_size, code_length})); + ctx.template Alloc(pre_out); + auto* pre_out_data = pre_out->data(); + auto pre_out_mat = EigenMatrix::From(*pre_out); + // Not all class(leaf) nodes' path lengths equal code_length, thus init as + // 0s can avoid out of path's loss. + funcs::SetConstant zero; + zero(ctx, pre_out, static_cast(0.0)); + auto& place = *ctx.eigen_device(); + funcs::RowwiseSum row_sum; + + std::unique_ptr> bit_code; + if (!is_custom) { + bit_code.reset(new math::MatrixBitCodeFunctor( + num_classes_st, label.template data())); + } else { + bit_code.reset(new math::MatrixBitCodeFunctor( + *(path.get_ptr()), *(code.get_ptr()), label.template data())); + } + + std::vector sum_dims({batch_size, 1UL}); + sum.Resize(phi::make_ddim(sum_dims)); + ctx.template Alloc(&sum); + auto sum_mat = EigenMatrix::From(sum); + ctx.template Alloc(out); + auto out_mat = EigenMatrix::From(*out); + if (bias.get_ptr()) { + bit_code->Add(*(bias.get_ptr()), pre_out); + } + bit_code->Mul(pre_out, w, x); + // clip to [-40, 40] + paddle::platform::Transform trans; + trans(ctx, + pre_out_data, + pre_out_data + pre_out->numel(), + pre_out_data, + paddle::operators::ClipFunctor(static_cast(-40.0), + static_cast(40.0))); + bit_code->Sum(*pre_out, out, static_cast(-1)); + // use softrelu to calculate cross entropy + pre_out_mat.device(place) = (static_cast(1.0) + pre_out_mat.exp()).log(); + row_sum(ctx, *pre_out, &sum); + // TODO(guosheng): Subtract the out of path's loss, since not all + // class(leaf) nodes' path lengths equal code_length. But it won't break the + // gradient check since both have the out of path's loss and will cancel out + // each other. 
+ out_mat.device(place) = sum_mat + out_mat; +} + +} // namespace phi + +PD_REGISTER_KERNEL(hierarchical_sigmoid, + CPU, + ALL_LAYOUT, + phi::HierarchicalSigmoidKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/index_select_grad_kernel.cc b/paddle/phi/kernels/cpu/index_select_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..9dd50e7df8f06dad1b4a4e51b48cda8d7e2c91eb --- /dev/null +++ b/paddle/phi/kernels/cpu/index_select_grad_kernel.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/index_select_grad_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/cpu/index_select_impl.h" + +namespace phi { + +template +void IndexSelectGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& out_grad, + int dim, + DenseTensor* x_grad) { + if (dim < 0) { + dim += out_grad.dims().size(); + } + const auto& index_type = index.dtype(); + + bool index_type_match = + index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ(index_type_match, + true, + phi::errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + index_type, + phi::DataType::INT32, + phi::DataType::INT64)); + + if (index_type == phi::DataType::INT32) { + IndexSelectGradInner(ctx, out_grad, index, x_grad, dim); + } else if (index_type == phi::DataType::INT64) { + IndexSelectGradInner( + ctx, out_grad, index, x_grad, dim); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(index_select_grad, + CPU, + ALL_LAYOUT, + phi::IndexSelectGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/index_select_impl.h b/paddle/phi/kernels/cpu/index_select_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..163174580ff785910cc749711b2f917391a691ff --- /dev/null +++ b/paddle/phi/kernels/cpu/index_select_impl.h @@ -0,0 +1,178 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
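The hierarchical sigmoid forward kernel above clips the per-node logits to [-40, 40] (via ClipFunctor) before taking log(1 + exp(.)). Without some bound, exp() overflows to inf for large single-precision inputs, while clipping at 40 changes the softplus result only negligibly. A minimal illustration, in plain C++ rather than the phi Transform/ClipFunctor path:

#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
  const float z = 100.f;  // an unusually large logit
  const float naive = std::log(1.f + std::exp(z));  // exp(100) overflows float to inf
  const float clipped_z = std::min(std::max(z, -40.f), 40.f);
  const float clipped = std::log(1.f + std::exp(clipped_z));  // finite, about 40
  std::printf("naive=%f  clipped=%f\n", naive, clipped);
  return 0;
}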
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +struct IndexSelectAdd { + void operator()(const Context& ctx, + int slice_size, + const T* src_pointer, + const T* p_pointer, + T* dist_pointer) { + for (int i = 0; i < slice_size; i++) { + dist_pointer[i] = src_pointer[i] + p_pointer[i]; + } + } +}; + +template +struct IndexSelectAdd< + Context, + T, + typename std::enable_if::value>::type> { + void operator()(const Context& ctx, + int slice_size, + const T* src_pointer, + const T* p_pointer, + T* dist_pointer) { + auto blas = phi::funcs::GetBlas(ctx); + blas.VADD(slice_size, src_pointer, p_pointer, dist_pointer); + } +}; + +template +void IndexSelectInner(const Context& ctx, + DenseTensor* input, + const DenseTensor& index, + DenseTensor* output, + int dim) { + auto input_dim = input->dims(); + auto input_dim_size = input_dim.size(); + auto output_dim = output->dims(); + auto index_size = index.dims()[0]; + + DenseTensor index_cpu_copy; + if (!paddle::platform::is_cpu_place(index.place())) { + phi::Copy(ctx, index, phi::CPUPlace(), true, &index_cpu_copy); + } + const IndexT* index_data = paddle::platform::is_cpu_place(index.place()) + ? index.data() + : index_cpu_copy.data(); + ctx.template Alloc(output); + + auto slice_size = 1; + for (auto i = dim + 1; i < input_dim_size; i++) { + slice_size *= input_dim[i]; + } + + auto outer_nums = 1; + for (auto i = 0; i < dim; i++) { + outer_nums *= input_dim[i]; + } + + for (int i = 0; i < index_size; i++) { + PADDLE_ENFORCE_GE( + index_data[i], + 0, + phi::errors::InvalidArgument( + "Variable value (index) of OP(index_select) " + "expected >= 0 and < %ld, but got %ld. Please check input " + "value.", + input_dim[dim], + index_data[i])); + PADDLE_ENFORCE_LT( + index_data[i], + input_dim[dim], + phi::errors::InvalidArgument( + "Variable value (index) of OP(index_select) " + "expected >= 0 and < %ld, but got %ld. 
Please check input " + "value.", + input_dim[dim], + index_data[i])); + } + + VLOG(3) << "Index_Select_Debug; outer_nums: " << outer_nums + << "; slice_size: " << slice_size << "; index_size: " << index_size; + + input->Resize(phi::make_ddim({outer_nums, input_dim[dim], slice_size})); + output->Resize(phi::make_ddim({outer_nums, index_size, slice_size})); + + auto input_tensor = EigenTensor::From(*input); + auto output_tensor = EigenTensor::From(*output); + + auto& place = *ctx.eigen_device(); + + for (auto j = 0; j < index_size; j++) { + IndexT index_value = index_data[j]; + auto output_t = output_tensor.chip(j, 1); + output_t.device(place) = input_tensor.chip(index_value, 1); + } + input->Resize(input_dim); + output->Resize(output_dim); +} + +template +void IndexSelectGradInner(const Context& ctx, + const DenseTensor& out_grad, + const DenseTensor& index, + DenseTensor* x_grad, + int dim) { + const T* input_data = out_grad.data(); + const IndexT* index_data = index.data(); + + const T* p_output = ctx.template Alloc(x_grad); + T* out_data = ctx.template Alloc(x_grad); + + auto input_dim = out_grad.dims(); + auto input_dim_size = input_dim.size(); + auto output_dim = x_grad->dims(); + + phi::funcs::SetConstant set_constant; + set_constant(ctx, x_grad, static_cast(0.0)); + + auto slice_size = 1; + for (auto i = dim + 1; i < input_dim_size; i++) { + slice_size *= input_dim[i]; + } + + auto input_width = slice_size * input_dim[dim]; + auto output_width = slice_size * output_dim[dim]; + + auto outer_nums = 1; + for (auto i = 0; i < dim; i++) { + outer_nums *= input_dim[i]; + } + + auto index_size = index.dims()[0]; + VLOG(3) << "Index_Select_Grad_Debug; outer_nums: " << outer_nums + << "; slice_size: " << slice_size << "; input_width: " << input_width + << "; output_width: " << output_width + << "; index_size: " << index_size; + + for (auto i = 0; i < outer_nums; i++) { + auto input_start_offset = i * input_width; + auto output_start_offset = i * output_width; + + for (auto j = 0; j < index_size; j++) { + IndexT index_value = index_data[j]; + auto src = input_data + input_start_offset + j * slice_size; + auto p_out = p_output + output_start_offset + index_value * slice_size; + auto dst = out_data + output_start_offset + index_value * slice_size; + IndexSelectAdd index_select_add; + index_select_add(ctx, slice_size, src, p_out, dst); + } + } + x_grad->Resize(output_dim); +} + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/index_select_kernel.cc b/paddle/phi/kernels/cpu/index_select_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..5341ede6b2fd846ee3c14d092d166f2832e3bff7 --- /dev/null +++ b/paddle/phi/kernels/cpu/index_select_kernel.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
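IndexSelectInner above gathers slices along dim (out picks x at row index[j]), and IndexSelectGradInner scatters the output gradient back with accumulation, since an index may be selected more than once. A compact standalone version of both on flat row-major arrays (plain C++, illustrative names only, not the phi implementation):

#include <cstdint>
#include <cstdio>
#include <vector>

// Gather rows of a [rows x cols] row-major matrix: out[j] = x[index[j]].
std::vector<float> IndexSelectRows(const std::vector<float>& x, int cols,
                                   const std::vector<int64_t>& index) {
  std::vector<float> out(index.size() * cols);
  for (size_t j = 0; j < index.size(); ++j)
    for (int c = 0; c < cols; ++c) out[j * cols + c] = x[index[j] * cols + c];
  return out;
}

// Backward: accumulate each output-gradient row into the selected input row.
std::vector<float> IndexSelectRowsGrad(const std::vector<float>& out_grad,
                                       int cols, int rows,
                                       const std::vector<int64_t>& index) {
  std::vector<float> x_grad(rows * cols, 0.f);
  for (size_t j = 0; j < index.size(); ++j)
    for (int c = 0; c < cols; ++c)
      x_grad[index[j] * cols + c] += out_grad[j * cols + c];  // repeated indices add up
  return x_grad;
}

int main() {
  std::vector<float> x = {1, 2, 3, 4, 5, 6};  // 3 x 2
  std::vector<int64_t> idx = {2, 0, 2};       // row 2 picked twice
  auto y = IndexSelectRows(x, 2, idx);        // {5,6, 1,2, 5,6}
  auto gx = IndexSelectRowsGrad({1, 1, 1, 1, 1, 1}, 2, 3, idx);  // row 2 grad = 2
  std::printf("y[0]=%g  gx[4]=%g\n", y[0], gx[4]);  // 5  2
  return 0;
}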
+ +#include "paddle/phi/kernels/index_select_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/cpu/index_select_impl.h" + +namespace phi { + +template +void IndexSelectKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& index, + int dim, + DenseTensor* output) { + auto inputs = x; + if (dim < 0) { + dim += inputs.dims().size(); + } + const auto& index_type = index.dtype(); + + bool index_type_match = + index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ(index_type_match, + true, + phi::errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + index_type, + phi::DataType::INT32, + phi::DataType::INT64)); + + if (index_type == phi::DataType::INT32) { + IndexSelectInner(ctx, &inputs, index, output, dim); + } else if (index_type == phi::DataType::INT64) { + IndexSelectInner(ctx, &inputs, index, output, dim); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(index_select, + CPU, + ALL_LAYOUT, + phi::IndexSelectKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/isclose_kernel.cc b/paddle/phi/kernels/cpu/isclose_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..633c6ba093e42762e3d5b64415d6098c3add6b8a --- /dev/null +++ b/paddle/phi/kernels/cpu/isclose_kernel.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/isclose_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/isclose_kernel_impl.h" + +PD_REGISTER_KERNEL( + isclose, CPU, ALL_LAYOUT, phi::IscloseKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/kldiv_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/kldiv_loss_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..f9399d38d711f56305641c9f3170306bacdd6095 --- /dev/null +++ b/paddle/phi/kernels/cpu/kldiv_loss_grad_kernel.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/kldiv_loss_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/kldiv_loss_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + kldiv_loss_grad, CPU, ALL_LAYOUT, phi::KLDivLossGradKernel, float, double) { +} diff --git a/paddle/phi/kernels/cpu/kldiv_loss_kernel.cc b/paddle/phi/kernels/cpu/kldiv_loss_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..c462b8ec32c89dfcf2657018baf9b13764f2858e --- /dev/null +++ b/paddle/phi/kernels/cpu/kldiv_loss_kernel.cc @@ -0,0 +1,23 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/kldiv_loss_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/kldiv_loss_kernel_impl.h" + +namespace phi {} // namespace phi + +PD_REGISTER_KERNEL( + kldiv_loss, CPU, ALL_LAYOUT, phi::KLDivLossKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/shape_kernel.cc b/paddle/phi/kernels/cpu/kron_grad_kernel.cc similarity index 80% rename from paddle/phi/kernels/cpu/shape_kernel.cc rename to paddle/phi/kernels/cpu/kron_grad_kernel.cc index 073dc63b2a4348d4091af8c285f9ddebd799acc5..01f5e5404b61d3ac96ffd6a811e449eae260c27d 100644 --- a/paddle/phi/kernels/cpu/shape_kernel.cc +++ b/paddle/phi/kernels/cpu/kron_grad_kernel.cc @@ -12,22 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/shape_kernel.h" -#include "paddle/phi/kernels/impl/shape_kernel_impl.h" +#include "paddle/phi/kernels/kron_grad_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/kron_grad_kernel_impl.h" -PD_REGISTER_KERNEL(shape, +PD_REGISTER_KERNEL(kron_grad, CPU, ALL_LAYOUT, - phi::ShapeKernel, - bool, + phi::KronGradKernel, int, - int8_t, - uint8_t, int64_t, float, double, + phi::dtype::float16, phi::dtype::complex, phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/kron_kernel.cc b/paddle/phi/kernels/cpu/kron_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..aaea509dc7641b8b6c44b031c44e2b210c0cde39 --- /dev/null +++ b/paddle/phi/kernels/cpu/kron_kernel.cc @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/kron_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/kron_kernel_impl.h" + +PD_REGISTER_KERNEL(kron, + CPU, + ALL_LAYOUT, + phi::KronKernel, + int, + int64_t, + float, + double, + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/kthvalue_grad_kernel.cc b/paddle/phi/kernels/cpu/kthvalue_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..185d6cbedc85db83032cecd3c2f6cd1b0f46cbaf --- /dev/null +++ b/paddle/phi/kernels/cpu/kthvalue_grad_kernel.cc @@ -0,0 +1,168 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/kthvalue_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { +template +static void kthvalueAssign(const Type& input_height, + const Type& input_width, + const int& input_dim, + const DenseTensor* input, + const DenseTensor* indices, + T* output_data) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (Type i = 0; i < input_height; ++i) { + if (input_dim == 1) { + auto e_input = EigenVector::Flatten(*input); + auto e_indices = EigenVector::Flatten(*indices); + output_data[i * input_width + e_indices(0)] = e_input(0); + } else { + auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); + auto e_indices = EigenMatrix::Reshape(*indices, input_dim - 1); + output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0); + } + } +} + +template +void KthvalueGradKernel(const Context& dev_ctx, + const DenseTensor& d_out, + const DenseTensor& x, + const DenseTensor& indices, + int k, + int axis, + bool keepdim, + DenseTensor* d_x) { + auto in_dims = x.dims(); + auto out_dims = indices.dims(); + axis = (axis < 0) ? 
(in_dims.size() + axis) : axis; + if (!keepdim) { + std::vector tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(out_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(out_dims[i - 1]); + } + out_dims = phi::make_ddim(tmp_out_shape); + } + T* x_grad_data = dev_ctx.template Alloc(d_x); + if (axis == in_dims.size() - 1) { + const int64_t input_height = + phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t input_width = in_dims[in_dims.size() - 1]; + memset(x_grad_data, 0, d_x->numel() * sizeof(T)); + if (keepdim) { + kthvalueAssign(input_height, + input_width, + in_dims.size(), + &d_out, + &indices, + x_grad_data); + } else { + DenseTensor out_grad_tmp, indices_tmp; + out_grad_tmp.Resize(d_out.dims()); + indices_tmp.Resize(indices.dims()); + dev_ctx.template Alloc(&out_grad_tmp); + dev_ctx.template Alloc(&indices_tmp); + Copy(dev_ctx, d_out, dev_ctx.GetPlace(), false, &out_grad_tmp); + Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp); + out_grad_tmp.Resize(out_dims); + indices_tmp.Resize(out_dims); + kthvalueAssign(input_height, + input_width, + in_dims.size(), + &out_grad_tmp, + &indices_tmp, + x_grad_data); + } + } else { + std::vector trans; + for (int i = 0; i < axis; i++) { + trans.emplace_back(i); + } + trans.emplace_back(out_dims.size() - 1); + for (int i = axis + 1; i < out_dims.size() - 1; i++) { + trans.emplace_back(i); + } + trans.emplace_back(axis); + DDim trans_dims(out_dims); + DDim trans_in_dims(in_dims); + for (size_t i = 0; i < trans.size(); i++) { + trans_dims[i] = out_dims[trans[i]]; + trans_in_dims[i] = in_dims[trans[i]]; + } + DenseTensor trans_dO, trans_ind; + trans_dO.Resize(trans_dims); + trans_ind.Resize(trans_dims); + dev_ctx.template Alloc(&trans_dO); + dev_ctx.template Alloc(&trans_ind); + int ndims = trans.size(); + if (keepdim) { + funcs::TransCompute( + ndims, dev_ctx, d_out, &trans_dO, trans); + funcs::TransCompute( + ndims, dev_ctx, indices, &trans_ind, trans); + } else { + DenseTensor out_grad_tmp, indices_tmp; + out_grad_tmp.Resize(d_out.dims()); + indices_tmp.Resize(indices.dims()); + dev_ctx.template Alloc(&out_grad_tmp); + dev_ctx.template Alloc(&indices_tmp); + Copy(dev_ctx, d_out, dev_ctx.GetPlace(), false, &out_grad_tmp); + Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp); + out_grad_tmp.Resize(out_dims); + indices_tmp.Resize(out_dims); + funcs::TransCompute( + ndims, dev_ctx, out_grad_tmp, &trans_dO, trans); + funcs::TransCompute( + ndims, dev_ctx, indices_tmp, &trans_ind, trans); + } + const int64_t input_height = phi::product( + phi::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1)); + const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1]; + DenseTensor tmp_out; + tmp_out.Resize(trans_in_dims); + T* t_out = dev_ctx.template Alloc(&tmp_out); + memset(t_out, 0, d_x->numel() * sizeof(T)); + kthvalueAssign(input_height, + input_width, + in_dims.size(), + &trans_dO, + &trans_ind, + t_out); + funcs::TransCompute( + ndims, dev_ctx, tmp_out, d_x, trans); + } +} +} // namespace phi + +PD_REGISTER_KERNEL(kthvalue_grad, + CPU, + ALL_LAYOUT, + phi::KthvalueGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/kthvalue_kernel.cc b/paddle/phi/kernels/cpu/kthvalue_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..5e436623cae7bbc27903eff8e2bf01a41ded9c94 --- /dev/null +++ b/paddle/phi/kernels/cpu/kthvalue_kernel.cc @@ 
-0,0 +1,167 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/kthvalue_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { +template +static void getKthvalue(Type input_height, + Type input_width, + int input_dim, + const DenseTensor* input, + T* t_out, + Type* t_indices, + const int& k) { + bool partial_sort_flag = (k * 64) < input_width; +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (Type i = 0; i < input_height; ++i) { + std::vector> col_vec; + col_vec.reserve(input_width); + if (input_dim == 1) { + auto e_input = EigenVector::Flatten(*input); + for (Type j = 0; j < input_width; ++j) { + col_vec.emplace_back(std::pair(e_input(j), j)); + } + } else { + auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); + for (Type j = 0; j < input_width; ++j) { + col_vec.emplace_back(std::pair(e_input(i, j), j)); + } + } + if (partial_sort_flag) { + std::partial_sort( + col_vec.begin(), + col_vec.begin() + k, + col_vec.end(), + [](const std::pair& l, const std::pair& r) { + return (!std::isnan(static_cast(l.first)) && + std::isnan(static_cast(r.first))) || + (l.first < r.first); + }); + } else { + std::nth_element( + col_vec.begin(), + col_vec.begin() + k - 1, + col_vec.end(), + [](const std::pair& l, const std::pair& r) { + return (!std::isnan(static_cast(l.first)) && + std::isnan(static_cast(r.first))) || + (l.first < r.first); + }); + } + t_out[i] = col_vec[k - 1].first; + t_indices[i] = col_vec[k - 1].second; + } +} + +template +void KthvalueKernel(const Context& dev_ctx, + const DenseTensor& x, + int k, + int axis, + bool keepdim, + DenseTensor* output, + DenseTensor* indices) { + const auto& in_dims = x.dims(); + if (axis < 0) axis += in_dims.size(); + T* output_data = dev_ctx.template Alloc(output); + int64_t* indices_data = dev_ctx.template Alloc(indices); + auto out_dims = output->dims(); + if (axis == in_dims.size() - 1) { + const int64_t& input_height = + phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t& input_width = in_dims[in_dims.size() - 1]; + getKthvalue(input_height, + input_width, + in_dims.size(), + &x, + output_data, + indices_data, + k); + } else { + std::vector trans; + for (int i = 0; i < axis; i++) { + trans.emplace_back(i); + } + trans.emplace_back(in_dims.size() - 1); + for (int i = axis + 1; i < in_dims.size() - 1; i++) { + trans.emplace_back(i); + } + trans.emplace_back(axis); + if (!keepdim) { + std::vector tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + DDim tmp_out_dims = phi::make_ddim(tmp_out_shape); + output->Resize(tmp_out_dims); + 
indices->Resize(tmp_out_dims); + } + DDim trans_dims(in_dims); + DDim trans_out_dims(in_dims); + + for (size_t i = 0; i < trans.size(); i++) { + trans_dims[i] = in_dims[trans[i]]; + trans_out_dims[i] = in_dims[trans[i]]; + } + trans_out_dims[in_dims.size() - 1] = 1; + DenseTensor trans_inp; + trans_inp.Resize(trans_dims); + dev_ctx.template Alloc(&trans_inp); + int ndims = trans.size(); + funcs::TransCompute( + ndims, dev_ctx, x, &trans_inp, trans); + + const int64_t input_height = + phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); + const int64_t input_width = trans_dims[trans_dims.size() - 1]; + DenseTensor tmp_out, tmp_indices; + tmp_out.Resize(trans_out_dims); + T* t_out = dev_ctx.template Alloc(&tmp_out); + tmp_indices.Resize(trans_out_dims); + int64_t* t_ind = dev_ctx.template Alloc(&tmp_indices); + getKthvalue( + input_height, input_width, in_dims.size(), &trans_inp, t_out, t_ind, k); + funcs::TransCompute( + ndims, dev_ctx, tmp_indices, indices, trans); + funcs::TransCompute( + ndims, dev_ctx, tmp_out, output, trans); + if (!keepdim) { + output->Resize(out_dims); + indices->Resize(out_dims); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(kthvalue, + CPU, + ALL_LAYOUT, + phi::KthvalueKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..cee48ed96db1c60fb77dc7c870cb256b7ce0cb6e --- /dev/null +++ b/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc @@ -0,0 +1,186 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
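getKthvalue above switches between std::partial_sort (when k is small relative to the row width) and std::nth_element, with a comparator that pushes NaNs to the end. A stripped-down sketch of the selection step for a single row (plain C++, illustrative only):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

// Return (value, index) of the k-th smallest element (1-based k) of one row.
std::pair<float, int64_t> KthValue(std::vector<float> row, int k) {
  std::vector<std::pair<float, int64_t>> v;
  v.reserve(row.size());
  for (size_t j = 0; j < row.size(); ++j) v.emplace_back(row[j], j);
  auto cmp = [](const std::pair<float, int64_t>& l,
                const std::pair<float, int64_t>& r) {
    // Non-NaN values sort before NaNs; otherwise plain ascending order.
    return (!std::isnan(l.first) && std::isnan(r.first)) || l.first < r.first;
  };
  // nth_element partially orders v so that v[k-1] holds the k-th smallest.
  std::nth_element(v.begin(), v.begin() + k - 1, v.end(), cmp);
  return v[k - 1];
}

int main() {
  auto [val, idx] = KthValue({3.f, 1.f, 4.f, 1.5f, 9.f}, 2);
  std::printf("value=%g index=%lld\n", val, static_cast<long long>(idx));  // 1.5, 3
  return 0;
}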
+ +#include "paddle/phi/kernels/layer_norm_grad_kernel.h" +#include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/kernels/funcs/layer_norm_util.h" +#if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \ + !defined(__OSX__) +#include "paddle/fluid/operators/jit/kernels.h" +#endif +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void LayerNormGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& mean, + const DenseTensor& variance, + paddle::optional scale_opt, + paddle::optional bias_opt, + const DenseTensor& out_grad, + float epsilon, + int begin_norm_axis, + bool is_test, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad) { + auto* scale = scale_opt.get_ptr(); + auto d_y = out_grad; + + // init output + auto* d_x = x_grad; + auto* d_scale = scale_grad; + auto* d_bias = bias_grad; + + const auto& x_dims = x.dims(); + auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + DDim matrix_shape({left, right}); + + d_y.Resize(matrix_shape); + + funcs::ColwiseSum2D colwise_sum(left, right, dev_ctx); + DenseTensor x_tmp = x; + + DenseTensor temp; + DenseTensor temp_norm; + if (d_scale || d_x) { + x_tmp.Resize(matrix_shape); + temp.Resize(matrix_shape); + dev_ctx.template Alloc(&temp); + + temp_norm.Resize(matrix_shape); + dev_ctx.template Alloc(&temp_norm); + // get x_norm + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + x_tmp, + mean, + /*axis*/ 0, + funcs::SubtractFunctor(), + &temp_norm); + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + temp_norm, + variance, + /*axis*/ 0, + funcs::DivAndSqrtFunctor(static_cast(epsilon)), + &temp_norm); + } + + if (d_bias) { + dev_ctx.template Alloc(d_bias); + colwise_sum(dev_ctx, d_y, d_bias); + } + if (d_scale) { + dev_ctx.template Alloc(d_scale); + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, temp_norm, d_y, 0, funcs::MultiplyFunctor(), &temp); + colwise_sum(dev_ctx, temp, d_scale); + } + + if (d_x) { + DDim vec_shape({left}); + dev_ctx.template Alloc(d_x); + auto dx_dim = d_x->dims(); + DenseTensor temp_vec; + temp_vec.Resize(vec_shape); + dev_ctx.template Alloc(&temp_vec); + + funcs::RowwiseMean2D row_mean(left, right, dev_ctx); + + if (d_scale) { + // dy_dx + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, d_y, *scale, /*axis*/ 1, funcs::MultiplyFunctor(), &temp); + phi::Copy(dev_ctx, temp, dev_ctx.GetPlace(), false, d_x); + + // dy_dmean_dx + row_mean(dev_ctx, temp, &temp_vec); + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + *d_x, + temp_vec, + /*axis*/ 0, + funcs::SubtractFunctor(), + d_x); + + // dy_var_dx + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + temp, + temp_norm, + /*axis*/ 0, + funcs::MultiplyFunctor(), + &temp); + } else { + // dy_dx + phi::Copy(dev_ctx, d_y, dev_ctx.GetPlace(), false, d_x); + + // dy_dmean_dx + row_mean(dev_ctx, d_y, &temp_vec); + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + *d_x, + temp_vec, + /*axis*/ 0, + funcs::SubtractFunctor(), + d_x); + + // dy_var_dx + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + d_y, + temp_norm, + /*axis*/ 0, + funcs::MultiplyFunctor(), + 
&temp); + } + // dy_var_dx + row_mean(dev_ctx, temp, &temp_vec); + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + temp_norm, + temp_vec, + /*axis*/ 0, + funcs::MultiplyFunctor(), + &temp); + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, *d_x, temp, /*axis*/ 0, funcs::SubtractFunctor(), d_x); + + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + *d_x, + variance, + /*axis*/ 0, + funcs::DivAndSqrtFunctor(static_cast(epsilon)), + d_x); + d_x->Resize(dx_dim); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + layer_norm_grad, CPU, ALL_LAYOUT, phi::LayerNormGradKernel, float, double) { +} diff --git a/paddle/phi/kernels/cpu/layer_norm_kernel.cc b/paddle/phi/kernels/cpu/layer_norm_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..5b09d68c7ca081e9a6157857eea8338aaa93d34d --- /dev/null +++ b/paddle/phi/kernels/cpu/layer_norm_kernel.cc @@ -0,0 +1,145 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/layer_norm_kernel.h" +#include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/kernels/funcs/layer_norm_util.h" +#if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \ + !defined(__OSX__) +#include "paddle/fluid/operators/jit/kernels.h" +#endif +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void LayerNormKernel(const Context& dev_ctx, + const DenseTensor& x, + paddle::optional scale_opt, + paddle::optional bias_opt, + float epsilon, + int begin_norm_axis, + bool is_test, + DenseTensor* y, + DenseTensor* mean, + DenseTensor* var) { + const auto x_dims = x.dims(); + auto* scale = scale_opt.get_ptr(); + auto* bias = bias_opt.get_ptr(); + + dev_ctx.template Alloc(y); + dev_ctx.template Alloc(mean); + dev_ctx.template Alloc(var); + + auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + DDim matrix_shape({left, right}); + + auto x_tmp = x; + x_tmp.Resize(matrix_shape); + DenseTensor out; + out.ShareDataWith(*y); + out.Resize(matrix_shape); + +#if defined(PADDLE_WITH_CUDA) || defined(_WIN32) || defined(__APPLE__) || \ + defined(__OSX__) + + funcs::RowwiseMean2D row_mean(left, right, dev_ctx); + + // get mean + row_mean(dev_ctx, x_tmp, mean); + + // get variance + + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, x_tmp, *mean, 0, funcs::SubAndSquareFunctor(), &out); + + row_mean(dev_ctx, out, var); + + // get x_norm + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, x_tmp, *mean, 0, funcs::SubtractFunctor(), &out); + + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + out, + *var, + 0, + funcs::DivAndSqrtFunctor(static_cast(epsilon)), + &out); + + if (scale) { + 
phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, out, *scale, 1, funcs::MultiplyFunctor(), &out); + } + if (bias) { + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, out, *bias, 1, funcs::AddFunctor(), &out); + } +#else + PADDLE_ENFORCE_EQ(mean->numel(), + left, + phi::errors::InvalidArgument( + "mean's length (%d) is not equal with expected (%d).", + mean->numel(), + left)); + PADDLE_ENFORCE_EQ(var->numel(), + left, + phi::errors::InvalidArgument( + "var's length (%d) is not equal with expected (%d).", + var->numel(), + left)); + if (scale) { + PADDLE_ENFORCE_EQ( + scale->numel(), + right, + phi::errors::InvalidArgument( + "scale's length (%d) is not equal with expected (%d).", + scale->numel(), + right)); + } + if (bias) { + PADDLE_ENFORCE_EQ(bias->numel(), + right, + phi::errors::InvalidArgument( + "bias's length (%d) is not equal with expected (%d).", + bias->numel(), + right)); + } + + auto ker = paddle::operators::jit::KernelFuncs< + paddle::operators::jit::LayerNormTuple, + phi::CPUPlace>::Cache() + .At(right); + ker(x_tmp.data(), + out.data(), + mean->data(), + var->data(), + scale ? scale->data() : nullptr, + bias ? bias->data() : nullptr, + static_cast(left), + static_cast(epsilon), + right); +#endif +} + +} // namespace phi + +PD_REGISTER_KERNEL( + layer_norm, CPU, ALL_LAYOUT, phi::LayerNormKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/lgamma_grad_kernel.cc b/paddle/phi/kernels/cpu/lgamma_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..116fa3f8d3f6a91ec0705b92ff65aa2a411f4f23 --- /dev/null +++ b/paddle/phi/kernels/cpu/lgamma_grad_kernel.cc @@ -0,0 +1,20 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/lgamma_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h" +PD_REGISTER_KERNEL( + lgamma_grad, CPU, ALL_LAYOUT, phi::LgammaGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/lgamma_kernel.cc b/paddle/phi/kernels/cpu/lgamma_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..f849322174d295d95fcd9080e090d5a7ece0ec79 --- /dev/null +++ b/paddle/phi/kernels/cpu/lgamma_kernel.cc @@ -0,0 +1,51 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
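The CPU layer_norm path above normalizes each row of the flattened [left, right] view by its own mean and (population) variance, then applies the optional scale and bias: y = (x - mean) / sqrt(var + epsilon) * scale + bias. A per-row sketch of that computation (plain C++, illustrative only; the kernel itself uses Eigen broadcasts or the JIT kernel):

#include <cmath>
#include <cstdio>
#include <vector>

// Layer-normalize one row in place: subtract the row mean, divide by
// sqrt(variance + epsilon), then apply per-feature scale and bias.
void LayerNormRow(std::vector<float>& row, const std::vector<float>& scale,
                  const std::vector<float>& bias, float epsilon) {
  const float n = static_cast<float>(row.size());
  float mean = 0.f, var = 0.f;
  for (float v : row) mean += v;
  mean /= n;
  for (float v : row) var += (v - mean) * (v - mean);
  var /= n;  // biased (population) variance, as in the kernel's row-wise mean of squares
  const float inv_std = 1.f / std::sqrt(var + epsilon);
  for (size_t i = 0; i < row.size(); ++i)
    row[i] = (row[i] - mean) * inv_std * scale[i] + bias[i];
}

int main() {
  std::vector<float> row = {1.f, 2.f, 3.f, 4.f};
  LayerNormRow(row, {1.f, 1.f, 1.f, 1.f}, {0.f, 0.f, 0.f, 0.f}, 1e-5f);
  std::printf("%f %f\n", row[0], row[3]);  // about -1.342 and 1.342
  return 0;
}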
+ +#include "paddle/phi/kernels/lgamma_kernel.h" + +#include +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/for_range.h" + +namespace phi { +template +struct LgammaFunctor { + LgammaFunctor(const T* input, T* output, int64_t numel) + : input_(input), output_(output), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + output_[idx] = Eigen::numext::lgamma(input_[idx]); + } + + private: + const T* input_; + T* output_; + int64_t numel_; +}; + +template +void LgammaKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { + auto numel = x.numel(); + auto* x_data = x.data(); + auto* out_data = dev_ctx.template Alloc(out); + phi::funcs::ForRange for_range(dev_ctx, numel); + LgammaFunctor functor(x_data, out_data, numel); + for_range(functor); +} +} // namespace phi + +PD_REGISTER_KERNEL(lgamma, CPU, ALL_LAYOUT, phi::LgammaKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/log_softmax_grad_kernel.cc b/paddle/phi/kernels/cpu/log_softmax_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..5f344b9cc3fe0a4c71470c361f2e8f370bc5908a --- /dev/null +++ b/paddle/phi/kernels/cpu/log_softmax_grad_kernel.cc @@ -0,0 +1,88 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/log_softmax_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" + +namespace phi { + +template +using EigenMatrixTemplate = EigenMatrix; + +template +struct LogSoftmaxGradFunctor { + void operator()(const Context& context, + const DenseTensor* Y, + const DenseTensor* dY, + DenseTensor* dX, + const int axis) { + constexpr int kBatchDim = 0; + constexpr int kClassDim = 1; + + const int n = funcs::SizeToAxis(axis, Y->dims()); + const int d = funcs::SizeFromAxis(axis, Y->dims()); + phi::DDim dim_2d{n, d}; + + auto y = EigenMatrixTemplate::From(*Y, dim_2d); + auto dy = EigenMatrixTemplate::From(*dY, dim_2d); + auto dx = EigenMatrixTemplate::From(*dX, dim_2d); + + const int axis_dim = Y->dims()[axis]; + const int batch_size = y.dimension(kBatchDim); + const int num_classes = y.dimension(kClassDim); + const int num_remain = num_classes / axis_dim; + + Eigen::DSizes along_class(kClassDim); + Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); + Eigen::DSizes one_axis(1, axis_dim); + + dx.device(*context.eigen_device()) = + dy - + (y.exp()) * (dy.reshape(batch_axis_remain) + .sum(along_class) + .broadcast(one_axis)); + } +}; + +template +void LogSoftmaxGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad) { + const int rank = out.dims().size(); + const int canonical_axis = funcs::CanonicalAxis(axis, rank); + + dev_ctx.template Alloc(x_grad); + if (out.numel() != 0) { + LogSoftmaxGradFunctor()( + dev_ctx, &out, &out_grad, x_grad, canonical_axis); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(log_softmax_grad, + CPU, + ALL_LAYOUT, + phi::LogSoftmaxGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/log_softmax_kernel.cc b/paddle/phi/kernels/cpu/log_softmax_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..241742378cc5d012d2816745d0f83fc586089ef7 --- /dev/null +++ b/paddle/phi/kernels/cpu/log_softmax_kernel.cc @@ -0,0 +1,123 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/log_softmax_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" + +namespace phi { + +template +using EigenMatrixTemplate = EigenMatrix; + +template +struct ValueClip { + HOSTDEVICE T operator()(const T& x) const { + const T kThreshold = static_cast(-64.); + return x < kThreshold ? 
kThreshold : x; + } +}; + +template +struct LogSoftmaxFunctor { + void operator()(const Context& context, + const DenseTensor* X, + DenseTensor* Y, + const int axis) { + constexpr int kBatchDim = 0; + constexpr int kClassDim = 1; + constexpr int kAxisDim = 1; + + int axis_dim = X->dims()[axis]; + const int n = funcs::SizeToAxis(axis, X->dims()); + const int d = funcs::SizeFromAxis(axis, X->dims()); + phi::DDim dim_2d{n, d}; + + auto logits = EigenMatrixTemplate::From(*X, dim_2d); + auto log_softmax = EigenMatrixTemplate::From(*Y, dim_2d); + + const int batch_size = logits.dimension(kBatchDim); + const int num_classes = logits.dimension(kClassDim); + const int num_remain = num_classes / axis_dim; + + Eigen::DSizes along_axis(kAxisDim); + Eigen::DSizes batch_classes(batch_size, num_classes); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes one_by_class(1, num_classes); + Eigen::DSizes batch_one_remain(batch_size, 1, num_remain); + Eigen::DSizes one_axis_one(1, axis_dim, 1); + Eigen::DSizes one_axis(1, axis_dim); + Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); + + // For numerical stability, logits should be shifted by maximum number along + // axis, calculate shifted_logits into log_softmax tensor for memory reuse. + if (num_remain == 1) { + // axis == -1, axis and class in same dimension, calculate along + // class dimension directly for higher performance + log_softmax.device(*context.eigen_device()) = + (logits - + logits.maximum(along_axis) + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)) + .unaryExpr(ValueClip()); + } else { + // axis != -1, class dimension split into (axis, remain), max and sum + // should be calculated along axis dimension + log_softmax.device(*context.eigen_device()) = + (logits.reshape(batch_axis_remain) - + logits.reshape(batch_axis_remain) + .maximum(along_axis) + .eval() + .reshape(batch_one_remain) + .broadcast(one_axis_one) + .reshape(batch_classes)) + .unaryExpr(ValueClip()); + } + + log_softmax.device(*context.eigen_device()) = + log_softmax - + log_softmax.exp() + .eval() + .reshape(batch_axis_remain) + .sum(along_axis) + .log() + .broadcast(one_axis); + } +}; + +template +void LogSoftmaxKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + DenseTensor* out) { + const int rank = x.dims().size(); + const int canonical_axis = funcs::CanonicalAxis(axis, rank); + + dev_ctx.template Alloc(out); + if (x.numel() != 0) { + LogSoftmaxFunctor()(dev_ctx, &x, out, canonical_axis); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + log_softmax, CPU, ALL_LAYOUT, phi::LogSoftmaxKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/matrix_rank_kernel.cc b/paddle/phi/kernels/cpu/matrix_rank_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..5e13abe8aed2caf205871a24cfddff0b8b959498 --- /dev/null +++ b/paddle/phi/kernels/cpu/matrix_rank_kernel.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
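LogSoftmaxFunctor above shifts the logits by their maximum before exponentiating, the standard trick for numerical stability: log_softmax(x_i) = (x_i - m) - log(sum_j exp(x_j - m)) with m = max_j x_j. A single-row sketch (plain C++, without the ValueClip lower bound at -64 used in the kernel):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

std::vector<double> LogSoftmax(const std::vector<double>& x) {
  const double m = *std::max_element(x.begin(), x.end());
  double sum = 0.0;
  for (double v : x) sum += std::exp(v - m);  // every exponent is <= 0, so no overflow
  const double log_sum = std::log(sum);
  std::vector<double> out(x.size());
  for (size_t i = 0; i < x.size(); ++i) out[i] = (x[i] - m) - log_sum;
  return out;
}

int main() {
  // Works even for logits far outside exp()'s range, e.g. around 1000.
  auto out = LogSoftmax({1000.0, 1001.0, 1002.0});
  std::printf("%f %f %f\n", out[0], out[1], out[2]);  // about -2.408 -1.408 -0.408
  return 0;
}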
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/matrix_rank_kernel.h" +#include "paddle/phi/kernels/matrix_rank_tol_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/full_kernel.h" + +namespace phi { + +template +void MatrixRankKernel(const Context& dev_ctx, + const DenseTensor& x, + float tol, + bool use_default_tol, + bool hermitian, + DenseTensor* out) { + DenseTensor atol_tensor; + if (use_default_tol) { + atol_tensor = phi::Full(dev_ctx, {1}, static_cast(0)); + } else { + atol_tensor = phi::Full(dev_ctx, {1}, static_cast(tol)); + } + MatrixRankTolKernel( + dev_ctx, x, atol_tensor, use_default_tol, hermitian, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + matrix_rank, CPU, ALL_LAYOUT, phi::MatrixRankKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..ae1e406d16eec44168b2b7232586293bf90e4bd8 --- /dev/null +++ b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc @@ -0,0 +1,174 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/matrix_rank_tol_kernel.h" + +#include +#include +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/compare_functors.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h" +#include "paddle/phi/kernels/reduce_kernel.h" + +namespace phi { + +template +void BatchEigenvalues(const T* x_data, + T* eigenvalues_data, + int batches, + int rows, + int cols, + int k) { + // Eigen::Matrix API need non-const pointer. + T* input = const_cast(x_data); + int stride = rows * cols; + for (int i = 0; i < batches; i++) { + auto m = Eigen::Map< + Eigen::Matrix>( + input + i * stride, rows, rows); + Eigen::SelfAdjointEigenSolver< + Eigen::Matrix> + eigen_solver(m); + auto eigenvalues = eigen_solver.eigenvalues().cwiseAbs(); + for (int j = 0; j < k; j++) { + *(eigenvalues_data + i * k + j) = eigenvalues[j]; + } + } +} + +template +void BatchSVD(const T* x_data, + T* eigenvalues_data, + int batches, + int rows, + int cols, + int k) { + // Eigen::Matrix API need non-const pointer. 
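+  // Each (rows x cols) batch slice is mapped in place (no copy) and decomposed
+  // with Eigen's divide-and-conquer BDCSVD; its k = min(rows, cols) singular
+  // values are written to eigenvalues_data for the later tolerance comparison.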
+ T* input = const_cast(x_data); + int stride = rows * cols; + Eigen::BDCSVD< + Eigen::Matrix> + svd; + for (int i = 0; i < batches; i++) { + auto m = Eigen::Map< + Eigen::Matrix>( + input + i * stride, rows, cols); + svd.compute(m); + auto res_s = svd.singularValues(); + for (int j = 0; j < k; j++) { + eigenvalues_data[i * k + j] = res_s[j]; + } + } +} + +template +void MatrixRankTolKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& atol_tensor, + bool use_default_tol, + bool hermitian, + DenseTensor* out) { + auto* x_data = x.data(); + dev_ctx.template Alloc(out); + auto dim_x = x.dims(); + auto dim_out = out->dims(); + int rows = dim_x[dim_x.size() - 2]; + int cols = dim_x[dim_x.size() - 1]; + int k = std::min(rows, cols); + auto numel = x.numel(); + int batches = numel / (rows * cols); + + T rtol_T = 0; + + if (use_default_tol) { + rtol_T = std::numeric_limits::epsilon() * std::max(rows, cols); + } + + DenseTensor eigenvalue_tensor; + eigenvalue_tensor.Resize(detail::GetEigenvalueDim(dim_x, k)); + auto* eigenvalue_data = dev_ctx.template Alloc(&eigenvalue_tensor); + + if (hermitian) { + BatchEigenvalues(x_data, eigenvalue_data, batches, rows, cols, k); + } else { + BatchSVD(x_data, eigenvalue_data, batches, rows, cols, k); + } + + DenseTensor max_eigenvalue_tensor; + max_eigenvalue_tensor.Resize(detail::RemoveLastDim(eigenvalue_tensor.dims())); + dev_ctx.template Alloc(&max_eigenvalue_tensor); + phi::MaxKernel(dev_ctx, + eigenvalue_tensor, + std::vector{-1}, + false, + &max_eigenvalue_tensor); + + DenseTensor temp_rtol_tensor; + temp_rtol_tensor = + phi::Full(dev_ctx, {1}, static_cast(rtol_T)); + + DenseTensor rtol_tensor = + phi::Multiply(dev_ctx, temp_rtol_tensor, max_eigenvalue_tensor); + + DenseTensor tol_tensor; + tol_tensor.Resize(dim_out); + dev_ctx.template Alloc(&tol_tensor); + funcs::ElementwiseCompute, T, T>( + dev_ctx, + atol_tensor, + rtol_tensor, + -1, + GreaterElementFunctor(), + &tol_tensor); + + tol_tensor.Resize(detail::NewAxisDim(tol_tensor.dims(), 1)); + + DenseTensor compare_result; + compare_result.Resize(detail::NewAxisDim(dim_out, k)); + dev_ctx.template Alloc(&compare_result); + int axis = -1; + if (eigenvalue_tensor.dims().size() >= tol_tensor.dims().size()) { + funcs::ElementwiseCompute, T, int>( + dev_ctx, + eigenvalue_tensor, + tol_tensor, + axis, + funcs::GreaterThanFunctor(), + &compare_result); + } else { + funcs::ElementwiseCompute, T, int>( + dev_ctx, + eigenvalue_tensor, + tol_tensor, + axis, + funcs::LessThanFunctor(), + &compare_result); + } + + phi::SumKernel(dev_ctx, + compare_result, + std::vector{-1}, + compare_result.dtype(), + false, + out); +} +} // namespace phi + +PD_REGISTER_KERNEL( + matrix_rank_tol, CPU, ALL_LAYOUT, phi::MatrixRankTolKernel, float, double) { +} diff --git a/paddle/phi/kernels/cpu/mode_grad_kernel.cc b/paddle/phi/kernels/cpu/mode_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..ca813c1757eacce24ecea8687b7b80bd43c5e8f9 --- /dev/null +++ b/paddle/phi/kernels/cpu/mode_grad_kernel.cc @@ -0,0 +1,170 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/mode_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/mode.h" + +namespace phi { + +template +void ModeGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& indices, + const DenseTensor& out_grad, + int axis, + bool keepdim, + DenseTensor* x_grad) { + auto in_dims = x.dims(); + auto out_dims = indices.dims(); + + // axis < 0, get the real axis + axis = (axis < 0) ? (in_dims.size() + axis) : axis; + + if (!keepdim) { + std::vector tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(out_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(out_dims[i - 1]); + } + out_dims = phi::make_ddim(tmp_out_shape); + } + T* x_grad_data = dev_ctx.template Alloc(x_grad); + + if (axis == in_dims.size() - 1) { + // allocate the memory for the input_grad + // assign the out_grad to input_grad directly + const int64_t input_height = + phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t input_width = in_dims[in_dims.size() - 1]; + + // init the output grad with 0, because some input elements has no grad + memset(x_grad_data, 0, x_grad->numel() * sizeof(T)); + // Assign the output_grad to input_grad + if (keepdim) { + funcs::ModeAssign(input_height, + input_width, + in_dims.size(), + &out_grad, + &indices, + x_grad_data); + } else { + DenseTensor out_grad_tmp; + dev_ctx.template Alloc(&out_grad_tmp); + DenseTensor indices_tmp; + dev_ctx.template Alloc(&indices_tmp); + + phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, &out_grad_tmp); + phi::Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp); + + out_grad_tmp.Resize(out_dims); + indices_tmp.Resize(out_dims); + + funcs::ModeAssign(input_height, + input_width, + in_dims.size(), + &out_grad_tmp, + &indices_tmp, + x_grad_data); + } + } else { + // can not assign grad to input_grad, must do the transpose + std::vector trans_axis; + for (int i = 0; i < axis; i++) { + trans_axis.emplace_back(i); + } + trans_axis.emplace_back(out_dims.size() - 1); + for (int i = axis + 1; i < out_dims.size() - 1; i++) { + trans_axis.emplace_back(i); + } + trans_axis.emplace_back(axis); + DDim trans_shape(out_dims); + DDim trans_in_shape(in_dims); + for (size_t i = 0; i < trans_axis.size(); i++) { + trans_shape[i] = out_dims[trans_axis[i]]; + trans_in_shape[i] = in_dims[trans_axis[i]]; + } + // transpose the out_grad, indices + DenseTensor trans_dO; + trans_dO.Resize(trans_shape); + dev_ctx.template Alloc(&trans_dO); + + DenseTensor trans_ind; + trans_ind.Resize(trans_shape); + dev_ctx.template Alloc(&trans_ind); + + int ndims = trans_axis.size(); + + if (keepdim) { + // Do transpose + funcs::TransCompute( + ndims, dev_ctx, out_grad, &trans_dO, trans_axis); + funcs::TransCompute( + ndims, dev_ctx, indices, &trans_ind, trans_axis); + } else { + DenseTensor out_grad_tmp; + dev_ctx.template Alloc(&out_grad_tmp); + + DenseTensor 
indices_tmp; + dev_ctx.template Alloc(&indices_tmp); + + phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, &out_grad_tmp); + phi::Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp); + out_grad_tmp.Resize(out_dims); + indices_tmp.Resize(out_dims); + // Do transpose + funcs::TransCompute( + ndims, dev_ctx, out_grad_tmp, &trans_dO, trans_axis); + funcs::TransCompute( + ndims, dev_ctx, indices_tmp, &trans_ind, trans_axis); + } + const int64_t input_height = phi::product( + phi::slice_ddim(trans_in_shape, 0, trans_in_shape.size() - 1)); + const int64_t input_width = trans_in_shape[trans_in_shape.size() - 1]; + + // Assign the out_grad to tranpose input_grad + DenseTensor tmp_out; + tmp_out.Resize(trans_in_shape); + T* t_out = dev_ctx.template Alloc(&tmp_out); + memset(t_out, 0, x_grad->numel() * sizeof(T)); + + funcs::ModeAssign(input_height, + input_width, + in_dims.size(), + &trans_dO, + &trans_ind, + t_out); + + // Transpose back + funcs::TransCompute( + ndims, dev_ctx, tmp_out, x_grad, trans_axis); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(mode_grad, + CPU, + ALL_LAYOUT, + phi::ModeGradKernel, + float, + double, + int32_t, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/mode_kernel.cc b/paddle/phi/kernels/cpu/mode_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..6535d1b89af420ee4266981f004983157179f34f --- /dev/null +++ b/paddle/phi/kernels/cpu/mode_kernel.cc @@ -0,0 +1,121 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/mode_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/mode.h" + +namespace phi { + +template +void ModeKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + bool keepdim, + DenseTensor* out, + DenseTensor* indices) { + const auto& in_dims = x.dims(); + auto out_dims = out->dims(); + // axis < 0, cacluate the real axis + if (axis < 0) axis += in_dims.size(); + + T* output_data = dev_ctx.template Alloc(out); + int64_t* indices_data = dev_ctx.template Alloc(indices); + // if axis is not the last dim, transpose it to the last dim, do the + // calculation, then tranpose it back to original axis. 
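+  // For example (hypothetical shapes): with in_dims = [2, 5, 3] and axis = 1,
+  // trans_axis is {0, 2, 1}; the mode is then computed over contiguous rows of
+  // the transposed tensor, and the same swap permutes the result back.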
+ if (axis == in_dims.size() - 1) { + const int64_t& input_height = + phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t& input_width = in_dims[in_dims.size() - 1]; + funcs::GetMode(input_height, + input_width, + in_dims.size(), + &x, + output_data, + indices_data); + } else { + std::vector trans_axis; + for (int i = 0; i < axis; i++) { + trans_axis.emplace_back(i); + } + trans_axis.push_back(in_dims.size() - 1); + for (int i = axis + 1; i < in_dims.size() - 1; i++) { + trans_axis.emplace_back(i); + } + trans_axis.emplace_back(axis); + + if (!keepdim) { + std::vector tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + DDim tmp_out_dim = phi::make_ddim(tmp_out_shape); + out->Resize(tmp_out_dim); + indices->Resize(tmp_out_dim); + } + + // get the trans input_dims, out_dims + DDim trans_shape(in_dims); + DDim trans_out_shape(in_dims); + + for (size_t i = 0; i < trans_axis.size(); i++) { + trans_shape[i] = in_dims[trans_axis[i]]; + trans_out_shape[i] = in_dims[trans_axis[i]]; + } + trans_out_shape[in_dims.size() - 1] = 1; + + DenseTensor trans_input; + trans_input.Resize(trans_shape); + dev_ctx.template Alloc(&trans_input); + int ndims = trans_axis.size(); + + // transpose the input value + funcs::TransCompute( + ndims, dev_ctx, x, &trans_input, trans_axis); + + const int64_t input_height = + phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1)); + const int64_t input_width = trans_shape[trans_shape.size() - 1]; + DenseTensor tmp_out; + tmp_out.Resize(trans_out_shape); + T* t_out = dev_ctx.template Alloc(&tmp_out); + + DenseTensor tmp_indices; + tmp_indices.Resize(trans_out_shape); + int64_t* t_ind = dev_ctx.template Alloc(&tmp_indices); + + funcs::GetMode( + input_height, input_width, in_dims.size(), &trans_input, t_out, t_ind); + // transpose back + funcs::TransCompute( + ndims, dev_ctx, tmp_indices, indices, trans_axis); + funcs::TransCompute( + ndims, dev_ctx, tmp_out, out, trans_axis); + if (!keepdim) { + out->Resize(out_dims); + indices->Resize(out_dims); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + mode, CPU, ALL_LAYOUT, phi::ModeKernel, float, double, int32_t, int64_t) {} diff --git a/paddle/phi/kernels/cpu/multiplex_grad_kernel.cc b/paddle/phi/kernels/cpu/multiplex_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..f5a426e93db2cf23962276632fead69565999d37 --- /dev/null +++ b/paddle/phi/kernels/cpu/multiplex_grad_kernel.cc @@ -0,0 +1,65 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
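+// MultiplexGradKernel zero-fills every provided input gradient and then, for
+// each row i, copies row i of out_grad back into the gradient of the input
+// selected by ids[i].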
+ +#include "paddle/phi/kernels/multiplex_grad_kernel.h" + +#include "paddle/fluid/memory/memcpy.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +void MultiplexGradKernel(const Context& ctx, + const DenseTensor& ids, + const DenseTensor& out_grad, + std::vector ins_grad) { + size_t idx = -1UL; + for (size_t i = 0; i < ins_grad.size(); i++) { + if (ins_grad[i]) { + ctx.template Alloc(ins_grad[i]); + auto t = phi::EigenVector::Flatten(*ins_grad[i]); + t.device(*ctx.eigen_device()) = t.constant(static_cast(0)); + idx = i; + } + } + if (idx == -1UL) return; + + auto rows = ins_grad[idx]->dims()[0]; + auto cols = ins_grad[idx]->numel() / rows; + auto* index = ids.data(); + for (auto i = 0; i < rows; i++) { + size_t k = static_cast(index[i]); + if (ins_grad[k]) { + paddle::memory::Copy(ctx.GetPlace(), + ins_grad[k]->data() + i * cols, + ctx.GetPlace(), + out_grad.data() + i * cols, + cols * sizeof(T)); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(multiplex_grad, + CPU, + ALL_LAYOUT, + phi::MultiplexGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/multiplex_kernel.cc b/paddle/phi/kernels/cpu/multiplex_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..2d9f4c51a981ed8701afe0aa4e7d6a8955f4348c --- /dev/null +++ b/paddle/phi/kernels/cpu/multiplex_kernel.cc @@ -0,0 +1,65 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
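+// MultiplexKernel assembles the output row by row: row i is copied from the
+// candidate tensor ins[ids[i]], with bounds checks on each index value.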
+ +#include "paddle/phi/kernels/multiplex_kernel.h" + +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void MultiplexKernel(const Context& ctx, + const std::vector& ins, + const DenseTensor& ids, + DenseTensor* out) { + ctx.template Alloc(out); + for (size_t i = 0; i < ins.size(); ++i) { + PADDLE_ENFORCE_GT( + ins[i]->numel(), + 0, + errors::OutOfRange( + "indexing will be out of bounds with size 0 for the %d-th input.", + i)); + } + auto rows = ins[0]->dims()[0]; + auto cols = ins[0]->numel() / rows; + auto index = ids.data(); + for (auto i = 0; i < rows; i++) { + int32_t k = index[i]; + PADDLE_ENFORCE_GE( + k, 0, errors::PreconditionNotMet("index must be nonnegative.")); + PADDLE_ENFORCE_LT(static_cast(k), + ins.size(), + errors::PreconditionNotMet( + "index exceeds the number of candidate tensors.")); + paddle::memory::Copy(ctx.GetPlace(), + out->data() + i * cols, + ctx.GetPlace(), + ins[k]->data() + i * cols, + cols * sizeof(T)); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(multiplex, + CPU, + ALL_LAYOUT, + phi::MultiplexKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/fluid/operators/one_hot_v2_op.h b/paddle/phi/kernels/cpu/one_hot_kernel.cc similarity index 50% rename from paddle/fluid/operators/one_hot_v2_op.h rename to paddle/phi/kernels/cpu/one_hot_kernel.cc index 9d42c5875bb6eecd1244ca1a0dd6442985ec2a02..dc58489ebf70eaaa7efba52775f7ac62bb2ef5b2 100644 --- a/paddle/fluid/operators/one_hot_v2_op.h +++ b/paddle/phi/kernels/cpu/one_hot_kernel.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,23 +12,25 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#pragma once -#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/kernels/one_hot_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/math_function.h" -namespace paddle { -namespace operators { +namespace phi { template struct OneHotV2OpFunctor { - const framework::LoDTensor* in_; - framework::LoDTensor* out_; + const DenseTensor* in_; + DenseTensor* out_; int depth_; const DeviceContext& ctx_; bool allow_out_of_range_; - OneHotV2OpFunctor(const framework::LoDTensor* in, framework::LoDTensor* out, - int depth, const DeviceContext& ctx, + OneHotV2OpFunctor(const DenseTensor* in, + DenseTensor* out, + int depth, + const DeviceContext& ctx, bool allow_out_of_range = false) : in_(in), out_(out), @@ -40,8 +42,8 @@ struct OneHotV2OpFunctor { void apply() const { auto* p_in_data = in_->data(); auto numel = in_->numel(); - auto* p_out_data = out_->mutable_data(ctx_.GetPlace()); - phi::funcs::set_constant(ctx_, out_, 0.0); + auto* p_out_data = ctx_.template Alloc(out_); + funcs::set_constant(ctx_, out_, 0.0); if (allow_out_of_range_) { for (int i = 0; i < numel; ++i) { @@ -52,51 +54,46 @@ struct OneHotV2OpFunctor { } else { for (int i = 0; i < numel; ++i) { PADDLE_ENFORCE_GE( - p_in_data[i], 0, - platform::errors::InvalidArgument( + p_in_data[i], + 0, + phi::errors::InvalidArgument( "Illegal index value, Input(input) value should be at least 0, " "but received input (%d) less than 0", p_in_data[i])); PADDLE_ENFORCE_LT( - p_in_data[i], depth_, - platform::errors::InvalidArgument( + p_in_data[i], + depth_, + phi::errors::InvalidArgument( "Illegal index value, Input(input) value should be less than " "Input(depth), " "but received input (%d) not less than depth (%d)", - p_in_data[i], depth_)); + p_in_data[i], + depth_)); *(p_out_data + i * depth_ + p_in_data[i]) = 1.0; } } } }; -using LoDTensor = framework::LoDTensor; -using Tensor = framework::Tensor; -template -class OneHotV2Kernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - int depth = context.Attr("depth"); - bool allow_out_of_range = context.Attr("allow_out_of_range"); - if (context.HasInput("depth_tensor")) { - auto* depth_tensor = context.Input("depth_tensor"); - auto* depth_data = depth_tensor->data(); - depth = depth_data[0]; - auto out_dims = out->dims(); - out_dims[out_dims.size() - 1] = depth; - out->Resize(out_dims); - } - - framework::VisitDataType( - static_cast( - context.Attr("dtype")), - OneHotV2OpFunctor( - in, out, depth, context.template device_context(), - allow_out_of_range)); +template +void OneHotRawKernel(const Context& dev_ctx, + const DenseTensor& x, + int32_t depth, + DataType dtype, + bool allow_out_of_range, + DenseTensor* out) { + auto out_dims = out->dims(); + if (out_dims[out_dims.size() - 1] == -1) { + out_dims[out_dims.size() - 1] = depth; + out->Resize(out_dims); } -}; -} // namespace operators -} // namespace paddle + phi::VisitDataType(dtype, + OneHotV2OpFunctor( + &x, out, depth, dev_ctx, allow_out_of_range)); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + one_hot_raw, CPU, ALL_LAYOUT, phi::OneHotRawKernel, int, int64_t) {} diff --git a/paddle/phi/kernels/cpu/pool_grad_kernel.cc b/paddle/phi/kernels/cpu/pool_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..bb97694d8fc38d92f5290894a2c45dd21e7b1717 --- /dev/null +++ 
b/paddle/phi/kernels/cpu/pool_grad_kernel.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/pool_grad_kernel.h" + +#include "paddle/phi/kernels/impl/pool_grad_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL( + pool2d_grad, CPU, ALL_LAYOUT, phi::Pool2dGradKernel, float, double) {} +PD_REGISTER_KERNEL(pool2d_double_grad, + CPU, + ALL_LAYOUT, + phi::Pool2dDoubleGradKernel, + float, + double) {} +PD_REGISTER_KERNEL(max_pool2d_with_index_grad, + CPU, + ALL_LAYOUT, + phi::MaxPool2dWithIndexGradKernel, + float, + double) { + kernel->InputAt(1).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} + +PD_REGISTER_KERNEL( + pool3d_grad, CPU, ALL_LAYOUT, phi::Pool3dGradKernel, float, double) {} +PD_REGISTER_KERNEL(max_pool3d_with_index_grad, + CPU, + ALL_LAYOUT, + phi::MaxPool3dWithIndexGradKernel, + float, + double) { + kernel->InputAt(1).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} diff --git a/paddle/phi/kernels/cpu/pool_kernel.cc b/paddle/phi/kernels/cpu/pool_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..1d57e282c3c8ae85573bf11eff43e6551a808ea0 --- /dev/null +++ b/paddle/phi/kernels/cpu/pool_kernel.cc @@ -0,0 +1,41 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
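+// This file only registers the CPU pooling forward kernels; the shared
+// implementations live in paddle/phi/kernels/impl/pool_kernel_impl.h.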
+ +#include "paddle/phi/kernels/pool_kernel.h" + +#include "paddle/phi/kernels/impl/pool_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(pool2d, CPU, ALL_LAYOUT, phi::Pool2dKernel, float, double) {} +PD_REGISTER_KERNEL(max_pool2d_with_index, + CPU, + ALL_LAYOUT, + phi::MaxPool2dWithIndexKernel, + float, + double) { + kernel->OutputAt(1).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} + +PD_REGISTER_KERNEL(pool3d, CPU, ALL_LAYOUT, phi::Pool3dKernel, float, double) {} +PD_REGISTER_KERNEL(max_pool3d_with_index, + CPU, + ALL_LAYOUT, + phi::MaxPool3dWithIndexKernel, + float, + double) { + kernel->OutputAt(1).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} diff --git a/paddle/phi/kernels/cpu/prelu_grad_kernel.cc b/paddle/phi/kernels/cpu/prelu_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..97558cdb31f666fd7c5dd8b15e1d7feef6556a0b --- /dev/null +++ b/paddle/phi/kernels/cpu/prelu_grad_kernel.cc @@ -0,0 +1,119 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/prelu_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void PReluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& alpha, + const DenseTensor& out_grad, + const std::string& mode, + const std::string& data_format, + DenseTensor* x_grad, + DenseTensor* alpha_grad) { + const T* alpha_ptr = alpha.data(); + const T* x_ptr = x.data(); + const T* out_grad_ptr = out_grad.data(); + int numel = x.numel(); + auto dim = x.dims(); + int index = 0; + int i = 0; + if (x_grad) { + T* x_grad_ptr = dev_ctx.template Alloc(x_grad); + if (mode == "channel") { + if (data_format == "NCHW") { + int temp = 1; + for (int j = 2; j < dim.size(); j++) { + temp *= dim[j]; + } + for (i = 0; i < numel; i++) { + index = (i / temp) % dim[1]; + x_grad_ptr[i] = x_ptr[i] > 0 ? out_grad_ptr[i] + : alpha_ptr[index] * out_grad_ptr[i]; + } + } else { + for (i = 0; i < numel; i++) { + index = i % dim[dim.size() - 1]; + x_grad_ptr[i] = x_ptr[i] > 0 ? out_grad_ptr[i] + : alpha_ptr[index] * out_grad_ptr[i]; + } + } + } else if (mode == "element") { + int temp = 1; + for (int j = 1; j < dim.size(); j++) { + temp *= dim[j]; + } + for (i = 0; i < numel; i++) { + index = i % temp; + x_grad_ptr[i] = + x_ptr[i] > 0 ? out_grad_ptr[i] : alpha_ptr[index] * out_grad_ptr[i]; + } + } else { + for (i = 0; i < numel; i++) { + x_grad_ptr[i] = + x_ptr[i] > 0 ? 
out_grad_ptr[i] : alpha_ptr[0] * out_grad_ptr[i]; + } + } + } + + index = 0; + if (alpha_grad) { + T* alpha_grad_ptr = dev_ctx.template Alloc(alpha_grad); + memset(alpha_grad_ptr, 0, sizeof(T) * alpha_grad->numel()); + + if (mode == "channel") { + if (data_format == "NCHW") { + int temp = 1; + for (int j = 2; j < dim.size(); j++) { + temp *= dim[j]; + } + for (i = 0; i < numel; i++) { + index = (i / temp) % dim[1]; + alpha_grad_ptr[index] += + x_ptr[i] > 0 ? 0 : x_ptr[i] * out_grad_ptr[i]; + } + } else { + for (i = 0; i < numel; i++) { + index = i % dim[dim.size() - 1]; + alpha_grad_ptr[index] += + x_ptr[i] > 0 ? 0 : x_ptr[i] * out_grad_ptr[i]; + } + } + } else if (mode == "element") { + int temp = 1; + for (int j = 1; j < dim.size(); j++) { + temp *= dim[j]; + } + for (i = 0; i < numel; i++) { + index = i % temp; + alpha_grad_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * out_grad_ptr[i]; + } + } else { + for (i = 0; i < numel; i++) { + alpha_grad_ptr[0] += x_ptr[i] > 0 ? 0 : x_ptr[i] * out_grad_ptr[i]; + } + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + prelu_grad, CPU, ALL_LAYOUT, phi::PReluGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/prelu_kernel.cc b/paddle/phi/kernels/cpu/prelu_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..8f389ab9ff459d1935518f35e7884d144bec5020 --- /dev/null +++ b/paddle/phi/kernels/cpu/prelu_kernel.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/prelu_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void PReluKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& alpha, + const std::string& mode, + const std::string& data_format, + DenseTensor* out) { + const T* x_ptr = x.data(); + const T* alpha_ptr = alpha.data(); + T* o_ptr = dev_ctx.template Alloc(out); + + int numel = x.numel(); + auto dim = x.dims(); + int index = 0; + int i = 0; + if (mode == "channel") { + if (data_format == "NCHW") { + int temp = 1; + for (int j = 2; j < dim.size(); j++) { + temp *= dim[j]; + } + for (i = 0; i < numel; i++) { + index = (i / temp) % dim[1]; + o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; + } + } else { + for (i = 0; i < numel; i++) { + index = i % dim[dim.size() - 1]; + o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; + } + } + } else if (mode == "element") { + int temp = 1; + for (int j = 1; j < dim.size(); j++) { + temp *= dim[j]; + } + for (i = 0; i < numel; i++) { + index = i % temp; + o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; + } + } else { + for (i = 0; i < numel; i++) { + o_ptr[i] = x_ptr[i] > 0 ? 
x_ptr[i] : alpha_ptr[0] * x_ptr[i]; + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(prelu, CPU, ALL_LAYOUT, phi::PReluKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..fbed3f1cb133ada68b90a5283fc182373488c565 --- /dev/null +++ b/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc @@ -0,0 +1,140 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/psroi_pool_grad_kernel.h" + +#include +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void PsroiPoolGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& rois, + paddle::optional rois_num, + const DenseTensor& dout, + int pooled_height, + int pooled_width, + int output_channels, + float spatial_scale, + DenseTensor* dx) { + if (dx) { + auto in_dims = x.dims(); + int input_channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int rois_num_t = rois.dims()[0]; + + // set roi batch id + DenseTensor rois_batch_id_list; + rois_batch_id_list.Resize({rois_num_t}); + int* rois_batch_id_data = ctx.template Alloc(&rois_batch_id_list); + int rois_batch_size; + if (rois_num.get_ptr()) { + rois_batch_size = rois_num->numel(); + auto* rois_num_t_data = rois_num->data(); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_t_data[n]; ++i) { + rois_batch_id_data[i] = n; + } + start += rois_num_t_data[n]; + } + } else { + auto rois_lod = rois.lod().back(); + rois_batch_size = rois_lod.size() - 1; + // calculate batch id index for each roi according to LoD + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } + } + } + const T* input_rois = rois.data(); + const T* dout_data = dout.data(); + T* dx_data = ctx.template Alloc(dx); + + // set gradient of X to be 0. before backpropagate. 
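+    // Each output bin reads from exactly one position-sensitive input channel,
+    // so in the loop below its gradient is divided by the bin area and
+    // accumulated uniformly over that channel's bin region.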
+ funcs::SetConstant set_zero; + set_zero(ctx, dx, static_cast(0)); + + // backpropagate gradient per output pixel + int dout_size = dout.numel(); + for (int i = 0; i < dout_size; ++i) { + // The output is in order (n, c, ph, pw) + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % output_channels; + int n = i / pooled_width / pooled_height / output_channels; + + // set roi_batch_id + int roi_batch_id = rois_batch_id_data[n]; + int input_channel = (c * pooled_height + ph) * pooled_width + pw; + int input_offset = + (roi_batch_id * input_channels + input_channel) * height * width; + T* offset_dx_data = dx_data + input_offset; + + // [start, end) interval for spatial sampling + const T* offset_input_rois = input_rois + n * 4; + T roi_start_w = + static_cast(round(offset_input_rois[0])) * spatial_scale; + T roi_start_h = + static_cast(round(offset_input_rois[1])) * spatial_scale; + T roi_end_w = + static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; + T roi_end_h = + static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; + + // Force too small ROIs to be 1x1 + T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 + T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1); + + // Compute w and h at input feature map + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); + int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); + int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); + int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); + + // Add roi offsets and clip to input boundaries + hstart = std::min(std::max(hstart, 0), height); + hend = std::min(std::max(hend, 0), height); + wstart = std::min(std::max(wstart, 0), width); + wend = std::min(std::max(wend, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + // Accumulate diff_val into input data + T bin_area = static_cast((hend - hstart) * (wend - wstart)); + T diff_val = is_empty ? 0. : dout_data[i] / bin_area; + for (int ih = hstart; ih < hend; ++ih) { + for (int iw = wstart; iw < wend; ++iw) { + int input_index = ih * width + iw; + offset_dx_data[input_index] += diff_val; + } + } + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + psroi_pool_grad, CPU, ALL_LAYOUT, phi::PsroiPoolGradKernel, float, double) { + kernel->InputAt(2).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} diff --git a/paddle/phi/kernels/cpu/psroi_pool_kernel.cc b/paddle/phi/kernels/cpu/psroi_pool_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..06cd03395d9656614995ef0ad91dad04b27717bf --- /dev/null +++ b/paddle/phi/kernels/cpu/psroi_pool_kernel.cc @@ -0,0 +1,174 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
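+// PsroiPoolKernel implements position-sensitive ROI pooling on CPU: each
+// output channel and bin (c, ph, pw) averages a dedicated input channel over
+// the bin's spatial region of the corresponding ROI.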
+ +#include "paddle/phi/kernels/psroi_pool_kernel.h" + +#include +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void PsroiPoolKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& rois, + paddle::optional rois_num, + int pooled_height, + int pooled_width, + int output_channels, + float spatial_scale, + DenseTensor* out) { + auto in_dims = x.dims(); + int batch_size = in_dims[0]; + int input_channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int rois_num_t = rois.dims()[0]; + + PADDLE_ENFORCE_EQ(input_channels, + output_channels * pooled_height * pooled_width, + errors::InvalidArgument( + "the channels of input " + "X should equal the product of " + "output_channels x pooled_height x pooled_width")); + + auto in_stride = stride(in_dims); + auto out_stride = stride(out->dims()); + + const T* input_data = x.data(); + + DenseTensor rois_batch_id_list; + rois_batch_id_list.Resize({rois_num_t}); + int* rois_batch_id_data = ctx.template Alloc(&rois_batch_id_list); + + int rois_batch_size; + if (rois_num.get_ptr()) { + rois_batch_size = rois_num->numel(); + auto* rois_num_data = rois_num->data(); + PADDLE_ENFORCE_EQ( + rois_batch_size, + batch_size, + errors::InvalidArgument( + "The batch size of rois and the batch size of images " + " must be the same. But received the batch size of rois is %d, " + "and the batch size of images is %d", + rois_batch_size, + batch_size)); + int rois_num_count = 0; + for (int i = 0; i < rois_batch_size; ++i) { + rois_num_count += rois_num_data[i]; + } + PADDLE_ENFORCE_EQ( + rois_num_count, + rois_num_t, + errors::InvalidArgument( + "the rois_num from input and RoisNum must be the same")); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_data[n]; ++i) { + rois_batch_id_data[i] = n; + } + start += rois_num_data[n]; + } + } else { + auto rois_lod = rois.lod().back(); + rois_batch_size = rois_lod.size() - 1; + PADDLE_ENFORCE_EQ( + rois_batch_size, + batch_size, + errors::InvalidArgument("the rois_batch_size and input(X) " + "batch_size should be the same.")); + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_ENFORCE_EQ(rois_num_with_lod, + rois_num_t, + errors::InvalidArgument( + "the rois_num from input and lod must be the same")); + // calculate batch id index for each roi according to LoD + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } + } + } + T* output_data = ctx.template Alloc(out); + const T* input_rois = rois.data(); + + // calculate psroipooling, parallel processing can be implemented per ROI + for (int n = 0; n < rois_num_t; ++n) { + // set roi batch id + int roi_batch_id = rois_batch_id_data[n]; + + // [start, end) interval for spatial sampling + const T* offset_input_rois = input_rois + n * 4; + T roi_start_w = static_cast(round(offset_input_rois[0])) * spatial_scale; + T roi_start_h = static_cast(round(offset_input_rois[1])) * spatial_scale; + T roi_end_w = + static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; + T roi_end_h = + static_cast(round(offset_input_rois[3]) + 1.) 
* spatial_scale; + // Force too small rois to be 1 x 1 + T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 + T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1); + + // Compute bin size w and h at input feature map + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + // calculate each pixel of the output feature map. + int out_roi_offset = n * out_stride[0]; + for (int c = 0; c < output_channels; ++c) { + // per category + int out_plane_offset = out_roi_offset + c * out_stride[1]; + for (int ph = 0; ph < pooled_height; ++ph) { + int out_row_offset = out_plane_offset + ph * out_stride[2]; + for (int pw = 0; pw < pooled_width; ++pw) { + // calculate w and h at input feature map + int hstart = floor(static_cast(ph) * bin_size_h + roi_start_h); + int wstart = floor(static_cast(pw) * bin_size_w + roi_start_w); + int hend = ceil(static_cast(ph + 1) * bin_size_h + roi_start_h); + int wend = ceil(static_cast(pw + 1) * bin_size_w + roi_start_w); + // Add roi offsets and clip to input boundaries + hstart = std::min(std::max(hstart, 0), height); + wstart = std::min(std::max(wstart, 0), width); + hend = std::min(std::max(hend, 0), height); + wend = std::min(std::max(wend, 0), width); + + int output_index = out_row_offset + pw; + int input_channel = (c * pooled_height + ph) * pooled_width + pw; + int input_plane_offset = + roi_batch_id * in_stride[0] + input_channel * in_stride[1]; + const T* offset_input_data = input_data + input_plane_offset; + T out_sum = 0.; + bool is_empty = (hend <= hstart) || (wend <= wstart); + for (int ih = hstart; ih < hend; ++ih) { + for (int iw = wstart; iw < wend; ++iw) { + int input_index = ih * in_stride[2] + iw; + out_sum += offset_input_data[input_index]; + } + } + T bin_area = (hend - hstart) * (wend - wstart); + output_data[output_index] = is_empty ? 0. : out_sum / bin_area; + } + } + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + psroi_pool, CPU, ALL_LAYOUT, phi::PsroiPoolKernel, float, double) { + kernel->InputAt(2).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} diff --git a/paddle/phi/kernels/cpu/qr_kernel.cc b/paddle/phi/kernels/cpu/qr_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..e2e32567441ae8ff5315856e3f9132c9553f6d62 --- /dev/null +++ b/paddle/phi/kernels/cpu/qr_kernel.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
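+// CPU QR decomposition backed by Eigen::HouseholderQR. The string mode
+// ("reduced", "complete", or "r") controls whether Q is materialized and
+// whether the reduced-size factors are returned; see ParseQrMode below.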
+ +#include + +#include "paddle/phi/kernels/qr_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" + +namespace phi { + +static inline std::tuple ParseQrMode(const std::string& mode) { + bool compute_q; + bool reduced; + if (mode == "reduced") { + compute_q = true; + reduced = true; + } else if (mode == "complete") { + compute_q = true; + reduced = false; + } else if (mode == "r") { + compute_q = false; + reduced = true; + } else { + PADDLE_THROW(errors::InvalidArgument( + "QR received unrecognized mode '%s'" + " but expected one of 'reduced' (default), 'r', or 'complete'", + mode)); + } + return std::make_tuple(compute_q, reduced); +} + +template +void QrKernel(const Context& ctx, + const DenseTensor& x, + const std::string& mode, + DenseTensor* q, + DenseTensor* r) { + bool compute_q; + bool reduced_mode; + std::tie(compute_q, reduced_mode) = ParseQrMode(mode); + auto numel = x.numel(); + PADDLE_ENFORCE_GT( + numel, 0, errors::PreconditionNotMet("The input of QR is empty.")); + auto x_dims = x.dims(); + int x_rank = x_dims.size(); + int m = x_dims[x_rank - 2]; + int n = x_dims[x_rank - 1]; + int min_mn = std::min(m, n); + int k = reduced_mode ? min_mn : m; + int batch_size = numel / (m * n); + int x_stride = m * n; + int q_stride = m * k; + int r_stride = k * n; + auto* x_data = x.data>(); + T* q_data = nullptr; + if (compute_q) { + q_data = ctx.template Alloc>( + q, batch_size * m * k * sizeof(phi::dtype::Real)); + } + auto* r_data = ctx.template Alloc>( + r, batch_size * k * n * sizeof(phi::dtype::Real)); + + // Implement QR by calling Eigen + for (int i = 0; i < batch_size; ++i) { + const T* x_matrix_ptr = x_data + i * x_stride; + T* r_matrix_ptr = r_data + i * r_stride; + using EigenDynamicMatrix = + Eigen::Matrix; + auto x_matrix = Eigen::Map(x_matrix_ptr, m, n); + Eigen::HouseholderQR qr(x_matrix); + if (reduced_mode) { + auto qr_top_matrix = qr.matrixQR().block(0, 0, min_mn, n); + auto r_matrix_view = + qr_top_matrix.template triangularView(); + auto r_matrix = EigenDynamicMatrix(r_matrix_view); + memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T)); + } else { + auto r_matrix_view = + qr.matrixQR().template triangularView(); + auto r_matrix = EigenDynamicMatrix(r_matrix_view); + memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T)); + } + + if (compute_q) { + T* q_matrix_ptr = q_data + i * q_stride; + if (reduced_mode) { + auto q_matrix = + qr.householderQ() * EigenDynamicMatrix::Identity(m, min_mn); + q_matrix.transposeInPlace(); + memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T)); + } else { + auto q_matrix = qr.householderQ() * EigenDynamicMatrix::Identity(m, m); + q_matrix.transposeInPlace(); + memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T)); + } + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(qr, CPU, ALL_LAYOUT, phi::QrKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/reduce.h b/paddle/phi/kernels/cpu/reduce.h index 4e268d40038cfb56b1e772e14b0ed7699f9700dd..af67bdf5d624f33fd4ec06db425ec8312b490642 100644 --- a/paddle/phi/kernels/cpu/reduce.h +++ b/paddle/phi/kernels/cpu/reduce.h @@ -239,4 +239,29 @@ void Reduce(const DeviceContext& dev_ctx, } } +template +void BoolReduceKernel(const DeviceContext& dev_ctx, + const phi::DenseTensor& input, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + phi::DenseTensor* output) { + dev_ctx.template Alloc(output); + + // The dims has 
full dim, set the reduce_all is True + const auto& input_dim_size = input.dims().size(); + std::set dims_set(dims.begin(), dims.end()); + bool full_dim = true; + for (auto i = 0; i < input_dim_size; i++) { + if (dims_set.find(i) == dims_set.end()) { + full_dim = false; + break; + } + } + reduce_all = (reduce_all || full_dim); + + ReduceKernelImpl( + dev_ctx, input, output, dims, keep_dim, reduce_all); +} + } // namespace phi diff --git a/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc b/paddle/phi/kernels/cpu/reduce_grad_kernel.cc similarity index 53% rename from paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc rename to paddle/phi/kernels/cpu/reduce_grad_kernel.cc index efea054555e86be79b5cdb09fe8c4784a1ad0c3b..78a7ae8d415b5d4b18fdf8e469576db50f739e38 100644 --- a/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/reduce_grad_kernel.cc @@ -12,33 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/reduce_sum_grad_kernel.h" +#include "paddle/phi/kernels/reduce_grad_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cast_kernel.h" -#include "paddle/phi/kernels/cpu/reduce_grad.h" #include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" +#include "paddle/phi/kernels/impl/reduce_grad.h" +#include "paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h" +#include "paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h" +#include "paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h" namespace phi { -struct SumGradFunctor { - template - void operator()(const DeviceContext& place, - X* x, - Y* y, - DX* dx, - DY* dy, - const Dim& dim, - int size) { - dx->device(place) = dy->broadcast(dim); - } -}; - template void ComputeFromInput(const Context& dev_ctx, const DenseTensor& x, @@ -111,16 +97,38 @@ void ReduceSumGradKernel(const Context& dev_ctx, } } - ReduceGradKernel(dev_ctx, - x, - out_grad, - paddle::none, - dims, - keep_dim, - reduce_all, - in_dtype, - out_dtype, - x_grad); + ReduceGradKernel(dev_ctx, + x, + out_grad, + paddle::none, + dims, + keep_dim, + reduce_all, + in_dtype, + out_dtype, + x_grad); +} + +template +void ReduceMeanGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad) { + ReduceGradKernel(dev_ctx, + x, + out_grad, + paddle::none, + dims, + keep_dim, + reduce_all, + in_dtype, + out_dtype, + x_grad); } } // namespace phi @@ -137,3 +145,38 @@ PD_REGISTER_KERNEL(sum_grad, int64_t, phi::dtype::complex, phi::dtype::complex) {} + +PD_REGISTER_KERNEL(mean_grad, + CPU, + ALL_LAYOUT, + phi::ReduceMeanGradKernel, + bool, + float, + double) {} + +PD_REGISTER_KERNEL(prod_grad, + CPU, + ALL_LAYOUT, + phi::ReduceProdGradKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL(max_grad, + CPU, + ALL_LAYOUT, + phi::ReduceMaxGradKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL(min_grad, + CPU, + ALL_LAYOUT, + phi::ReduceMinGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/reduce_kernel.cc b/paddle/phi/kernels/cpu/reduce_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..bc99e2cb39a6976943ba8fa77f7816c8f5e9b284 --- /dev/null +++ b/paddle/phi/kernels/cpu/reduce_kernel.cc @@ -0,0 +1,145 @@ +// 
Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/reduce.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" + +namespace phi { + +template +void MeanRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void SumRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType out_dtype, + DenseTensor* out) { + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void ProdRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void MaxRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void MinRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void AllRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + phi::BoolReduceKernel( + dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +template +void AnyRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + phi::BoolReduceKernel( + dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +} // namespace phi + +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +PD_REGISTER_KERNEL(sum_raw, + CPU, + ALL_LAYOUT, + phi::SumRawKernel, + bool, + float, + double, + phi::dtype::float16, + int16_t, + int, + int64_t, + complex64, + complex128) { + kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); +} +PD_REGISTER_KERNEL( + mean_raw, CPU, ALL_LAYOUT, phi::MeanRawKernel, float, double, bool) {} + +PD_REGISTER_KERNEL(prod_raw, + CPU, + ALL_LAYOUT, + phi::ProdRawKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL( + max_raw, CPU, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {} + +PD_REGISTER_KERNEL( + min_raw, CPU, ALL_LAYOUT, phi::MinRawKernel, float, double, int, int64_t) {} + +PD_REGISTER_KERNEL(all_raw, CPU, ALL_LAYOUT, 
phi::AllRawKernel, bool) {} +PD_REGISTER_KERNEL(any_raw, CPU, ALL_LAYOUT, phi::AnyRawKernel, bool) {} diff --git a/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc b/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..a91b8b6c1fcd3306521fb7cbc26d8c7adaf2d4f8 --- /dev/null +++ b/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc @@ -0,0 +1,203 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roi_align_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void bilinear_interpolate_gradient(const int height, + const int width, + T y, + T x, + const T out_grad_this_bin, + const T count, + T* batch_grad_data) { + int x_low, y_low, x_high, y_high; + T w1, w2, w3, w4; + if (y < -1.0 || y > height || x < -1.0 || x > width) { + w1 = w2 = w3 = w4 = 0; + x_low = x_high = y_low = y_high = -1; + return; + } + y = y <= 0 ? 0 : y; + x = x <= 0 ? 0 : x; + y_low = static_cast(y); + x_low = static_cast(x); + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = static_cast(y_low); + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = static_cast(x_low); + } else { + x_high = x_low + 1; + } + + T ly = y - y_low, lx = x - x_low; + T hy = 1. - ly, hx = 1. 
- lx; + w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + T diff1 = out_grad_this_bin * w1 / count; + T diff2 = out_grad_this_bin * w2 / count; + T diff3 = out_grad_this_bin * w3 / count; + T diff4 = out_grad_this_bin * w4 / count; + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + *(batch_grad_data + y_low * width + x_low) += diff1; + *(batch_grad_data + y_low * width + x_high) += diff2; + *(batch_grad_data + y_high * width + x_low) += diff3; + *(batch_grad_data + y_high * width + x_high) += diff4; + } +} + +template +void RoiAlignGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + const DenseTensor& out_grad, + int pooled_height, + int pooled_width, + float spatial_scale, + int sampling_ratio, + bool aligned, + DenseTensor* dx) { + auto in_dims = x.dims(); + int channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int rois_num = boxes.dims()[0]; + + if (!dx) { + return; + } + + DenseTensor roi_batch_id_list = Empty(dev_ctx, {rois_num}); + int* box_batch_id_data = roi_batch_id_list.data(); + + int boxes_batch_size; + if (boxes_num) { + boxes_batch_size = boxes_num->numel(); + auto* boxes_num_data = boxes_num->data(); + int start = 0; + for (int n = 0; n < boxes_batch_size; ++n) { + for (int i = start; i < start + boxes_num_data[n]; ++i) { + box_batch_id_data[i] = n; + } + start += boxes_num_data[n]; + } + } else { + auto boxes_lod = boxes.lod().back(); + boxes_batch_size = boxes_lod.size() - 1; + for (int n = 0; n < boxes_batch_size; ++n) { + for (std::size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) { + box_batch_id_data[i] = n; + } + } + } + dev_ctx.template Alloc(dx); + + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, dx, static_cast(0)); + + int output_grad_size = out_grad.numel(); + + if ((!out_grad.IsInitialized()) || (output_grad_size <= 0)) { + return; + } + + const T* boxes_data = boxes.data(); + const T* out_grad_data = out_grad.data(); + T* dx_data = dev_ctx.template Alloc(dx); + + auto in_stride = phi::stride(x.dims()); + auto roi_stride = phi::stride(boxes.dims()); + auto out_stride = phi::stride(out_grad.dims()); + + T roi_offset = aligned ? T(0.5) : 0; + for (int n = 0; n < rois_num; ++n) { + int box_batch_idx = box_batch_id_data[n]; + T roi_xmin = boxes_data[0] * spatial_scale - roi_offset; + T roi_ymin = boxes_data[1] * spatial_scale - roi_offset; + T roi_xmax = boxes_data[2] * spatial_scale - roi_offset; + T roi_ymax = boxes_data[3] * spatial_scale - roi_offset; + + T roi_width = roi_xmax - roi_xmin; + T roi_height = roi_ymax - roi_ymin; + roi_width = std::max(roi_width, static_cast(1.)); + roi_height = std::max(roi_height, static_cast(1.)); + if (!aligned) { + roi_width = std::max(roi_width, static_cast(1.)); + roi_height = std::max(roi_height, static_cast(1.)); + } + + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + for (int c = 0; c < channels; ++c) { + T* batch_grad_data = + dx_data + box_batch_idx * in_stride[0] + c * in_stride[1]; + const T* batch_out_grad_data = + out_grad_data + n * out_stride[0] + c * out_stride[1]; + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int pool_index = ph * pooled_width + pw; + T out_grad_this_bin = batch_out_grad_data[pool_index]; + int roi_bin_grid_h = (sampling_ratio > 0) + ? 
sampling_ratio + : ceil(roi_height / pooled_height); + int roi_bin_grid_w = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_width / pooled_width); + T count = roi_bin_grid_h * roi_bin_grid_w; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T y = roi_ymin + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_xmin + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + bilinear_interpolate_gradient(height, + width, + y, + x, + out_grad_this_bin, + count, + batch_grad_data); + } + } + } + } + } + boxes_data += roi_stride[0]; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(roi_align_grad, + CPU, + ALL_LAYOUT, + phi::RoiAlignGradKernel, + float, + double, + int) {} diff --git a/paddle/phi/kernels/cpu/roi_align_kernel.cc b/paddle/phi/kernels/cpu/roi_align_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..4752a9b3a48fdcce5f3211a7aadca663fb44aa05 --- /dev/null +++ b/paddle/phi/kernels/cpu/roi_align_kernel.cc @@ -0,0 +1,318 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roi_align_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" + +namespace phi { + +constexpr size_t GetOffset(size_t x, size_t y, size_t width) { + return y * width + x; +} + +template +struct OffsetsAndRatios { + OffsetsAndRatios() = default; + OffsetsAndRatios(std::size_t xy, + std::size_t xY, + std::size_t Xy, + std::size_t XY, + T xy_ratio, + T xY_ratio, + T Xy_ratio, + T XY_ratio) + : xy(xy), + xY(xY), + Xy(Xy), + XY(XY), + xy_ratio(xy_ratio), + xY_ratio(xY_ratio), + Xy_ratio(Xy_ratio), + XY_ratio(XY_ratio) {} + + std::size_t xy = 0; + std::size_t xY = 0; + std::size_t Xy = 0; + std::size_t XY = 0; + T xy_ratio = 0.0f; + T xY_ratio = 0.0f; + T Xy_ratio = 0.0f; + T XY_ratio = 0.0f; +}; + +template +std::vector> GetIndexesAndRatios( + std::size_t width, + std::size_t height, + const T roi_width, + const T roi_height, + const T roi_xmin, + const T roi_ymin, + std::size_t pooled_width, + std::size_t roi_bin_grid_w, + std::size_t pooled_height, + std::size_t roi_bin_grid_h) { + const auto ind_num = + pooled_width * roi_bin_grid_w * pooled_height * roi_bin_grid_h; + + std::vector> interpolation_cords; + interpolation_cords.reserve(ind_num); + + const auto bin_w = roi_width / pooled_width; + const auto bin_h = roi_height / pooled_height; + + for (std::size_t py = 0; py < pooled_height; py++) { + for (std::size_t px = 0; px < pooled_width; px++) { + for (std::size_t iy = 0; iy < roi_bin_grid_h; iy++) { + // calculate x of sample points + auto y = + roi_ymin + + bin_h * (py + + static_cast(iy + .5f) / static_cast(roi_bin_grid_h)); + for (std::size_t ix = 0; ix < roi_bin_grid_w; ix++) { + // calculate x of sample points + auto x = roi_xmin + + bin_w 
* (px + + static_cast(ix + .5f) / + static_cast(roi_bin_grid_w)); + + // deal with elements out of map + if (y < -1.0 || y > height || x < -1.0 || x > width) { + interpolation_cords.emplace_back(); + continue; + } + y = y <= 0 ? 0 : y; + x = x <= 0 ? 0 : x; + + std::size_t x_low_index = static_cast(x); + std::size_t x_high_index; + if (x_low_index >= width - 1) { + x_high_index = x_low_index = width - 1; + x = static_cast(x_low_index); + } else { + x_high_index = x_low_index + 1; + } + T x_ratio = x_high_index - x; + + std::size_t y_low_index = static_cast(y); + std::size_t y_high_index; + if (y_low_index >= height - 1) { + y_high_index = y_low_index = height - 1; + y = static_cast(y_low_index); + } else { + y_high_index = y_low_index + 1; + } + T y_ratio = y_high_index - y; + + auto xy = GetOffset(x_low_index, y_low_index, width); + auto xY = GetOffset(x_low_index, y_high_index, width); + auto Xy = GetOffset(x_high_index, y_low_index, width); + auto XY = GetOffset(x_high_index, y_high_index, width); + + auto xy_ratio = x_ratio * y_ratio; + auto xY_ratio = x_ratio * (1 - y_ratio); + auto Xy_ratio = (1 - x_ratio) * y_ratio; + auto XY_ratio = (1 - x_ratio) * (1 - y_ratio); + + interpolation_cords.emplace_back( + xy, xY, Xy, XY, xy_ratio, xY_ratio, Xy_ratio, XY_ratio); + } + } + } + } + return interpolation_cords; +} + +template +void Interpolate(std::vector& interpolated_values, // NOLINT + const std::vector>& interpolation_cords, + const T* data) { + for (auto& ic : interpolation_cords) { + auto xlyl_offset = ic.xy; + auto xhyl_offset = ic.Xy; + auto xlyh_offset = ic.xY; + auto xhyh_offset = ic.XY; + + auto xlyl_ratio = ic.xy_ratio; + auto xhyl_ratio = ic.Xy_ratio; + auto xlyh_ratio = ic.xY_ratio; + auto xhyh_ratio = ic.XY_ratio; + + interpolated_values.emplace_back( + xlyl_ratio * data[xlyl_offset] + xhyl_ratio * data[xhyl_offset] + + xlyh_ratio * data[xlyh_offset] + xhyh_ratio * data[xhyh_offset]); + } +} + +template +void AvgPool(const std::vector& interpolated_values, + T* output_data, + int roi_bin_grid_w, + int roi_bin_grid_h, + int pooled_width, + int pooled_height) { + const auto data_amount = pooled_width * pooled_height; + const auto grid_points = roi_bin_grid_w * roi_bin_grid_h; + const T count = 1.0 / grid_points; + auto val_begin = interpolated_values.cbegin(); + for (auto i = 0; i < data_amount; ++i) { + T sum = 0.0; + auto val_end = val_begin + grid_points; + sum = std::accumulate(val_begin, val_end, sum); + val_begin = val_end; + output_data[i] = sum * count; + } +} + +template +void RoiAlignKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + int sampling_ratio, + bool aligned, + DenseTensor* out) { + auto in_dims = x.dims(); + int batch_size = in_dims[0]; + int channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int rois_num = boxes.dims()[0]; + + auto in_stride = phi::stride(in_dims); + auto roi_stride = phi::stride(boxes.dims()); + auto out_stride = phi::stride(out->dims()); + + const T* input_data = x.data(); + DenseTensor roi_batch_id_list = Empty(dev_ctx, {rois_num}); + int* roi_batch_id_data = roi_batch_id_list.data(); + int boxes_batch_size; + if (boxes_num) { + boxes_batch_size = boxes_num->numel(); + PADDLE_ENFORCE_EQ( + boxes_batch_size, + batch_size, + errors::InvalidArgument( + "The batch size of rois and the batch size of images " + " must be the same. 
But received the batch size of rois is %d, " + "and the batch size of images is %d", + boxes_batch_size, + batch_size)); + auto* boxes_num_data = boxes_num->data(); + int start = 0; + for (int n = 0; n < boxes_batch_size; ++n) { + for (int i = start; i < start + boxes_num_data[n]; ++i) { + roi_batch_id_data[i] = n; + } + start += boxes_num_data[n]; + } + } else { + auto lod = boxes.lod(); + PADDLE_ENFORCE_EQ( + lod.empty(), + false, + errors::InvalidArgument("Input(ROIs) Tensor of ROIAlignOp " + "does not contain LoD information.")); + auto boxes_lod = lod.back(); + int boxes_batch_size = boxes_lod.size() - 1; + PADDLE_ENFORCE_EQ( + boxes_batch_size, + batch_size, + errors::InvalidArgument( + "The boxes_batch_size and imgs " + "batch_size must be the same. But received boxes_batch_size = %d, " + "batch_size = %d", + boxes_batch_size, + batch_size)); + int boxes_num_with_lod = boxes_lod[boxes_batch_size]; + PADDLE_ENFORCE_EQ( + rois_num, + boxes_num_with_lod, + errors::InvalidArgument( + "The actual number of rois and the number of rois " + "provided from Input(RoIsLoD) in RoIAlign must be the same." + " But received actual number of rois is %d, and the number " + "of rois from RoIsLoD is %d", + rois_num, + boxes_num_with_lod)); + for (int n = 0; n < boxes_batch_size; ++n) { + for (std::size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) { + roi_batch_id_data[i] = n; + } + } + } + T* output_data = dev_ctx.template Alloc(out); + const T* boxes_data = boxes.data(); + T roi_offset = aligned ? T(0.5) : 0; + for (int n = 0; n < rois_num; ++n) { + int roi_batch_id = roi_batch_id_data[n]; + T roi_xmin = boxes_data[0] * spatial_scale - roi_offset; + T roi_ymin = boxes_data[1] * spatial_scale - roi_offset; + T roi_xmax = boxes_data[2] * spatial_scale - roi_offset; + T roi_ymax = boxes_data[3] * spatial_scale - roi_offset; + + T roi_width = roi_xmax - roi_xmin; + T roi_height = roi_ymax - roi_ymin; + if (!aligned) { + roi_width = std::max(roi_width, static_cast(1.)); + roi_height = std::max(roi_height, static_cast(1.)); + } + + const T* batch_data = input_data + roi_batch_id * in_stride[0]; + + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + auto interpolation_cords = GetIndexesAndRatios(width, + height, + roi_width, + roi_height, + roi_xmin, + roi_ymin, + pooled_width, + roi_bin_grid_w, + pooled_height, + roi_bin_grid_h); + + std::vector interpolated_values; + interpolated_values.reserve(interpolation_cords.size()); + for (auto channel = 0; channel < channels; ++channel) { + Interpolate(interpolated_values, interpolation_cords, batch_data); + AvgPool(interpolated_values, + output_data, + roi_bin_grid_w, + roi_bin_grid_h, + pooled_width, + pooled_height); + batch_data += in_stride[1]; + output_data += out_stride[1]; + interpolated_values.clear(); + } + boxes_data += roi_stride[0]; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + roi_align, CPU, ALL_LAYOUT, phi::RoiAlignKernel, float, double, int) {} diff --git a/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..0eaa873590eb0ce16933de474cc028e751fdd4a9 --- /dev/null +++ b/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
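For reference, the RoI Align kernel above precomputes, for every sample point of every bin, the four corner offsets and bilinear ratios (GetIndexesAndRatios) and reuses them across channels. The stand-alone helper below is a condensed sketch of that per-point arithmetic only; the function name and signature are invented for illustration and are not part of this patch.

#include <cstddef>

// Compute the four flattened corner offsets into an H*W map and the matching
// bilinear weights for one sample point (x, y). Out-of-map points get zero
// weight, mirroring the kernel's "deal with elements out of map" branch.
template <typename T>
void BilinearCorners(T x, T y, std::size_t width, std::size_t height,
                     std::size_t offsets[4], T weights[4]) {
  for (int i = 0; i < 4; ++i) { offsets[i] = 0; weights[i] = T(0); }
  if (y < T(-1.0) || y > static_cast<T>(height) ||
      x < T(-1.0) || x > static_cast<T>(width)) {
    return;  // contributes nothing
  }
  y = y <= T(0) ? T(0) : y;
  x = x <= T(0) ? T(0) : x;
  std::size_t x_low = static_cast<std::size_t>(x);
  std::size_t y_low = static_cast<std::size_t>(y);
  std::size_t x_high = x_low + 1;
  std::size_t y_high = y_low + 1;
  if (x_low >= width - 1) { x_high = x_low = width - 1; x = static_cast<T>(x_low); }
  if (y_low >= height - 1) { y_high = y_low = height - 1; y = static_cast<T>(y_low); }
  T x_ratio = static_cast<T>(x_high) - x;  // weight toward the low-x corner
  T y_ratio = static_cast<T>(y_high) - y;  // weight toward the low-y corner
  offsets[0] = y_low * width + x_low;    weights[0] = x_ratio * y_ratio;
  offsets[1] = y_high * width + x_low;   weights[1] = x_ratio * (T(1) - y_ratio);
  offsets[2] = y_low * width + x_high;   weights[2] = (T(1) - x_ratio) * y_ratio;
  offsets[3] = y_high * width + x_high;  weights[3] = (T(1) - x_ratio) * (T(1) - y_ratio);
}

The interpolated value is then the sum of weights[i] * data[offsets[i]], which is what Interpolate accumulates before AvgPool divides by the number of grid points.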
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roi_pool_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void RoiPoolGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + const DenseTensor& arg_max, + const DenseTensor& out_grad, + int pooled_height, + int pooled_width, + float spatial_scale, + DenseTensor* dx) { + if (dx) { + int rois_num = boxes.dims()[0]; + DenseTensor box_batch_id_list = Empty(dev_ctx, {rois_num}); + int* box_batch_id_data = box_batch_id_list.data(); + + int boxes_batch_size; + if (boxes_num) { + boxes_batch_size = boxes_num->numel(); + auto* boxes_num_data = boxes_num->data(); + int start = 0; + for (int n = 0; n < boxes_batch_size; ++n) { + for (int i = start; i < start + boxes_num_data[n]; ++i) { + box_batch_id_data[i] = n; + } + start += boxes_num_data[n]; + } + } else { + auto boxes_lod = boxes.lod().back(); + boxes_batch_size = boxes_lod.size() - 1; + for (int n = 0; n < boxes_batch_size; ++n) { + for (size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) { + box_batch_id_data[i] = n; + } + } + } + + const T* boxes_data = boxes.data(); + const T* out_grad_data = out_grad.data(); + const int64_t* arg_max_data = arg_max.data(); + T* dx_data = dev_ctx.template Alloc(dx); + + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, dx, static_cast(0)); + + auto in_stride = phi::stride(x.dims()); + auto arg_max_stride = phi::stride(arg_max.dims()); + auto roi_stride = phi::stride(boxes.dims()); + auto out_stride = phi::stride(out_grad.dims()); + + int channels = x.dims()[1]; + + for (int n = 0; n < rois_num; ++n) { + int roi_batch_idx = box_batch_id_data[n]; + T* batch_grad_data = dx_data + roi_batch_idx * in_stride[0]; + for (int c = 0; c < channels; ++c) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int pool_index = ph * pooled_width + pw; + if (arg_max_data[pool_index] >= 0) { + auto index = arg_max_data[pool_index]; + batch_grad_data[index] += out_grad_data[pool_index]; + } + } + } + batch_grad_data += in_stride[1]; + out_grad_data += out_stride[1]; + arg_max_data += arg_max_stride[1]; + } + boxes_data += roi_stride[0]; + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(roi_pool_grad, + CPU, + ALL_LAYOUT, + phi::RoiPoolGradKernel, + float, + double, + int) { + kernel->InputAt(3).SetDataType(phi::DataType::INT64); +} diff --git a/paddle/phi/kernels/cpu/roi_pool_kernel.cc b/paddle/phi/kernels/cpu/roi_pool_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..02020354cd35701b5fdcd1e8beae87bc813ca18f --- /dev/null +++ b/paddle/phi/kernels/cpu/roi_pool_kernel.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
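RoiPoolGradKernel above applies the usual max-pooling backward rule: each pooled-output gradient is added to the single input element whose flattened index was recorded in arg_max during the forward pass (empty bins store -1 and are skipped). A minimal sketch of that scatter for one (RoI, channel) slice, with illustrative names only:

#include <cstdint>

// in_grad must point at a zero-initialized height*width slice.
template <typename T>
void ScatterMaxPoolGrad(const T* out_grad, const int64_t* arg_max,
                        int pooled_size, T* in_grad) {
  for (int i = 0; i < pooled_size; ++i) {
    if (arg_max[i] >= 0) {
      in_grad[arg_max[i]] += out_grad[i];  // route grad to the forward argmax
    }
  }
}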
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roi_pool_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" + +namespace phi { + +template +void RoiPoolKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + DenseTensor* out, + DenseTensor* arg_max) { + auto x_dims = x.dims(); + int batch_size = x_dims[0]; + int channels = x_dims[1]; + int height = x_dims[2]; + int width = x_dims[3]; + int rois_num = boxes.dims()[0]; + + auto in_stride = phi::stride(x_dims); + auto arg_max_stride = phi::stride(arg_max->dims()); + auto box_stride = phi::stride(boxes.dims()); + auto out_stride = phi::stride(out->dims()); + + const T* input_data = x.data(); + + DenseTensor box_batch_id_list = Empty(dev_ctx, {rois_num}); + int* box_batch_id_data = box_batch_id_list.data(); + + int boxes_batch_size; + if (boxes_num) { + boxes_batch_size = boxes_num->numel(); + PADDLE_ENFORCE_EQ( + boxes_batch_size, + batch_size, + phi::errors::InvalidArgument("The boxes_batch_size and imgs " + "batch_size must be the same.")); + auto* boxes_num_data = boxes_num->data(); + int start = 0; + for (int n = 0; n < boxes_batch_size; ++n) { + for (int i = start; i < start + boxes_num_data[n]; ++i) { + box_batch_id_data[i] = n; + } + start += boxes_num_data[n]; + } + } else { + auto boxes_lod = boxes.lod().back(); + boxes_batch_size = boxes_lod.size() - 1; + PADDLE_ENFORCE_EQ( + boxes_batch_size, + batch_size, + phi::errors::InvalidArgument("The boxes_batch_size and imgs " + "batch_size must be the same.")); + int rois_num_with_lod = boxes_lod[boxes_batch_size]; + PADDLE_ENFORCE_EQ( + rois_num, + rois_num_with_lod, + phi::errors::InvalidArgument("The rois_num from input " + "and lod must be the same.")); + for (int n = 0; n < boxes_batch_size; ++n) { + for (size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) { + box_batch_id_data[i] = n; + } + } + } + + T* output_data = dev_ctx.template Alloc(out); + int64_t* arg_max_data = dev_ctx.template Alloc(arg_max); + + const T* boxes_data = boxes.data(); + for (int n = 0; n < rois_num; ++n) { + int box_batch_id = box_batch_id_data[n]; + int box_start_w = round(boxes_data[0] * spatial_scale); + int box_start_h = round(boxes_data[1] * spatial_scale); + int box_end_w = round(boxes_data[2] * spatial_scale); + int box_end_h = round(boxes_data[3] * spatial_scale); + + // Force malformed ROIs to be 1x1 + int box_height = std::max(box_end_h - box_start_h + 1, 1); + int box_width = std::max(box_end_w - box_start_w + 1, 1); + + const float bin_size_h = + static_cast(box_height) / static_cast(pooled_height); + const float bin_size_w = + static_cast(box_width) / static_cast(pooled_width); + + const T* batch_data = input_data + box_batch_id * in_stride[0]; + + for (int c = 0; c < channels; ++c) { + for (int ph = 0; ph < pooled_height; ++ph) { + 
for (int pw = 0; pw < pooled_width; ++pw) { + // Compute pooling region for this output unit: + // start (included) = floor(ph * box_height / pooled_height_) + // end (excluded) = ceil((ph + 1) * box_height / pooled_height_) + int hstart = + static_cast(floor(static_cast(ph) * bin_size_h)); + int wstart = + static_cast(floor(static_cast(pw) * bin_size_w)); + int hend = + static_cast(ceil(static_cast(ph + 1) * bin_size_h)); + int wend = + static_cast(ceil(static_cast(pw + 1) * bin_size_w)); + + hstart = std::min(std::max(hstart + box_start_h, 0), height); + hend = std::min(std::max(hend + box_start_h, 0), height); + wstart = std::min(std::max(wstart + box_start_w, 0), width); + wend = std::min(std::max(wend + box_start_w, 0), width); + + const int pool_index = ph * pooled_width + pw; + + // Define an empty pooling region to be zero + bool is_empty = (hend <= hstart) || (wend <= wstart); + output_data[pool_index] = + is_empty ? 0 : -std::numeric_limits::max(); + arg_max_data[pool_index] = -1; + + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int index = h * width + w; + if (batch_data[index] > output_data[pool_index]) { + output_data[pool_index] = batch_data[index]; + arg_max_data[pool_index] = index; + } + } + } + } + } + + batch_data += in_stride[1]; + output_data += out_stride[1]; + arg_max_data += arg_max_stride[1]; + } + // Increment ROI data pointer + boxes_data += box_stride[0]; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + roi_pool, CPU, ALL_LAYOUT, phi::RoiPoolKernel, float, double, int) { + kernel->OutputAt(1).SetDataType(phi::DataType::INT64); +} diff --git a/paddle/phi/kernels/cpu/roll_grad_kernel.cc b/paddle/phi/kernels/cpu/roll_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..b0d0c0663e4a2eb71f4500baaf43bc8a891acddd --- /dev/null +++ b/paddle/phi/kernels/cpu/roll_grad_kernel.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
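The hstart/hend/wstart/wend computation in RoiPoolKernel above follows the floor/ceil rule stated in the inline comment, shifted by the RoI's start coordinate and clamped to the feature-map extent; bins that end up empty are defined to output zero. The same arithmetic in isolation (a hypothetical helper, for illustration only):

#include <algorithm>
#include <cmath>

// start (included) = floor(p * bin_size), end (excluded) = ceil((p + 1) * bin_size),
// then shift by the RoI start and clamp to [0, limit].
inline void PoolBinRange(int p, float bin_size, int box_start, int limit,
                         int* start, int* end) {
  *start = static_cast<int>(std::floor(static_cast<float>(p) * bin_size));
  *end = static_cast<int>(std::ceil(static_cast<float>(p + 1) * bin_size));
  *start = std::min(std::max(*start + box_start, 0), limit);
  *end = std::min(std::max(*end + box_start, 0), limit);
}
// A bin with *end <= *start is empty; the kernel writes 0 and arg_max = -1 for it.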
+ +#include "paddle/phi/kernels/roll_grad_kernel.h" + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/roll_kernel_impl.h" + +namespace phi { + +template +void RollGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const ScalarArray& shifts, + const std::vector& axis, + DenseTensor* x_grad) { + std::vector out_vec; + paddle::framework::TensorToVector(out_grad, dev_ctx, &out_vec); + + auto shifts_data = shifts.GetData(); + size_t nums = shifts_data.size(); + DDim input_dim = out_grad.dims(); + auto dims = axis; + + // axis = none, reshape to 1-D tensor + if (dims.size() == 0) { + dims.push_back(0l); + input_dim = phi::Dim<1>(out_vec.size()); + } + + for (size_t i = 0; i < nums; i++) { + ShiftAlongDim(out_vec.data(), input_dim, dims[i], 0 - shifts_data[i]); + } + + dev_ctx.template Alloc(x_grad); + paddle::framework::TensorFromVector(out_vec, dev_ctx, x_grad); + x_grad->Resize(out_grad.dims()); +} + +} // namespace phi + +PD_REGISTER_KERNEL(roll_grad, + CPU, + ALL_LAYOUT, + phi::RollGradKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/roll_kernel.cc b/paddle/phi/kernels/cpu/roll_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..25b64ef257dfb801f0050aad388b9fb0b3020ea5 --- /dev/null +++ b/paddle/phi/kernels/cpu/roll_kernel.cc @@ -0,0 +1,75 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roll_kernel.h" + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/roll_kernel_impl.h" + +namespace phi { + +template +void RollKernel(const Context& dev_ctx, + const DenseTensor& x, + const ScalarArray& shifts, + const std::vector& axis, + DenseTensor* out) { + std::vector out_vec; + paddle::framework::TensorToVector(x, dev_ctx, &out_vec); + + auto shifts_data = shifts.GetData(); + size_t nums = shifts_data.size(); + DDim input_dim = x.dims(); + auto dims = axis; + + // axis = none, reshape to 1-D tensor + if (dims.size() == 0) { + dims.push_back(0l); + input_dim = phi::Dim<1>(out_vec.size()); + } + + for (size_t i = 0; i < nums; i++) { + PADDLE_ENFORCE_EQ( + dims[i] < input_dim.size() && dims[i] >= (0 - input_dim.size()), + true, + phi::errors::OutOfRange( + "Attr(axis[%d]) is out of range, It's expected " + "to be in range of [-%d, %d]. 
But received Attr(axis[%d]) = %d.", + i, + input_dim.size(), + input_dim.size() - 1, + i, + dims[i])); + ShiftAlongDim(out_vec.data(), input_dim, dims[i], shifts_data[i]); + } + dev_ctx.template Alloc(out); + paddle::framework::TensorFromVector(out_vec, dev_ctx, out); + out->Resize(x.dims()); +} + +} // namespace phi + +PD_REGISTER_KERNEL(roll, + CPU, + ALL_LAYOUT, + phi::RollKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/roll_kernel_impl.h b/paddle/phi/kernels/cpu/roll_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..924e71aff31f3f874fb35586f496b9c5952c3757 --- /dev/null +++ b/paddle/phi/kernels/cpu/roll_kernel_impl.h @@ -0,0 +1,76 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +inline void ShiftAlongDim(T* data, + const DDim& input_dim, + int64_t dim, + int64_t shift) { + if (dim < 0) { + dim += input_dim.size(); + } + if (input_dim[dim] == 0) { + return; + } + shift = shift % input_dim[dim]; + if (shift < 0) { + shift += input_dim[dim]; + } + + auto outer_loops = 1; + for (auto i = 0; i < dim; i++) { + outer_loops *= input_dim[i]; + } + auto slice_width = 1; + for (auto i = dim + 1; i < input_dim.size(); i++) { + slice_width *= input_dim[i]; + } + + VLOG(3) << "shift_along_dim_debug: input_dim: " << input_dim + << "; dim: " << dim << "; shift: " << shift + << "; outer_loops: " << outer_loops + << "; slice_width: " << slice_width; + if (shift == 0) { + return; + } + + std::vector head; + auto head_size = slice_width * (input_dim[dim] - shift); + head.resize(head_size); + + for (auto i = 0; i < outer_loops; i++) { + for (auto j = 0; j < head_size; j++) { + head[j] = data[i * input_dim[dim] * slice_width + j]; + } + for (auto j = input_dim[dim] - shift; j < input_dim[dim]; j++) { + auto dst_pos = j - input_dim[dim] + shift; + for (auto k = 0; k < slice_width; k++) { + data[(i * input_dim[dim] + dst_pos) * slice_width + k] = + data[(i * input_dim[dim] + j) * slice_width + k]; + } + } + for (auto j = 0; j < head_size; j++) { + data[(i * input_dim[dim] + shift) * slice_width + j] = head[j]; + } + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/reduce_prod_kernel.cc b/paddle/phi/kernels/cpu/searchsorted_kernel.cc similarity index 55% rename from paddle/phi/kernels/cpu/reduce_prod_kernel.cc rename to paddle/phi/kernels/cpu/searchsorted_kernel.cc index cf0179124ebdfcb58a2ac3436fcbd4d5347bb6f2..c036c2d438a36be779441f8f9aef78c0b5fbb642 100644 --- a/paddle/phi/kernels/cpu/reduce_prod_kernel.cc +++ b/paddle/phi/kernels/cpu/searchsorted_kernel.cc @@ -12,32 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
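ShiftAlongDim above performs a circular (right) shift along one dimension by buffering the head slice and moving the tail forward. For the 1-D case that RollKernel falls back to when axis is empty, the effect matches std::rotate; a small sketch under that simplification, using an invented helper name:

#include <algorithm>
#include <cstdint>
#include <vector>

// Circularly shift a 1-D sequence right by `shift` positions (numpy.roll style).
template <typename T>
void Roll1D(std::vector<T>* data, int64_t shift) {
  const int64_t n = static_cast<int64_t>(data->size());
  if (n == 0) return;
  shift %= n;
  if (shift < 0) shift += n;  // same normalization as ShiftAlongDim
  if (shift == 0) return;
  // Making element n - shift the new front is a right shift by `shift`.
  std::rotate(data->begin(), data->begin() + (n - shift), data->end());
}
// e.g. {1, 2, 3, 4, 5} rolled by 2 becomes {4, 5, 1, 2, 3}.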
-#include "paddle/phi/kernels/reduce_prod_kernel.h" +#include "paddle/phi/kernels/searchsorted_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/cpu/reduce.h" -#include "paddle/phi/kernels/funcs/reduce_functor.h" - -namespace phi { - -template -void ReduceProdKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -} // namespace phi +#include "paddle/phi/kernels/impl/searchsorted_kernel_impl.h" -PD_REGISTER_KERNEL(reduce_prod, +PD_REGISTER_KERNEL(searchsorted, CPU, ALL_LAYOUT, - phi::ReduceProdKernel, + phi::SearchsortedKernel, float, double, int, diff --git a/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc index 585c27bdcec97e11a68cdc536c829f76c000a8df..a5c9dc4c55e495833f40ec7499e6c0373594d319 100644 --- a/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc @@ -23,4 +23,6 @@ PD_REGISTER_KERNEL(segment_pool_grad, ALL_LAYOUT, phi::SegmentPoolGradKernel, float, - double) {} + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/segment_pool_kernel.cc b/paddle/phi/kernels/cpu/segment_pool_kernel.cc index d0413457f8177338aa450211539dc16d0880c74c..ad76a7a86bcb28f291288418c43740ed0b7adb97 100644 --- a/paddle/phi/kernels/cpu/segment_pool_kernel.cc +++ b/paddle/phi/kernels/cpu/segment_pool_kernel.cc @@ -18,5 +18,11 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -PD_REGISTER_KERNEL( - segment_pool, CPU, ALL_LAYOUT, phi::SegmentPoolKernel, float, double) {} +PD_REGISTER_KERNEL(segment_pool, + CPU, + ALL_LAYOUT, + phi::SegmentPoolKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/set_value_grad_kernel.cc b/paddle/phi/kernels/cpu/set_value_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..44df36bb9fd87320db8548815b68a431e46bbcac --- /dev/null +++ b/paddle/phi/kernels/cpu/set_value_grad_kernel.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/set_value_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/set_value_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(set_value_grad, + CPU, + ALL_LAYOUT, + phi::SetValueGradKernel, + float, + double, + int, + int64_t, + bool) {} diff --git a/paddle/phi/kernels/cpu/softmax_kernel.cc b/paddle/phi/kernels/cpu/softmax_kernel.cc index 537b4326681a175fbad7593eed1d8b6caee9d86c..1d28669571f8d095cf53355be26135360008b0ce 100644 --- a/paddle/phi/kernels/cpu/softmax_kernel.cc +++ b/paddle/phi/kernels/cpu/softmax_kernel.cc @@ -19,4 +19,4 @@ limitations under the License. */ #include "paddle/phi/kernels/impl/softmax_kernel_impl.h" PD_REGISTER_KERNEL( - softmax, CPU, ALL_LAYOUT, phi::SoftmaxRawKernel, float, double) {} + softmax, CPU, ALL_LAYOUT, phi::SoftmaxKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/split_kernel.cc b/paddle/phi/kernels/cpu/split_kernel.cc index 324798effbe56b8b7bdf0c3d31b21cd079a8cf1c..ea8e2702c19d6edd9f63d1da647db0ef07a417f1 100644 --- a/paddle/phi/kernels/cpu/split_kernel.cc +++ b/paddle/phi/kernels/cpu/split_kernel.cc @@ -38,7 +38,7 @@ void SplitKernel(const Context& dev_ctx, out_metas_ptr.push_back(&out_metas.back()); } - phi::SplitInferMeta(x, num_or_sections, axis_scalar, out_metas_ptr, true); + phi::SplitInferMeta(x, num_or_sections, axis_scalar, out_metas_ptr); for (size_t i = 0; i < out_metas.size(); ++i) { outs[i]->Resize(out_metas[i].dims()); diff --git a/paddle/phi/kernels/cpu/triangular_solve_kernel.cc b/paddle/phi/kernels/cpu/triangular_solve_kernel.cc index 5aca5be12792387659b1c4db00e5d8ed98bc22dc..c91e7475f5b7c4ea7c420eb72cccd8cd82b0aa0c 100644 --- a/paddle/phi/kernels/cpu/triangular_solve_kernel.cc +++ b/paddle/phi/kernels/cpu/triangular_solve_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/triangular_solve_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/ddim.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/tril_triu_grad_kernel.cc b/paddle/phi/kernels/cpu/tril_triu_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..14aca258a2c71a0651868f6917e2707987179ee0 --- /dev/null +++ b/paddle/phi/kernels/cpu/tril_triu_grad_kernel.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(tril_triu_grad, + CPU, + ALL_LAYOUT, + phi::TrilTriuGradKernel, + bool, + float, + double, + int, + int64_t, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/cpu/tril_triu_kernel.cc b/paddle/phi/kernels/cpu/tril_triu_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..a3d20e55e21fb6e11f63ef05f5de63fbc51caf5e --- /dev/null +++ b/paddle/phi/kernels/cpu/tril_triu_kernel.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(tril_triu, + CPU, + ALL_LAYOUT, + phi::TrilTriuKernel, + bool, + float, + double, + int, + int64_t, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/cumprod_grad_kernel.h b/paddle/phi/kernels/cumprod_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..b3cb17b28e07f3d9d4d0a1671acc9d639b855e08 --- /dev/null +++ b/paddle/phi/kernels/cumprod_grad_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void CumprodGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + int dim, + DenseTensor* dx); +} // phi diff --git a/paddle/phi/kernels/cumprod_kernel.h b/paddle/phi/kernels/cumprod_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..96d76cb0f43702cb5798ec9c2d527464ea51ba1f --- /dev/null +++ b/paddle/phi/kernels/cumprod_kernel.h @@ -0,0 +1,26 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void CumprodKernel(const Context& dev_ctx, + const DenseTensor& x, + int dim, + DenseTensor* out); +} // phi diff --git a/paddle/phi/kernels/cumsum_kernel.h b/paddle/phi/kernels/cumsum_kernel.h index fd90c7b8f5eee81b517013069ca9c2b366aa7d13..f105c94d559d873c8a11025a3c8c931010050445 100644 --- a/paddle/phi/kernels/cumsum_kernel.h +++ b/paddle/phi/kernels/cumsum_kernel.h @@ -18,7 +18,7 @@ namespace phi { -template +template void CumsumKernel(const Context& dev_ctx, const DenseTensor& x, int axis, diff --git a/paddle/phi/kernels/deformable_conv_kernel.h b/paddle/phi/kernels/deformable_conv_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..3886e6801a31bf9f747b324ae4c355bd48c53cd7 --- /dev/null +++ b/paddle/phi/kernels/deformable_conv_kernel.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void DeformableConvKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& offset, + const DenseTensor& filter, + const DenseTensor& mask, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + int deformable_groups, + int groups, + int im2col_step, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/determinant_grad_kernel.h b/paddle/phi/kernels/determinant_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..87228afc51b52bb95838d3161f372c2ebae19b2c --- /dev/null +++ b/paddle/phi/kernels/determinant_grad_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void DeterminantGradKernel(const Context& dev_ctx,
+                           const DenseTensor& x,
+                           const DenseTensor& out,
+                           const DenseTensor& out_grad,
+                           DenseTensor* x_grad);
+
+} // namespace phi
diff --git a/paddle/phi/kernels/determinant_kernel.h b/paddle/phi/kernels/determinant_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..abd5f5691b3e5df6274db1bcd3a915c80bf93ec4
--- /dev/null
+++ b/paddle/phi/kernels/determinant_kernel.h
@@ -0,0 +1,26 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void DeterminantKernel(const Context& dev_ctx,
+                       const DenseTensor& x,
+                       DenseTensor* out);
+
+} // namespace phi
diff --git a/paddle/phi/kernels/diag_grad_kernel.h b/paddle/phi/kernels/diag_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..b9edab9bec44c367db2d36dfce05c425dcb07785
--- /dev/null
+++ b/paddle/phi/kernels/diag_grad_kernel.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void DiagGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + int offset, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/eigh_kernel.h b/paddle/phi/kernels/eigh_kernel.h index dd28752d9298345101d73913e405381c1d47c6c0..19653918302412e2f7e4dfa8caf71b6c9146a83f 100644 --- a/paddle/phi/kernels/eigh_kernel.h +++ b/paddle/phi/kernels/eigh_kernel.h @@ -15,7 +15,6 @@ #pragma once #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/device_context.h" namespace phi { diff --git a/paddle/phi/kernels/elementwise_grad_kernel.h b/paddle/phi/kernels/elementwise_grad_kernel.h index 58ae11a9c4256d18dbacf6a40b06b308acaea159..fb2633cc9fcea7c619193ad964ad62247ed654dd 100644 --- a/paddle/phi/kernels/elementwise_grad_kernel.h +++ b/paddle/phi/kernels/elementwise_grad_kernel.h @@ -124,4 +124,22 @@ void MultiplyTripleGradKernel(const Context& dev_ctx, DenseTensor* d_ddx, DenseTensor* d_ddy); +template +void ElementwiseFMaxGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad, + DenseTensor* y_grad); + +template +void ElementwiseFMinGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad, + DenseTensor* y_grad); + } // namespace phi diff --git a/paddle/phi/kernels/math_kernel.cc b/paddle/phi/kernels/elementwise_kernel.cc similarity index 70% rename from paddle/phi/kernels/math_kernel.cc rename to paddle/phi/kernels/elementwise_kernel.cc index a5d3f51e5447fa41447c4b59c3beb8c917f8a0e5..9d10a48c9e0795d8914c0c6cfb49b7686575cfac 100644 --- a/paddle/phi/kernels/math_kernel.cc +++ b/paddle/phi/kernels/elementwise_kernel.cc @@ -12,34 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { -template -void MeanKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out) { - bool reduce_all = false; - MeanRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); -} - -template -void SumKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - DataType out_dtype, - bool keep_dim, - DenseTensor* out) { - bool reduce_all = false; - SumRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out_dtype, out); -} - template void AddKernel(const Context& dev_ctx, const DenseTensor& x, @@ -81,25 +60,6 @@ void MultiplyKernel(const Context& dev_ctx, using complex64 = ::phi::dtype::complex; using complex128 = ::phi::dtype::complex; -PD_REGISTER_KERNEL( - mean, CPU, ALL_LAYOUT, phi::MeanKernel, float, double, bool) {} - -PD_REGISTER_KERNEL(sum, - CPU, - ALL_LAYOUT, - phi::SumKernel, - bool, - float, - double, - phi::dtype::float16, - int16_t, - int, - int64_t, - complex64, - complex128) { - kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); -} - PD_REGISTER_KERNEL(add, CPU, ALL_LAYOUT, @@ -147,32 +107,7 @@ PD_REGISTER_KERNEL(multiply, phi::dtype::bfloat16) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_KERNEL(mean, - GPU, - ALL_LAYOUT, - phi::MeanKernel, - float, - double, - bool, - int, - int64_t, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(sum, - GPU, - ALL_LAYOUT, - phi::SumKernel, - bool, - float, - double, - phi::dtype::float16, - phi::dtype::bfloat16, - int16_t, - int, - int64_t, - complex64, - complex128) { - kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); -} + PD_REGISTER_KERNEL(add, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/math_kernel.h b/paddle/phi/kernels/elementwise_kernel.h similarity index 57% rename from paddle/phi/kernels/math_kernel.h rename to paddle/phi/kernels/elementwise_kernel.h index 7569cbcff087d796313c24d46ff7b7fd9cf7e2eb..b064ecc454c592df49670205163e73d2d3b249b3 100644 --- a/paddle/phi/kernels/math_kernel.h +++ b/paddle/phi/kernels/elementwise_kernel.h @@ -1,57 +1,37 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
#pragma once #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/infermeta/binary.h" -#include "paddle/phi/infermeta/unary.h" -#include "paddle/phi/kernels/empty_kernel.h" namespace phi { template -void MeanRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out); - -template -void MeanKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out); +void ElementwiseFMaxKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); template -void SumRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType out_dtype, - DenseTensor* out); - -template -void SumKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - DataType out_dtype, - bool keep_dim, - DenseTensor* out); +void ElementwiseFMinKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); template void AddRawKernel(const Context& dev_ctx, @@ -149,29 +129,4 @@ DenseTensor Multiply(const Context& dev_ctx, return dense_out; } -template -DenseTensor Mean(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& axis, - bool keep_dim) { - DenseTensor dense_out; - MetaTensor meta_out(&dense_out); - SumRawInferMeta(x, axis, keep_dim, false, x.dtype(), &meta_out); - MeanKernel(dev_ctx, x, axis, keep_dim, &dense_out); - return dense_out; -} - -template -DenseTensor Sum(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& axis, - DataType dtype, - bool keep_dim) { - DenseTensor dense_out; - MetaTensor meta_out(&dense_out); - SumInferMeta(x, axis, dtype, keep_dim, &meta_out); - SumKernel(dev_ctx, x, axis, dtype, keep_dim, &dense_out); - return dense_out; -} - } // namespace phi diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index e0db7b51f8e04b561afd30b740166cee9fdd6a78..942eecae16837ad37718fef540bd73e154d5e88a 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -3,11 +3,12 @@ add_subdirectory(blas) add_subdirectory(lapack) add_subdirectory(detail) -math_library(math_function DEPS blas dense_tensor tensor) -math_library(segment_pooling) -math_library(sequence2batch) +math_library(concat_and_split_functor DEPS dense_tensor) math_library(gru_compute DEPS activation_functions math_function) math_library(lstm_compute DEPS activation_functions) -math_library(concat_and_split_functor DEPS dense_tensor) +math_library(math_function DEPS blas dense_tensor tensor) math_library(matrix_reduce DEPS dense_tensor) math_library(matrix_inverse DEPS dense_tensor eigen3 blas) +math_library(pooling DEPS dense_tensor) +math_library(segment_pooling) +math_library(sequence2batch) diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h index 1a36e4e132f41720b6f9fc563026082e21971d96..6c5ffbd06e3a435d9568a6c4717d8ce83b5aec00 100644 --- a/paddle/phi/kernels/funcs/activation_functor.h +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -29,11 +29,17 @@ #include #include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include 
"paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/extensions.h" + +#ifdef PADDLE_WITH_XPU_KP +#define __forceinline__ __inline__ +#endif namespace phi { namespace funcs { @@ -513,24 +519,24 @@ struct ReluGradGradFunctor : public BaseActivationFunctor { } }; -#if defined(__NVCC__) || defined(__HIPCC__) +// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) template -struct CudaReluFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - - // relu(x) = max(x, 0) - __device__ __forceinline__ T operator()(const T x) const { - return x > zero ? x : zero; +struct TanhFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.tanh(); } }; template -struct CudaReluGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - - // dx = dout * (out > 0) - __device__ __forceinline__ T operator()(const T dout, const T out) const { - return out > zero ? dout : zero; +struct TanhGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * (static_cast(1) - out * out); } static constexpr ActBwdOpFwdDeps FwdDeps() { @@ -539,291 +545,1431 @@ struct CudaReluGradFunctor : public BaseActivationFunctor { }; template -struct CudaCosFunctor : public BaseActivationFunctor { - using MPType = typename phi::dtype::MPTypeTrait::Type; - - // cos(x) = cos(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(cos(x)); +struct TanhGradGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, + const DenseTensor* Out, + const DenseTensor* ddX, + const DenseTensor* dOut, + DenseTensor* dOutNew, + DenseTensor* ddOut) const { + auto* d = dev.eigen_device(); + auto ddx = EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhGradGrad")); + auto out = EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Input", "Out", "TanhGradGrad")); + // tanh grad grad : ddout = (1 - out^2) * ddx, dout = - (dout_old * 2 * out + // * ddx) + if (dOutNew) { + auto dout = EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhGradGrad")); + auto dout_new = EigenVector::Flatten( + GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "TanhGradGrad")); + dout_new.device(*d) = + static_cast(-1) * dout * static_cast(2) * out * ddx; + } + if (ddOut) { + auto ddout = EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "TanhGradGrad")); + ddout.device(*d) = (static_cast(1) - out * out) * ddx; + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; } }; +/* + Out + DOut D_Dout + DDx -> TanhTripleGrad -> D_DDx + D_DDout d_OutNew + D_Dout_new -template -struct CudaCosGradFunctor : public BaseActivationFunctor { - using MPType = typename phi::dtype::MPTypeTrait::Type; + D_Dout = (-2) * Out * DDx * D_Dout_new + D_DDx = (1-Out^2)*D_DDout + (-2) * Out * DOut * D_Dout_new + D_OutNew = (-2) * Out * DDx * D_DDout + (-2) * DOut * DDx * D_Dout_new - // dx = dout * (-sin(x)) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(-dout * sin(x)); + Out, DDX, DOut, D_DDOut, D_DOut_New // input + D_OutNew, D_DOut, D_DDx // output +*/ +template +struct TanhTripleGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, + const DenseTensor* Out, + 
const DenseTensor* ddX, + const DenseTensor* dOut, + const DenseTensor* d_DDOut, + const DenseTensor* d_dOut_New, + DenseTensor* d_d_Out, + DenseTensor* d_Out_New, + DenseTensor* d_DDx) const { + auto* d = dev.eigen_device(); + auto ddx = EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhTripleGrad")); + auto out = EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Input", "Out", "TanhTripleGrad")); + auto dout = EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhTripleGrad")); + auto d_ddOut = EigenVector::Flatten( + GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "TanhTripleGrad")); + auto d_dOutNew = EigenVector::Flatten( + GET_DATA_SAFELY(d_dOut_New, "Input", "D_DOut_New", "TanhTripleGrad")); + + if (d_Out_New) { + auto d_OutNew = EigenVector::Flatten( + GET_DATA_SAFELY(d_Out_New, "Output", "D_OutNew", "TanhTripleGrad")); + d_OutNew.device(*d) = (static_cast(-2) * out * ddx * d_ddOut) - + (static_cast(2) * dout * ddx * d_dOutNew); + } + if (d_d_Out) { + auto d_dOut = EigenVector::Flatten( + GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "TanhTripleGrad")); + d_dOut.device(*d) = static_cast(-2) * out * ddx * d_dOutNew; + } + if (d_DDx) { + auto d_ddx = EigenVector::Flatten( + GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "TanhTripleGrad")); + d_ddx.device(*d) = (static_cast(1) - (out * out)) * d_ddOut - + static_cast(2) * out * dout * d_dOutNew; + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template -struct CudaSinFunctor : public BaseActivationFunctor { - using MPType = typename phi::dtype::MPTypeTrait::Type; +struct BReluFunctor : public BaseActivationFunctor { + float t_min; + float t_max; - // sin(x) = sin(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(sin(x)); + // NOTE: Explicit hides the `BaseActivationFunctor::GetAttrs` + // not polymorphism for speed. 
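+  // That is, GetAttrs() below intentionally shadows (name-hides) the
+  // base-class version rather than relying on virtual dispatch, so the
+  // attribute lookup stays a plain, non-virtual call.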
+ typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"t_min", &t_min}, {"t_max", &t_max}}; + } + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = + x.cwiseMax(static_cast(t_min)).cwiseMin(static_cast(t_max)); } }; template -struct CudaSinGradFunctor : public BaseActivationFunctor { - using MPType = typename phi::dtype::MPTypeTrait::Type; - - // dx = dout * cos(x) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * cos(x)); +struct BReluGradFunctor : public BaseActivationFunctor { + float t_min; + float t_max; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"t_min", &t_min}, {"t_max", &t_max}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * + ((x > static_cast(t_min)) * (x < static_cast(t_max))) + .template cast(); } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template -struct CudaTanFunctor : public BaseActivationFunctor { - using MPType = typename phi::dtype::MPTypeTrait::Type; +struct LeakyReluFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } - // tan(x) = tan(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(tan(x)); + template + void operator()(Device d, X x, Out out) const { + if (alpha < 1.f) { + out.device(d) = x.cwiseMax(static_cast(alpha) * x); + } else { + out.device(d) = x.cwiseMin(static_cast(alpha) * x); + } } }; template -struct CudaTanGradFunctor : public BaseActivationFunctor { - using MPType = typename phi::dtype::MPTypeTrait::Type; - - // dx = dout / cos(x)^2 - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout / (cos(x) * cos(x))); +struct LeakyReluGradFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp1 = + static_cast(alpha) * (x < static_cast(0)).template cast(); + auto temp2 = (x >= static_cast(0)).template cast(); + dx.device(d) = dout * (temp1 + temp2).template cast(); } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template -struct CudaAsinFunctor : public BaseActivationFunctor { - using MPType = typename phi::dtype::MPTypeTrait::Type; - - // asin(x) = asin(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(asin(x)); +struct LeakyReluGradGradFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; } -}; - -template -struct CudaAsinGradFunctor : public BaseActivationFunctor { - using MPType = typename phi::dtype::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // dx = dout / sqrt(1 - x^2) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout / sqrt(one - x * x)); + template + void operator()(const Device& dev, + const DenseTensor* X, + const DenseTensor* Out, + const DenseTensor* 
ddX, + DenseTensor* ddOut, + DenseTensor* dOut, + DenseTensor* dX) const { + if (ddOut) { + auto* d = dev.eigen_device(); + auto ddx = EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "LeakyReluGradGrad")); + auto x = EigenVector::Flatten( + GET_DATA_SAFELY(X, "Input", "X", "LeakyReluGradGrad")); + auto ddout = EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DOut", "LeakyReluGradGrad")); + ddout.device(*d) = + ddx * + ((x > static_cast(0)).template cast() + + static_cast(alpha) * (x <= static_cast(0)).template cast()) + .template cast(); + } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template -struct CudaAcosFunctor : public BaseActivationFunctor { - using MPType = typename phi::dtype::MPTypeTrait::Type; +struct ThresholdedReluFunctor : public BaseActivationFunctor { + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } - // acos(x) = acos(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(acos(x)); + template + void operator()(Device d, X x, Out out) const { + auto th = static_cast(threshold); + out.device(d) = (x > th).template cast() * x; } }; template -struct CudaAcosGradFunctor : public BaseActivationFunctor { - using MPType = typename phi::dtype::MPTypeTrait::Type; - MPType one = static_cast(1.0f); +struct ThresholdedReluGradFunctor : public BaseActivationFunctor { + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } - // dx = -dout / sqrt(1 - x^2) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(-dout / sqrt(one - x * x)); + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto th = static_cast(threshold); + dx.device(d) = dout * (x > th).template cast(); } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; +// tanhshrink(x) = x - tanh(x) +// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) template -struct CudaCoshFunctor : public BaseActivationFunctor { - using MPType = typename phi::dtype::MPTypeTrait::Type; - - // cosh(x) = cosh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(cosh(x)); +struct TanhShrinkFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x - x.tanh(); } }; template -struct CudaCoshGradFunctor : public BaseActivationFunctor { - using MPType = typename phi::dtype::MPTypeTrait::Type; - - // dx = dout * sinh(x) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * sinh(x)); +struct TanhShrinkGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * (x.tanh() * x.tanh()); } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; +// tanhshrink(x) = x - tanh(x) +// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) template -struct CudaSinhFunctor : public BaseActivationFunctor { - using MPType = typename phi::dtype::MPTypeTrait::Type; +struct HardShrinkFunctor : public BaseActivationFunctor { + float threshold; - // sinh(x) = 
sinh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(sinh(x)); + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + template + void operator()(Device d, X x, Out out) const { + auto temp1 = x < static_cast(threshold * -1.f); + auto temp2 = x > static_cast(threshold); + out.device(d) = x * (temp1 || temp2).template cast(); } }; template -struct CudaSinhGradFunctor : public BaseActivationFunctor { - using MPType = typename phi::dtype::MPTypeTrait::Type; +struct HardShrinkGradFunctor : public BaseActivationFunctor { + float threshold; - // dx = dout * cosh(x) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * cosh(x)); + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp1 = x < static_cast(threshold * -1.f); + auto temp2 = x > static_cast(threshold); + dx.device(d) = dout * (temp1 || temp2).template cast(); } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; +// softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0 +// otherwise template -struct CudaAcoshFunctor : public BaseActivationFunctor { - using MPType = typename phi::dtype::MPTypeTrait::Type; +struct SoftShrinkFunctor : public BaseActivationFunctor { + float lambda; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"lambda", &lambda}}; + } - // Acosh(x) = acosh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(acosh(x)); + template + void operator()(Device d, X x, Out out) const { + auto lambdaT = static_cast(lambda); + auto temp1 = (x > lambdaT).template cast(); + auto temp2 = (x < -lambdaT).template cast(); + out.device(d) = temp1 * (x - lambdaT) + temp2 * (x + lambdaT); } }; template -struct CudaAcoshGradFunctor : public BaseActivationFunctor { - using MPType = typename phi::dtype::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - // dx = dout * 1 / sqrt(x^2 - 1) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * one / sqrt(x * x - one)); +struct SoftShrinkGradFunctor : public BaseActivationFunctor { + float lambda; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"lambda", &lambda}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto lambdaT = static_cast(lambda); + auto temp1 = (x > lambdaT).template cast(); + auto temp2 = (x < -lambdaT).template cast(); + dx.device(d) = dout * (temp1 + temp2).template cast(); } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template -struct CudaAsinhFunctor : public BaseActivationFunctor { - using MPType = typename phi::dtype::MPTypeTrait::Type; +struct ELUFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } - // Asinh(x) = asinh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = + (x < static_cast(0)) + .select(static_cast(alpha) * (x.exp() - static_cast(1)), 
x); + } +}; + +template +struct ELUGradFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + // case 1: alpha >= 0 + // dx = dout, if out > 0 + // dx = dout * (out + alpha), if out <= 0 + dx.device(d) = (out > static_cast(0)) + .select(dout, dout * (out + static_cast(alpha))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct ELUGradNegativeAlphaFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + // case 2: alpha < 0 + // dx = dout, if x > 0 + // dx = dout * (out + alpha), if x <=0 + dx.device(d) = (x > static_cast(0)) + .select(dout, dout * static_cast(alpha) * x.exp()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct ELUGradGradFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + template + void operator()(const Device& dev, + const DenseTensor* X, + const DenseTensor* ddX, + DenseTensor* ddOut, + const DenseTensor* dOut, + DenseTensor* dX) const { + auto* d = dev.eigen_device(); + auto ddx = EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "ELUGradGrad")); + auto x = EigenVector::Flatten( + GET_DATA_SAFELY(X, "Input", "X", "ELUGradGrad")); + + if (dX) { + auto dx = EigenVector::Flatten( + GET_DATA_SAFELY(dX, "Output", "DX", "ELUGradGrad")); + auto dout = EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Output", "DOut", "ELUGradGrad")); + dx.device(*d) = ddx * dout * static_cast(alpha) * x.exp() * + (x <= static_cast(0)).template cast(); + } + + if (ddOut) { + auto ddout = EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ELUGradGrad")); + ddout.device(*d) = ddx * + ((x > static_cast(0)).template cast() + + static_cast(alpha) * x.exp() * + (x <= static_cast(0)).template cast()) + .template cast(); + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +// silu(x) = x / (1 + exp(-x)) +template +struct SiluFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + auto temp = static_cast(1) / (static_cast(1) + (-x).exp()); + out.device(d) = x * temp; + } +}; + +// silu'(x) = (1 / (1 + e^{-x})) * (1 + out * e^{-x})) +template +struct SiluGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp1 = static_cast(1) + (-x).exp(); // 1+e^(-x) + auto temp2 = x * (-x).exp(); // x*e^(-x) + dx.device(d) = dout * ((static_cast(1) / temp1) * + (static_cast(1) + (temp2 / temp1))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +// sigmoid(x) = 1 / (1 + exp(-x)) +template +struct SigmoidFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = static_cast(1) / (static_cast(1) + (-x).exp()); + } +}; + +template +struct SigmoidGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * out * (static_cast(1) - out); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + 
return ActBwdOpFwdDeps::kDepOut; + } +}; + +/* + Out + DOut -> SigmoidGradGrad -> DOutNew + DDX DDOut + + DDOut = (1-Out)*Out*DDX + DOutNew = (1-2*Out)*DOut*DDX +*/ +template +struct SigmoidGradGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, + const DenseTensor* Out, + const DenseTensor* ddX, + const DenseTensor* dOut, + DenseTensor* dOutNew, + DenseTensor* ddOut) const { + auto* d = dev.eigen_device(); + auto ddx = EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidGradGrad")); + auto out = EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidGradGrad")); + + if (dOutNew) { + auto dout = EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidGradGrad")); + auto dout_new = EigenVector::Flatten( + GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SigmoidGradGrad")); + dout_new.device(*d) = + (static_cast(1) - static_cast(2) * out) * dout * ddx; + } + if (ddOut) { + auto ddout = EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SigmoidGradGrad")); + ddout.device(*d) = (static_cast(1) - out) * out * ddx; + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +/* + Out + DOut D_Dout + DDx -> SigmoidTripleGrad -> D_DDx + D_DDout d_OutNew + D_Dout_new + + D_Dout = (1-2*Out)*DDx*D_Dout_new + D_DDx = (1-Out)*Out*D_DDout + (1-2*Out)*DOut*D_Dout_new + D_OutNew = (DDx-2*Out*DDx)*D_DDout - 2*DOut*DDx*D_Dout_new + + Out, DDX, DOut, D_DDOut, D_DOut_New // input + D_OutNew, D_DOut, D_DDx // output +*/ +template +struct SigmoidTripleGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, + const DenseTensor* Out, + const DenseTensor* ddX, + const DenseTensor* dOut, + const DenseTensor* d_DDOut, + const DenseTensor* d_dOut_New, + DenseTensor* d_d_Out, + DenseTensor* d_Out_New, + DenseTensor* d_DDx) const { + auto* d = dev.eigen_device(); + auto ddx = EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidTripleGrad")); + auto out = EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidTripleGrad")); + auto dout = EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidTripleGrad")); + auto d_ddOut = EigenVector::Flatten( + GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "SigmoidTripleGrad")); + auto d_dOutNew = EigenVector::Flatten(GET_DATA_SAFELY( + d_dOut_New, "Input", "D_DOut_New", "SigmoidTripleGrad")); + + if (d_Out_New) { + auto d_OutNew = EigenVector::Flatten(GET_DATA_SAFELY( + d_Out_New, "Output", "D_OutNew", "SigmoidTripleGrad")); + d_OutNew.device(*d) = (ddx - static_cast(2) * out * ddx) * d_ddOut - + static_cast(2) * dout * ddx * d_dOutNew; + } + if (d_d_Out) { + auto d_dOut = EigenVector::Flatten( + GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "SigmoidTripleGrad")); + d_dOut.device(*d) = + (static_cast(1) - static_cast(2) * out) * ddx * d_dOutNew; + } + if (d_DDx) { + auto d_ddx = EigenVector::Flatten( + GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "SigmoidTripleGrad")); + d_ddx.device(*d) = + (static_cast(1) - out) * out * d_ddOut + + (static_cast(1) - static_cast(2) * out) * dout * d_dOutNew; + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +// Originally: logsigmoid(x) = -log (1 + exp(-x)) +// For numerical stability, we can use the log-sum-exp trick: +// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/ +// We can rewrite the above equation as: +// out = -log( exp(0) + exp(-x)) [since exp(0) = 1] 
+// = -log( exp(max(-x, 0) - max(-x, 0)) + exp(-x + max(-x, 0) - max(-x, 0))) +// = -log( exp(max(-x, 0)) * exp(-max(-x, 0)) - exp(max(-x, 0)) * exp(-x - +// max(-x, 0))) +// = -log( exp(max(-x, 0)) * (exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) +// = -log( exp(max(-x, 0)) - log(exp(-max(-x, 0)) + exp(-x - max(-x, 0))) +// +// Hence, logsigmoid(x) = - (max(-x, 0) + log(exp(-max(-x, 0)) +// + exp(-x - max(-x, 0)))) +template +struct LogSigmoidFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + auto temp = (-x).cwiseMax(static_cast(0)); // temp = max(-x, 0) + out.device(d) = -temp - (((-temp).exp() + (-x - temp).exp()).log()); + } +}; + +// Originally: f' = exp(-x) / (1 + exp(-x)) +// For numerical stability: f' = exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + +// exp(-x - max(-x, 0))) +template +struct LogSigmoidGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp = (-x).cwiseMax(static_cast(0)); // temp = max(-x, 0) + dx.device(d) = + dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp())); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct HardSigmoidFunctor : public BaseActivationFunctor { + float slope; + float offset; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + + template + void operator()(Device d, X x, Out out) const { + auto temp = x * static_cast(slope) + static_cast(offset); + out.device(d) = + temp.cwiseMax(static_cast(0)).cwiseMin(static_cast(1)); + } +}; + +template +struct HardSigmoidGradFunctor : public BaseActivationFunctor { + float slope; + float offset; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * + ((out > static_cast(0)) * (out < static_cast(1))) + .template cast() * + static_cast(slope); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) +template +struct CudaReluFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + + // relu(x) = max(x, 0) + __device__ __forceinline__ T operator()(const T x) const { + return x > zero ? x : zero; + } +}; + +template +struct CudaReluGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + + // dx = dout * (out > 0) + __device__ __forceinline__ T operator()(const T dout, const T out) const { + return out > zero ? 
dout : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +template +struct CudaCosFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // cos(x) = cos(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(cos(x)); + } +}; + +template +struct CudaCosGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // dx = dout * (-sin(x)) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + return static_cast(-dout * sin(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaSinFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // sin(x) = sin(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(sin(x)); + } +}; + +template +struct CudaSinGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // dx = dout * cos(x) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + return static_cast(dout * cos(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaTanFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // tan(x) = tan(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(tan(x)); + } +}; + +template +struct CudaTanGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // dx = dout / cos(x)^2 + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + return static_cast(dout / (cos(x) * cos(x))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaAsinFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // asin(x) = asin(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(asin(x)); + } +}; + +template +struct CudaAsinGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // dx = dout / sqrt(1 - x^2) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + return static_cast(dout / sqrt(one - x * x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaAcosFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // acos(x) = acos(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(acos(x)); + } +}; + +template +struct CudaAcosGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // dx = -dout / sqrt(1 
- x^2) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + return static_cast(-dout / sqrt(one - x * x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaCoshFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // cosh(x) = cosh(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(cosh(x)); + } +}; + +template +struct CudaCoshGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // dx = dout * sinh(x) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + return static_cast(dout * sinh(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaSinhFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // sinh(x) = sinh(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(sinh(x)); + } +}; + +template +struct CudaSinhGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // dx = dout * cosh(x) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + return static_cast(dout * cosh(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaAcoshFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // Acosh(x) = acosh(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(acosh(x)); + } +}; + +template +struct CudaAcoshGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + // dx = dout * 1 / sqrt(x^2 - 1) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + return static_cast(dout * one / sqrt(x * x - one)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaAsinhFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // Asinh(x) = asinh(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(asinh(x)); + } +}; + +template +struct CudaAsinhGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // dx = dout * 1/sqrt(x^2 + 1) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + return static_cast(dout * one / sqrt(x * x + one)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaAtanhFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // Atanh(x) = atanh(x) + __device__ __forceinline__ T operator()(const T 
arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(atanh(x)); + } +}; + +template +struct CudaAtanhGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + // dx = dout * 1/(1- x^2) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); - return static_cast(asinh(x)); + return static_cast(dout * one / (one - x * x)); } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template -struct CudaAsinhGradFunctor : public BaseActivationFunctor { +struct CudaAtanFunctor : public BaseActivationFunctor { using MPType = typename phi::dtype::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - // dx = dout * 1/sqrt(x^2 + 1) + // atan(x) = atan(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(atan(x)); + } +}; + +template +struct CudaAtanGradFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // dx = dout / (1 + x^2) + __device__ __forceinline__ T operator()(const T dout, const T x) const { + return dout / (one + x * x); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaTanhFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // tanh(x) = tanh(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(tanh(x)); + } +}; + +template +struct CudaTanhGradFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // dx = dout * (1 - out^2) + __device__ __forceinline__ T operator()(const T dout, const T out) const { + return dout * (one - out * out); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +template +struct CudaBReluFunctor : public BaseActivationFunctor { + float t_min; + float t_max; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"t_min", &t_min}, {"t_max", &t_max}}; + } + + // brelu(x) = min(max(x, t_min), t_max) + __device__ __forceinline__ T operator()(const T x) const { + T t_min_cast = static_cast(t_min); + T t_max_cast = static_cast(t_max); + T temp_max = x > t_min_cast ? x : t_min_cast; + T temp_min = temp_max < t_max_cast ? temp_max : t_max_cast; + return temp_min; + } +}; + +template +struct CudaBReluGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float t_min; + float t_max; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"t_min", &t_min}, {"t_max", &t_max}}; + } + + // dx = (x > t_min && x < t_max) ? dout : 0 + __device__ __forceinline__ T operator()(const T dout, const T x) const { + T t_min_cast = static_cast(t_min); + T t_max_cast = static_cast(t_max); + return (x > t_min_cast && x < t_max_cast) ? dout : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaThresholdedReluFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // thresholded_relu(x) = x > threshold ? x : 0 + __device__ __forceinline__ T operator()(const T x) const { + return x > static_cast(threshold) ? 
x : zero; + } +}; + +template +struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // dx = x > threshold ? dout : 0 + __device__ __forceinline__ T operator()(const T dout, const T x) const { + return x > static_cast(threshold) ? dout : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaLeakyReluFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // leakyrelu(x) = x > 0 ? x : alpha * x + __device__ __forceinline__ T operator()(const T x) const { + return x > zero ? x : static_cast(alpha) * x; + } +}; + +template +struct CudaLeakyReluGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // dx = dout * (x > 0 ? 1 : alpha) + __device__ __forceinline__ T operator()(const T dout, const T x) const { + return x > zero ? dout : static_cast(alpha) * dout; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaSoftShrinkFunctor : public BaseActivationFunctor { + float lambda; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"lambda", &lambda}}; + } + + // softshrink(x) = x - lambda, if x > lambda; + // x + lambda, if x < -lambda; + // 0, otherwise. + __device__ __forceinline__ T operator()(const T x) const { + T l = static_cast(lambda); + T temp1 = static_cast(x > l); + T temp2 = static_cast(x < -l); + return temp1 * (x - l) + temp2 * (x + l); + } +}; + +template +struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float lambda; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"lambda", &lambda}}; + } + + // dx = dout, if x > lambda or x < -lambda else 0 + __device__ __forceinline__ T operator()(const T dout, const T x) const { + T l = static_cast(lambda); + return (x >= -l && x <= l) ? zero : dout; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaTanhShrinkFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // tanhshrink(x) = x - tanh(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(x - tanh(x)); + } +}; + +template +struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // dx = dout * tanh(x)^2 __device__ __forceinline__ T operator()(const T arg_dout, const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); - return static_cast(dout * one / sqrt(x * x + one)); + return static_cast(dout * tanh(x) * tanh(x)); } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template -struct CudaAtanhFunctor : public BaseActivationFunctor { +struct CudaHardShrinkFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // hadrshrink(x) = (x > -threshold && x < threshold) ? 
0 : x + __device__ __forceinline__ T operator()(const T x) const { + T t = static_cast(threshold); + return (x > -t && x < t) ? zero : x; + } +}; + +template +struct CudaHardShrinkGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // dx = (x > -threshold && x < threshold) ? 0 : dout + __device__ __forceinline__ T operator()(const T dout, const T x) const { + T t = static_cast(threshold); + return (x > -t && x < t) ? zero : dout; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaELUFunctor : public BaseActivationFunctor { + using CT = typename phi::dtype::MPTypeTrait::Type; + CT zero = static_cast(0.0f); + CT one = static_cast(1.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // elu(x) = x, if x > 0 + // elu(x) = alpha * (e^x - 1), if x <= 0 + __device__ __forceinline__ T operator()(const T arg_x) const { + CT x = static_cast(arg_x); + CT temp = static_cast(alpha) * (exp(x) - one); + CT res = x > zero ? x : temp; + return static_cast(res); + } +}; + +template +struct CudaELUGradFunctor : public BaseActivationFunctor { using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType zero = static_cast(0.0f); + float alpha; - // Atanh(x) = atanh(x) + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // case 1: alpha >= 0 + // dx = dout, if out > 0 + // dx = dout * (out + alpha), if out <= 0 + __device__ __forceinline__ T operator()(T arg_dout, T arg_out) const { + MPType dout = static_cast(arg_dout); + MPType out = static_cast(arg_out); + MPType a = static_cast(alpha); + MPType out_pos = static_cast(out > zero); + MPType out_neg = static_cast(out <= zero); + return static_cast(dout * (out_pos + out_neg * (out + a))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +template +struct CudaELUGradNegativeAlphaFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType zero = static_cast(0.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // case 2: alpha < 0 + // dx = dout, if x > 0 + // dx = dout * (out + alpha), if x <=0 + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_out, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType out = static_cast(arg_out); + MPType x = static_cast(arg_x); + MPType a = static_cast(alpha); + MPType x_pos = static_cast(x > zero); + MPType x_neg = static_cast(x <= zero); + return static_cast(dout * (x_pos + x_neg * (out + a))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaSiluFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // silu(x) = x / (1 + exp(-x)) __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); - return static_cast(atanh(x)); + return static_cast(x / (one + exp(-x))); } }; template -struct CudaAtanhGradFunctor : public BaseActivationFunctor { +struct CudaSiluGradFunctor : public BaseActivationFunctor { using MPType = typename phi::dtype::MPTypeTrait::Type; MPType one = static_cast(1.0f); - // dx = dout * 1/(1- x^2) + + // dx = dout * (1 + 
exp(-x) + x * exp(-x) / (1 + exp(-x))^2) __device__ __forceinline__ T operator()(const T arg_dout, const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); - return static_cast(dout * one / (one - x * x)); + MPType temp = one / (one + exp(-x)); + return static_cast(dout * (temp * (one + x * (one - temp)))); } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template -struct CudaAtanFunctor : public BaseActivationFunctor { +struct CudaSigmoidFunctor : public BaseActivationFunctor { using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); - // atan(x) = atan(x) + // sigmoid(x) = 1 / (1 + exp(-x)) __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); - return static_cast(atan(x)); + return static_cast(one / (one + exp(-x))); } }; template -struct CudaAtanGradFunctor : public BaseActivationFunctor { +struct CudaSigmoidGradFunctor : public BaseActivationFunctor { T one = static_cast(1.0f); - // dx = dout / (1 + x^2) - __device__ __forceinline__ T operator()(const T dout, const T x) const { - return dout / (one + x * x); + // dx = dout * out * (1 - out) + __device__ __forceinline__ T operator()(const T dout, const T out) const { + return dout * out * (one - out); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +template +struct CudaLogSigmoidFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType zero = static_cast(0.0f); + + // logsigmoid(x) = log(1 / (1 + exp(-x))) + // For numerical stability, + // logsigmoid(x) = + // - (max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + MPType temp = x > zero ? zero : -x; + return static_cast(-temp - log(exp(-temp) + exp(-x - temp))); + } +}; + +template +struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType zero = static_cast(0.0f); + + // dx = dout * exp(-x) / (1 + exp(-x)) + // For numerical stability: + // dx = dout * exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + exp(-x - max(-x, + // 0))) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + MPType temp1 = x > zero ? zero : -x; + MPType temp2 = exp(-x - temp1); + return static_cast(dout * (temp2 / (exp(-temp1) + temp2))); } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; +template +struct CudaHardSigmoidFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + T one = static_cast(1.0f); + float slope; + float offset; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + + // hard_sigmoid(x) = 0, when x <= -3 + // 1, when x >= 3 + // x * slope + offset, otherwise + __device__ __forceinline__ T operator()(const T x) const { + T temp = x * static_cast(slope) + static_cast(offset); + T temp_max = temp > zero ? temp : zero; + T temp_min = temp_max < one ? 
temp_max : one; + return temp_min; + } +}; + +template +struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + T one = static_cast(1.0f); + float slope; + float offset; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + + // dx = (out > 0 && out < 1) ? dout * slope : 0 + __device__ __forceinline__ T operator()(const T dout, const T out) const { + return (out > zero && out < one) ? dout * static_cast(slope) : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + #endif } // namespace funcs diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cu b/paddle/phi/kernels/funcs/concat_and_split_functor.cu index 840c8872f50f83c2859f07be2e0e7242a74004a7..06be592dd9375902cdbd0289caa347bc11015bd2 100644 --- a/paddle/phi/kernels/funcs/concat_and_split_functor.cu +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu @@ -395,6 +395,8 @@ struct ConcatFunctor { auto* data_alloc_released = data_alloc.release(); auto* col_alloc_released = col_alloc.release(); context.AddStreamCallback([data_alloc_released, col_alloc_released] { + VLOG(4) << "Delete cuda pinned at " << data_alloc_released; + VLOG(4) << "Delete cuda pinned at " << col_alloc_released; paddle::memory::allocation::Allocator::AllocationDeleter( data_alloc_released); paddle::memory::allocation::Allocator::AllocationDeleter( diff --git a/paddle/phi/kernels/funcs/cumprod.h b/paddle/phi/kernels/funcs/cumprod.h new file mode 100644 index 0000000000000000000000000000000000000000..ac40523c1c4378b3b8b20e9f32c59e664ee1eafc --- /dev/null +++ b/paddle/phi/kernels/funcs/cumprod.h @@ -0,0 +1,52 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
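The new cumprod.h header that continues below reduces an N-D cumprod to an [outer, mid, inner] view around the chosen axis: dimensions before the axis fold into outer, the axis extent becomes mid, and the remaining dimensions fold into inner. A standalone sketch of that decomposition follows; the SplitDims name and the main() driver are illustrative only, not part of this patch, and the PADDLE_ENFORCE argument checks are omitted.

```cpp
// Illustrative restatement of the outer/mid/inner split performed by
// GetCumprodDimInfo in the hunk below (checks omitted).
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

static void SplitDims(const std::vector<int64_t>& dims, int axis,
                      size_t* outer, size_t* mid, size_t* inner) {
  if (axis < 0) axis += static_cast<int>(dims.size());  // negative axis wraps
  *outer = 1;
  for (int i = 0; i < axis; ++i) *outer *= static_cast<size_t>(dims[i]);
  *mid = static_cast<size_t>(dims[axis]);
  *inner = 1;
  for (int i = axis + 1; i < static_cast<int>(dims.size()); ++i) {
    *inner *= static_cast<size_t>(dims[i]);
  }
}

int main() {
  size_t outer = 0, mid = 0, inner = 0;
  SplitDims({2, 3, 4}, 1, &outer, &mid, &inner);
  // Shape [2, 3, 4], axis 1  ->  outer=2 mid=3 inner=4, i.e. 2*4 independent
  // scans of length 3 for a cumprod along that axis.
  std::printf("outer=%zu mid=%zu inner=%zu\n", outer, mid, inner);
  return 0;
}
```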
+ +#pragma once +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { +static void GetCumprodDimInfo(const DDim& dim, + int cumprod_dim, + size_t* outer_dim, + size_t* mid_dim, + size_t* inner_dim) { + PADDLE_ENFORCE_GE( + cumprod_dim, + -dim.size(), + phi::errors::InvalidArgument( + "The input dim of CumprodOp should be larger than the opposite " + "rank of input x which is %d.But received dim=%d", + -dim.size(), + cumprod_dim)); + PADDLE_ENFORCE_LT(cumprod_dim, + dim.size(), + phi::errors::InvalidArgument( + "The input dim of CumprodOp should be smaller than the " + "rank of input x which is %d.But received dim=%d", + dim.size(), + cumprod_dim)); + if (cumprod_dim < 0) cumprod_dim += dim.size(); + + *outer_dim = 1; + for (int i = 0; i < cumprod_dim; ++i) { + *outer_dim *= dim[i]; + } + *mid_dim = dim[cumprod_dim]; + *inner_dim = 1; + for (int i = cumprod_dim + 1; i < dim.size(); ++i) { + *inner_dim *= dim[i]; + } +} +} // namespace phi diff --git a/paddle/phi/kernels/funcs/elementwise_functor.h b/paddle/phi/kernels/funcs/elementwise_functor.h index b01d50015f01ad2fb2b1ab7c0c0be6f4f1b5acb8..ac262fe2d571e587e3bdfa6a2d4e58bd5b865e68 100644 --- a/paddle/phi/kernels/funcs/elementwise_functor.h +++ b/paddle/phi/kernels/funcs/elementwise_functor.h @@ -67,6 +67,11 @@ struct InverseMultiplyFunctor { } }; +template +struct IsZeroFunctor { + HOSTDEVICE bool operator()(T x) const { return x == static_cast(0); } +}; + // Divide #define DIV_ERROR_INFO \ "InvalidArgumentError: Integer division by zero encountered in " \ @@ -159,6 +164,219 @@ struct DivGradYFunctor> { return -a * out_div_c_conj; } }; +// Fmin +template +struct FMinFunctor { + inline HOSTDEVICE T operator()(const T a, const T b) const { + return std::fmin(a, b); + } +}; + +template <> +struct FMinFunctor { + inline HOSTDEVICE dtype::float16 operator()(const dtype::float16 a, + const dtype::float16 b) const { + float float_a = static_cast(a); + float float_b = static_cast(b); + auto result = std::fmin(float_a, float_b); + return static_cast(result); + } +}; + +template <> +struct FMinFunctor { + inline HOSTDEVICE int operator()(const int a, const int b) const { + float float_a = static_cast(a); + float float_b = static_cast(b); + auto result = std::fmin(float_a, float_b); + return std::lrint(result); + } +}; + +template <> +struct FMinFunctor { + inline HOSTDEVICE int64_t operator()(const int64_t a, const int64_t b) const { + double double_a = static_cast(a); + double double_b = static_cast(b); + auto result = std::fmin(double_a, double_b); + return std::llrint(result); + } +}; + +// Fmax +template +struct FMaxFunctor { + inline HOSTDEVICE T operator()(const T a, const T b) const { + return std::fmax(a, b); + } +}; + +template <> +struct FMaxFunctor { + inline HOSTDEVICE dtype::float16 operator()(const dtype::float16 a, + const dtype::float16 b) const { + float float_a = static_cast(a); + float float_b = static_cast(b); + auto result = std::fmax(float_a, float_b); + return static_cast(result); + } +}; + +template <> +struct FMaxFunctor { + inline HOSTDEVICE int operator()(const int a, const int b) const { + float float_a = static_cast(a); + float float_b = static_cast(b); + auto result = std::fmax(float_a, float_b); + return std::lrint(result); + } +}; + +template <> +struct FMaxFunctor { + inline HOSTDEVICE int64_t operator()(const int64_t a, const int64_t b) const { + double double_a = static_cast(a); + double double_b = static_cast(b); + auto result = std::fmax(double_a, double_b); + return 
std::llrint(result); + } +}; + +template +struct FMaxGradDx { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * static_cast((x >= y) || isnan(y)); + } +}; + +template <> +struct FMaxGradDx { + HOSTDEVICE dtype::float16 operator()(dtype::float16 x, + dtype::float16 y, + dtype::float16 out, + dtype::float16 dout) const { + return dout * static_cast((x >= y) || dtype::isnan(y)); + } +}; + +template <> +struct FMaxGradDx { + HOSTDEVICE int operator()(int x, int y, int out, int dout) const { + return dout * static_cast((x >= y)); + } +}; + +template <> +struct FMaxGradDx { + HOSTDEVICE int64_t operator()(int64_t x, + int64_t y, + int64_t out, + int64_t dout) const { + return dout * static_cast((x >= y)); + } +}; + +template +struct FMaxGradDy { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * static_cast(!((x >= y) || isnan(y))); + } +}; + +template <> +struct FMaxGradDy { + HOSTDEVICE dtype::float16 operator()(dtype::float16 x, + dtype::float16 y, + dtype::float16 out, + dtype::float16 dout) const { + return dout * static_cast(!((x >= y) || dtype::isnan(y))); + } +}; + +template <> +struct FMaxGradDy { + HOSTDEVICE int64_t operator()(int64_t x, + int64_t y, + int64_t out, + int64_t dout) const { + return dout * static_cast(!((x >= y))); + } +}; + +template <> +struct FMaxGradDy { + HOSTDEVICE int operator()(int x, int y, int out, int dout) const { + return dout * static_cast(!((x >= y))); + } +}; + +template +struct FMinGradDx { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * static_cast((x <= y) || isnan(y)); + } +}; + +template <> +struct FMinGradDx { + HOSTDEVICE dtype::float16 operator()(dtype::float16 x, + dtype::float16 y, + dtype::float16 out, + dtype::float16 dout) const { + return dout * static_cast((x <= y) || dtype::isnan(y)); + } +}; + +template <> +struct FMinGradDx { + HOSTDEVICE int operator()(int x, int y, int out, int dout) const { + return dout * static_cast((x <= y)); + } +}; + +template <> +struct FMinGradDx { + HOSTDEVICE int64_t operator()(int64_t x, + int64_t y, + int64_t out, + int64_t dout) const { + return dout * static_cast((x <= y)); + } +}; + +template +struct FMinGradDy { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * static_cast(!((x <= y) || isnan(y))); + } +}; + +template <> +struct FMinGradDy { + HOSTDEVICE dtype::float16 operator()(dtype::float16 x, + dtype::float16 y, + dtype::float16 out, + dtype::float16 dout) const { + return dout * static_cast(!((x <= y) || dtype::isnan(y))); + } +}; + +template <> +struct FMinGradDy { + HOSTDEVICE int operator()(int x, int y, int out, int dout) const { + return dout * static_cast(!((x <= y))); + } +}; + +template <> +struct FMinGradDy { + HOSTDEVICE int64_t operator()(int64_t x, + int64_t y, + int64_t out, + int64_t dout) const { + return dout * static_cast(!((x <= y))); + } +}; template struct MultiplyGradFunctor { diff --git a/paddle/phi/kernels/funcs/layer_norm_util.h b/paddle/phi/kernels/funcs/layer_norm_util.h new file mode 100644 index 0000000000000000000000000000000000000000..e78730cbf38495637e4bd4c455a3f522b38a9017 --- /dev/null +++ b/paddle/phi/kernels/funcs/layer_norm_util.h @@ -0,0 +1,165 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { +namespace funcs { + +// Wrap RowwiseMean and ColwiseMean. +// Reuse the cpu codes and replace the gpu codes with cublas_gemv, which is +// significantly faster. Unlike the RowwiseMean and ColwiseMean, the +// implementation only considers 2D. +template +struct RowwiseMean2D { + RowwiseMean2D(int left, int right, const DeviceContext& dev_ctx); + + void operator()(const DeviceContext& context, + const DenseTensor& input, + DenseTensor* vec); +}; + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template +class RowwiseMean2D { + public: + RowwiseMean2D(int left, int right, const DeviceContext& dev_ctx) + : left_(left), right_(right) { + DDim ones_dim({right_}); + divisor_.Resize(ones_dim); + dev_ctx.template Alloc(&divisor_); + phi::funcs::set_constant(dev_ctx, &divisor_, 1.0 / right); + } + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + DenseTensor* out) { + phi::funcs::GetBlas(context).GEMV(false, + left_, + right_, + 1., + input.data(), + divisor_.data(), + 0., + out->data()); + } + + private: + int left_; + int right_; + DenseTensor divisor_; +}; +#endif + +template +class RowwiseMean2D { + public: + RowwiseMean2D(int left, int right, const DeviceContext& dev_ctx) {} + + void operator()(const phi::CPUContext& context, + const DenseTensor& input, + DenseTensor* out) { + row_mean_(context, input, out); + } + + private: + phi::funcs::RowwiseMean row_mean_; +}; + +template +struct ColwiseSum2D { + ColwiseSum2D(int left, int right, const DeviceContext& dev_ctx); + + void operator()(const phi::DeviceContext& context, + const DenseTensor& input, + DenseTensor* vec); +}; + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template +class ColwiseSum2D { + public: + ColwiseSum2D(int left, int right, const phi::GPUContext& dev_ctx) + : left_(left), right_(right) { + DDim ones_dim({left_}); + divisor_.Resize(ones_dim); + dev_ctx.template Alloc(&divisor_); + phi::funcs::set_constant(dev_ctx, &divisor_, 1.0); + } + + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + DenseTensor* out) { + phi::funcs::GetBlas(context).GEMV(true, + left_, + right_, + 1., + input.data(), + divisor_.data(), + 0., + out->data()); + } + + private: + int left_; + int right_; + DenseTensor divisor_; +}; +#endif + +template +class ColwiseSum2D { + public: + ColwiseSum2D(int left, int right, const phi::CPUContext& dev_ctx) {} + + void operator()(const phi::CPUContext& context, + const DenseTensor& input, + DenseTensor* out) { + col_wise_(context, input, out); + } + + private: + phi::funcs::ColwiseSum col_wise_; +}; + +template +struct SubAndSquareFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); } +}; + +template +struct DivAndSqrtFunctor { + explicit DivAndSqrtFunctor(T epsilon) { epsilon_ = epsilon; } + inline 
HOSTDEVICE T operator()(T a, T b) const { + return a / (sqrt(b + epsilon_)); + } + + private: + T epsilon_; +}; + +template +struct MulInvVarFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { + return a * std::sqrt(1.0 / b); + } +}; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/math_function.cc b/paddle/phi/kernels/funcs/math_function.cc index 4201a75be8ac7ee9f7e633f6def1e002ce4b7e8a..afa2214f5b9df968d9fe01f6310e151c12e19362 100644 --- a/paddle/phi/kernels/funcs/math_function.cc +++ b/paddle/phi/kernels/funcs/math_function.cc @@ -331,12 +331,20 @@ template struct ColwiseSum; template struct ColwiseSum; template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; + template struct RowwiseSum; template struct RowwiseSum; template struct RowwiseMean; template struct RowwiseMean; +template struct RowwiseMean; +template struct RowwiseMean; + template struct ElementwiseAddTo { void operator()(paddle::platform::CPUDeviceContext* ctx, diff --git a/paddle/phi/kernels/funcs/matrix_inverse.h b/paddle/phi/kernels/funcs/matrix_inverse.h index c5b04a8106561962b6916907d86450a63c763830..1c6756f1720a23ada5bb4ff2fdb4f4840660ed58 100644 --- a/paddle/phi/kernels/funcs/matrix_inverse.h +++ b/paddle/phi/kernels/funcs/matrix_inverse.h @@ -39,7 +39,7 @@ void ComputeInverseEigen(const Context& dev_ctx, int batch_size = rank > 2 ? a.numel() / (n * n) : 1; const T* a_ptr = a.data(); - T* a_inv_ptr = a_inv->mutable_data(dev_ctx.GetPlace()); + T* a_inv_ptr = dev_ctx.template Alloc(a_inv); for (int i = 0; i < batch_size; ++i) { ConstEigenMatrixMap mat(a_ptr + i * n * n, n, n); diff --git a/paddle/phi/kernels/funcs/mode.h b/paddle/phi/kernels/funcs/mode.h new file mode 100644 index 0000000000000000000000000000000000000000..1b7641762e2639acf3db540280891b518f22eed2 --- /dev/null +++ b/paddle/phi/kernels/funcs/mode.h @@ -0,0 +1,197 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
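Taken together, the helpers introduced above in layer_norm_util.h supply the pieces of the usual layer-norm transform: a rowwise mean, a variance accumulated with SubAndSquareFunctor, and a normalization with DivAndSqrtFunctor, i.e. y = (x - mean) / sqrt(var + eps). The CPU-only sketch below shows that composition for a single row with unit scale and zero shift; it illustrates the math, it is not the kernel that consumes these functors, and the epsilon value is an assumption.

```cpp
// Minimal single-row illustration of how the layer_norm_util.h functors
// compose; scale (gamma) and shift (beta) are left out.
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const float eps = 1e-5f;  // assumed epsilon, not taken from the patch
  std::vector<float> row = {1.f, 2.f, 3.f, 4.f};

  float mean = 0.f;  // RowwiseMean2D over a single row
  for (float v : row) mean += v;
  mean /= static_cast<float>(row.size());

  float var = 0.f;  // SubAndSquareFunctor: (x - mean)^2, then a mean
  for (float v : row) var += (v - mean) * (v - mean);
  var /= static_cast<float>(row.size());

  for (float v : row) {  // DivAndSqrtFunctor: a / sqrt(b + eps)
    float y = (v - mean) / std::sqrt(var + eps);
    std::printf("%f ", y);
  }
  std::printf("\n");
  return 0;
}
```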
+ +#pragma once + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include +#include +#include +#include +#include +#include +#include +#include +#endif + +#include +#include +#include +#include +#ifdef PADDLE_WITH_MKLML +#include +#endif + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { +namespace funcs { + +static int ComputeBlockSize(int col) { + if (col > 512) + return 1024; + else if (col > 256 && col <= 512) + return 512; + else if (col > 128 && col <= 256) + return 256; + else if (col > 64 && col <= 128) + return 128; + else + return 64; +} + +static inline void GetDims( + const phi::DDim& dim, int axis, int* pre, int* n, int* post) { + *pre = 1; + *post = 1; + *n = dim[axis]; + for (int i = 0; i < axis; ++i) { + (*pre) *= dim[i]; + } + for (int i = axis + 1; i < dim.size(); ++i) { + (*post) *= dim[i]; + } +} + +template +static void GetMode(Type input_height, + Type input_width, + int input_dim, + const DenseTensor* input, + T* t_out, + Type* t_indices) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (Type i = 0; i < input_height; ++i) { + std::vector> col_vec; + col_vec.reserve(input_width); + if (input_dim == 1) { + auto e_input = EigenVector::Flatten(*input); + for (Type j = 0; j < input_width; ++j) { + col_vec.emplace_back(std::pair(e_input(j), j)); + } + } else { + auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); + for (Type j = 0; j < input_width; ++j) { + col_vec.emplace_back(std::pair(e_input(i, j), j)); + } + } + std::sort(col_vec.begin(), + col_vec.end(), + [](const std::pair& l, const std::pair& r) { + return (!std::isnan(static_cast(l.first)) && + std::isnan(static_cast(r.first))) || + (l.first < r.first); + }); + T mode = 0; + int64_t indice = 0; + int64_t cur_freq = 0; + int64_t max_freq = 0; + for (int64_t i = 0; i < input_width; ++i) { + ++cur_freq; + if (i == input_width - 1 || (col_vec[i + 1].first != col_vec[i].first)) { + if (cur_freq > max_freq) { + max_freq = cur_freq; + mode = col_vec[i].first; + indice = col_vec[i].second; + } + cur_freq = 0; + } + } + t_out[i] = mode; + t_indices[i] = indice; + } +} + +template +static void ModeAssign(const Type& input_height, + const Type& input_width, + const int& input_dim, + const DenseTensor* input, + const DenseTensor* indices, + T* output_data) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (Type i = 0; i < input_height; ++i) { + if (input_dim == 1) { + auto e_input = EigenVector::Flatten(*input); + auto e_indices = EigenVector::Flatten(*indices); + output_data[i * input_width + e_indices(0)] = e_input(0); + } else { + auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); + auto e_indices = EigenMatrix::Reshape(*indices, input_dim - 1); + output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0); + } + } +} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template +static void GetModebySort(const phi::GPUContext& dev_ctx, + const DenseTensor* input_tensor, + const int64_t num_cols, + const int64_t num_rows, + T* out_tensor, + int64_t* indices_tensor) { + DenseTensor input_tmp; + input_tmp.Resize(phi::make_ddim({num_rows, num_cols})); + T* input_tmp_data = dev_ctx.Alloc(&input_tmp); + phi::Copy(dev_ctx, *input_tensor, dev_ctx.GetPlace(), false, &input_tmp); + + 
thrust::device_ptr out_tensor_ptr(out_tensor); + thrust::device_ptr indices_tensor_ptr(indices_tensor); + + for (int64_t i = 0; i < num_rows; ++i) { + T* begin = input_tmp_data + num_cols * i; + T* end = input_tmp_data + num_cols * (i + 1); + thrust::device_vector indices_data(num_cols); + thrust::sequence( + thrust::device, indices_data.begin(), indices_data.begin() + num_cols); + thrust::sort_by_key(thrust::device, begin, end, indices_data.begin()); + int unique = 1 + thrust::inner_product(thrust::device, + begin, + end - 1, + begin + 1, + 0, + thrust::plus(), + thrust::not_equal_to()); + thrust::device_vector keys_data(unique); + thrust::device_vector cnts_data(unique); + thrust::reduce_by_key(thrust::device, + begin, + end, + thrust::constant_iterator(1), + keys_data.begin(), + cnts_data.begin()); + auto it = thrust::max_element( + thrust::device, cnts_data.begin(), cnts_data.begin() + unique); + T mode = keys_data[it - cnts_data.begin()]; + int64_t counts = cnts_data[it - cnts_data.begin()]; + auto pos = thrust::find(thrust::device, begin, end, mode); + int64_t index = indices_data[pos - begin + counts - 1]; + out_tensor_ptr[i] = static_cast(mode); + indices_tensor_ptr[i] = static_cast(index); + } +} +#endif + +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/pooling.cc b/paddle/phi/kernels/funcs/pooling.cc similarity index 83% rename from paddle/fluid/operators/math/pooling.cc rename to paddle/phi/kernels/funcs/pooling.cc index f2e5e955ec487585deee1cbebba3d2932ee1b05d..10c88b9798c6ff69b755aa2c7423558c35afe859 100644 --- a/paddle/fluid/operators/math/pooling.cc +++ b/paddle/phi/kernels/funcs/pooling.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -11,11 +11,15 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/math/pooling.h" -namespace paddle { -namespace operators { -namespace math { +#include "paddle/phi/kernels/funcs/pooling.h" + +#include +#include +#include "paddle/phi/backends/cpu/cpu_context.h" + +namespace phi { +namespace funcs { /* * Tensors are in NCHW or NHWC format. @@ -25,13 +29,16 @@ namespace math { * height_down, width_left and width_right, respectively. 
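One step in GetModebySort above is easy to misread: `unique = 1 + thrust::inner_product(begin, end - 1, begin + 1, 0, plus, not_equal_to)`. On a sorted row this counts the adjacent positions where neighbouring values differ, so `unique` is exactly the number of distinct values, i.e. the size needed for the keys_data/cnts_data buffers fed to reduce_by_key. A host-side restatement of the same identity (illustrative only, not part of the patch):

// Host-side analogue of the device-side counting trick in GetModebySort.
#include <cassert>
#include <functional>
#include <numeric>
#include <vector>

int CountDistinctSorted(const std::vector<int>& sorted) {
  if (sorted.empty()) return 0;
  return 1 + std::inner_product(sorted.begin(), sorted.end() - 1,
                                sorted.begin() + 1, 0,
                                std::plus<int>(),           // accumulate the flags
                                std::not_equal_to<int>());  // 1 where neighbours differ
}

int main() {
  assert(CountDistinctSorted({1, 1, 2, 3, 3, 3}) == 3);  // runs: {1,1}, {2}, {3,3,3}
  return 0;
}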
*/ template -class Pool2dFunctor { +class Pool2dFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const CPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* output, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* output, PoolProcess pool_process) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; @@ -50,7 +57,7 @@ class Pool2dFunctor { const int output_stride = output_height * output_width; const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); + T* output_data = context.template Alloc(output); int hstart = 0, hend = 1; int wstart = 0, wend = 1; @@ -101,12 +108,16 @@ class Pool2dFunctor { } } - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const CPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* output, PoolProcess pool_process) { + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* output, + PoolProcess pool_process) { bool channel_last = (data_format == "NHWC"); const int batch_size = input.dims()[0]; @@ -131,7 +142,7 @@ class Pool2dFunctor { const int padding_width = paddings[1]; const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); + T* output_data = context.template Alloc(output); int hstart = 0, hend = 1; int wstart = 0, wend = 1; @@ -244,14 +255,19 @@ class Pool2dFunctor { * height_down, width_left and width_right, respectively. 
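The hstart/hend/wstart/wend locals in the non-adaptive paths of Pool2dFunctor above hold the current pooling window clipped against the input. The computation is the standard one; a compact restatement follows (hedged: this mirrors the usual formulation rather than quoting the reflowed code, and it omits the adaptive branch):

// Half-open window [start, end) read by output index `out_idx` along one axis.
#include <algorithm>

struct Window { int start; int end; };

inline Window PoolWindow(int out_idx, int stride, int ksize, int padding,
                         int input_size) {
  int start = out_idx * stride - padding;          // may start inside the padding
  int end = std::min(start + ksize, input_size);   // clip to the input extent
  start = std::max(start, 0);
  return {start, end};
}

For average pooling, the clipped extent (end - start, times the other axis' extent) is the divisor when `exclusive` is true; when `exclusive` is false the divisor stays at ksize_height * ksize_width even for windows that overlap the padding.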
*/ template -class Pool2dGradFunctor { +class Pool2dGradFunctor { public: - void operator()( - const platform::CPUDeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, const framework::Tensor& output_grad, - const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool exclusive, bool adaptive, - framework::Tensor* input_grad, PoolProcess pool_grad_process) { + void operator()(const CPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_grad_process) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -270,7 +286,7 @@ class Pool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int hstart = 0, hend = 1; int wstart = 0, wend = 1; @@ -324,13 +340,18 @@ class Pool2dGradFunctor { } } - void operator()( - const platform::CPUDeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, const framework::Tensor& output_grad, - const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, const std::string data_format, - bool exclusive, bool adaptive, framework::Tensor* input_grad, - PoolProcess pool_grad_process) { + void operator()(const CPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_grad_process) { bool channel_last = (data_format == "NHWC"); const int batch_size = input.dims()[0]; @@ -357,7 +378,7 @@ class Pool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int hstart = 0, hend = 1; int wstart = 0, wend = 1; @@ -451,10 +472,11 @@ class Pool2dGradFunctor { h * input_width * input_channels + w * input_channels + c; auto output_idx = ph * output_width * output_channels + pw * output_channels + c; - pool_grad_process.compute( - input_data[input_idx], output_data[output_idx], - output_grad_data[output_idx], static_cast(scale), - input_grad_data + input_idx); + pool_grad_process.compute(input_data[input_idx], + output_data[output_idx], + output_grad_data[output_idx], + static_cast(scale), + input_grad_data + input_idx); } } } @@ -477,13 +499,16 @@ class Pool2dGradFunctor { * height_down, width_left and width_right, respectively. 
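Pool2dGradFunctor above only walks windows and supplies the 1/pool_size scale; what actually lands in input_grad is decided by the pool_grad_process policy passed in. The policies used by the instantiations at the bottom of this file behave roughly as below (an illustrative reconstruction; the real AvgPoolGrad/MaxPoolGrad functors live in pooling.h, which this section of the diff does not show):

// Hypothetical stand-ins for the PoolProcess gradient policies.
template <typename T>
struct AvgPoolGradSketch {
  // Every input covered by the window gets an equal share of the upstream grad.
  inline void compute(const T& /*x*/, const T& /*y*/, const T& dy, T scale,
                      T* dx) const {
    *dx += scale * dy;  // scale is the 1 / pool_size passed by the functor
  }
};

template <typename T>
struct MaxPoolGradSketch {
  // Only the position that produced the max (x == y) receives the gradient.
  inline void compute(const T& x, const T& y, const T& dy, T /*scale*/,
                      T* dx) const {
    *dx += dy * static_cast<T>(x == y);
  }
};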
*/ template -class MaxPool2dGradFunctor { +class MaxPool2dGradFunctor { public: - void operator()( - const platform::CPUDeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, const framework::Tensor& output_grad, - const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, framework::Tensor* input_grad) { + void operator()(const CPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + DenseTensor* input_grad) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -502,7 +527,7 @@ class MaxPool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { @@ -536,12 +561,15 @@ class MaxPool2dGradFunctor { } } - void operator()( - const platform::CPUDeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, const framework::Tensor& output_grad, - const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, const std::string data_format, - framework::Tensor* input_grad) { + void operator()(const CPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + DenseTensor* input_grad) { bool channel_last = (data_format == "NHWC"); const int batch_size = input.dims()[0]; @@ -568,7 +596,7 @@ class MaxPool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); if (!channel_last) { const int input_stride = input_height * input_width; @@ -641,29 +669,17 @@ class MaxPool2dGradFunctor { } } }; -template class MaxPool2dGradFunctor; -template class MaxPool2dGradFunctor; - -template class Pool2dFunctor, float>; -template class Pool2dFunctor, float>; -template class Pool2dGradFunctor, - float>; -template class Pool2dGradFunctor, - float>; -template class Pool2dFunctor, double>; -template class Pool2dFunctor, double>; -template class Pool2dGradFunctor, - double>; -template class Pool2dGradFunctor, - double>; +template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; + +template class Pool2dFunctor, float>; +template class Pool2dFunctor, float>; +template class Pool2dGradFunctor, float>; +template class Pool2dGradFunctor, float>; +template class Pool2dFunctor, double>; +template class Pool2dFunctor, double>; +template class Pool2dGradFunctor, double>; +template class Pool2dGradFunctor, double>; /* * Tensors are in NCDHW or NDHWC format. 
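The explicit-instantiation block just above has lost its template arguments to formatting; restored, each line has the shape shown below. Treat this as a best-effort reconstruction inferred from the functor templates (the lines sit inside namespace phi::funcs, hence the unqualified MaxPool/AvgPool), not a quote of the patch:

template class MaxPool2dGradFunctor<phi::CPUContext, float>;
template class MaxPool2dGradFunctor<phi::CPUContext, double>;

template class Pool2dFunctor<phi::CPUContext, MaxPool<float>, float>;
template class Pool2dFunctor<phi::CPUContext, AvgPool<float>, float>;
template class Pool2dGradFunctor<phi::CPUContext, MaxPoolGrad<float>, float>;
template class Pool2dGradFunctor<phi::CPUContext, AvgPoolGrad<float>, float>;
// ...followed by the same four Pool2d lines with double in place of float.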
@@ -674,13 +690,16 @@ template class Pool2dGradFunctor -class Pool3dFunctor { +class Pool3dFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const CPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* output, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* output, PoolProcess pool_process) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; @@ -704,7 +723,7 @@ class Pool3dFunctor { const int output_stride = output_depth * output_height * output_width; const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); + T* output_data = context.template Alloc(output); int dstart = 0, dend = 1; int hstart = 0, hend = 1; @@ -771,12 +790,16 @@ class Pool3dFunctor { } } } - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const CPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* output, PoolProcess pool_process) { + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* output, + PoolProcess pool_process) { bool channel_last = (data_format == "NDHWC"); const int batch_size = input.dims()[0]; @@ -807,7 +830,7 @@ class Pool3dFunctor { const int padding_width = paddings[2]; const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); + T* output_data = context.template Alloc(output); int dstart = 0, dend = 1; int hstart = 0, hend = 1; @@ -966,14 +989,19 @@ class Pool3dFunctor { * height_up, height_down, width_left and width_right, respectively. 
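Both the 2-D and 3-D functors above also take an `adaptive` flag. In that branch the window is not stride/padding based; each output index is mapped onto the input with the usual adaptive-pooling rule, whose helpers live in pooling.h rather than in this hunk. A hedged restatement with a small worked case:

// Hedged restatement of the adaptive start/end index mapping (not the patched code).
#include <cassert>
#include <cmath>

inline int AdaptStart(int out_idx, int in_size, int out_size) {
  return static_cast<int>(
      std::floor(static_cast<double>(out_idx * in_size) / out_size));
}
inline int AdaptEnd(int out_idx, int in_size, int out_size) {
  return static_cast<int>(
      std::ceil(static_cast<double>((out_idx + 1) * in_size) / out_size));
}

int main() {
  // Adaptive 5 -> 2: windows [0, 3) and [2, 5) cover the input, overlapping by one.
  assert(AdaptStart(0, 5, 2) == 0 && AdaptEnd(0, 5, 2) == 3);
  assert(AdaptStart(1, 5, 2) == 2 && AdaptEnd(1, 5, 2) == 5);
  return 0;
}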
*/ template -class Pool3dGradFunctor { +class Pool3dGradFunctor { public: - void operator()( - const platform::CPUDeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, const framework::Tensor& output_grad, - const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool exclusive, bool adaptive, - framework::Tensor* input_grad, PoolProcess pool_grad_process) { + void operator()(const CPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_grad_process) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; @@ -997,7 +1025,7 @@ class Pool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int dstart = 0, dend = 1; int hstart = 0, hend = 1; @@ -1051,10 +1079,11 @@ class Pool3dGradFunctor { int input_idx = (d * input_height + h) * input_width + w; int output_idx = (pd * output_height + ph) * output_width + pw; - pool_grad_process.compute( - input_data[input_idx], output_data[output_idx], - output_grad_data[output_idx], static_cast(scale), - input_grad_data + input_idx); + pool_grad_process.compute(input_data[input_idx], + output_data[output_idx], + output_grad_data[output_idx], + static_cast(scale), + input_grad_data + input_idx); } } } @@ -1068,13 +1097,18 @@ class Pool3dGradFunctor { } } } - void operator()( - const platform::CPUDeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, const framework::Tensor& output_grad, - const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, const std::string data_format, - bool exclusive, bool adaptive, framework::Tensor* input_grad, - PoolProcess pool_grad_process) { + void operator()(const CPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_grad_process) { bool channel_last = (data_format == "NDHWC"); const int batch_size = input.dims()[0]; @@ -1105,7 +1139,7 @@ class Pool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int dstart = 0, dend = 1; int hstart = 0, hend = 1; @@ -1164,10 +1198,11 @@ class Pool3dGradFunctor { int input_idx = (d * input_height + h) * input_width + w; int output_idx = (pd * output_height + ph) * output_width + pw; - pool_grad_process.compute( - input_data[input_idx], output_data[output_idx], - output_grad_data[output_idx], static_cast(scale), - input_grad_data + input_idx); + pool_grad_process.compute(input_data[input_idx], + output_data[output_idx], + output_grad_data[output_idx], + static_cast(scale), + input_grad_data + input_idx); } } } @@ -1241,10 +1276,11 @@ class Pool3dGradFunctor { ((pd * output_height + ph) * 
output_width + pw) * output_channels + c; - pool_grad_process.compute( - input_data[input_idx], output_data[output_idx], - output_grad_data[output_idx], static_cast(scale), - input_grad_data + input_idx); + pool_grad_process.compute(input_data[input_idx], + output_data[output_idx], + output_grad_data[output_idx], + static_cast(scale), + input_grad_data + input_idx); } } } @@ -1270,13 +1306,16 @@ class Pool3dGradFunctor { * height_up, height_down, width_left and width_right, respectively. */ template -class MaxPool3dGradFunctor { +class MaxPool3dGradFunctor { public: - void operator()( - const platform::CPUDeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, const framework::Tensor& output_grad, - const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, framework::Tensor* input_grad) { + void operator()(const CPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + DenseTensor* input_grad) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; @@ -1300,7 +1339,7 @@ class MaxPool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { @@ -1342,12 +1381,15 @@ class MaxPool3dGradFunctor { } } } - void operator()( - const platform::CPUDeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, const framework::Tensor& output_grad, - const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, const std::string data_format, - framework::Tensor* input_grad) { + void operator()(const CPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + DenseTensor* input_grad) { bool channel_last = (data_format == "NDHWC"); const int batch_size = input.dims()[0]; @@ -1378,7 +1420,7 @@ class MaxPool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); if (!channel_last) { const int input_stride = input_depth * input_height * input_width; @@ -1475,29 +1517,17 @@ class MaxPool3dGradFunctor { } } }; -template class MaxPool3dGradFunctor; -template class MaxPool3dGradFunctor; - -template class Pool3dFunctor, float>; -template class Pool3dFunctor, float>; -template class Pool3dGradFunctor, - float>; -template class Pool3dGradFunctor, - float>; -template class Pool3dFunctor, double>; -template class Pool3dFunctor, double>; -template class Pool3dGradFunctor, - double>; -template class Pool3dGradFunctor, - double>; +template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; + +template class Pool3dFunctor, float>; +template class Pool3dFunctor, float>; +template class Pool3dGradFunctor, float>; +template class Pool3dGradFunctor, float>; +template class Pool3dFunctor, double>; +template class 
Pool3dFunctor, double>; +template class Pool3dGradFunctor, double>; +template class Pool3dGradFunctor, double>; /* * All tensors are in NCHW format. @@ -1505,13 +1535,16 @@ template class Pool3dGradFunctor -class MaxPool2dWithIndexFunctor { +class MaxPool2dWithIndexFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const CPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* output, framework::Tensor* mask) { + const std::vector& paddings, + bool adaptive, + DenseTensor* output, + DenseTensor* mask) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -1528,8 +1561,8 @@ class MaxPool2dWithIndexFunctor { const int output_stride = output_height * output_width; const T1* input_data = input.data(); - T1* output_data = output->mutable_data(context.GetPlace()); - T2* mask_data = mask->mutable_data(context.GetPlace()); + T1* output_data = context.template Alloc(output); + T2* mask_data = context.template Alloc(mask); int hstart, hend; int wstart, wend; @@ -1583,14 +1616,16 @@ class MaxPool2dWithIndexFunctor { * height and width, respectively. */ template -class MaxPool2dWithIndexGradFunctor { +class MaxPool2dWithIndexGradFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& output_grad, - const framework::Tensor& mask, const std::vector& ksize, + void operator()(const CPUContext& context, + const DenseTensor& output_grad, + const DenseTensor& mask, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* input_grad) { + const std::vector& paddings, + bool adaptive, + DenseTensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_height = input_grad->dims()[2]; const int input_width = input_grad->dims()[3]; @@ -1602,7 +1637,7 @@ class MaxPool2dWithIndexGradFunctor { const T2* mask_data = mask.data(); const T1* output_grad_data = output_grad.data(); - T1* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T1* input_grad_data = context.template Alloc(input_grad); for (int n = 0; n < batch_size; ++n) { for (int c = 0; c < output_channels; ++c) { @@ -1622,14 +1657,10 @@ class MaxPool2dWithIndexGradFunctor { } }; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; /* * All tensors are in NCDHW format. 
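MaxPool2dWithIndexFunctor above differs from the plain max-pool path in that the forward pass records the flattened offset of each window's argmax in `mask`, and the gradient functor then does nothing but scatter-add the upstream gradient through those offsets. A 1-D illustration of that contract (hypothetical helper, not part of the patch):

// 1-D analogue of the with-index backward pass.
#include <vector>

void MaxPoolWithIndexBackward1D(const std::vector<float>& output_grad,
                                const std::vector<int>& mask,  // argmax offsets from forward
                                std::vector<float>* input_grad) {
  for (size_t i = 0; i < output_grad.size(); ++i) {
    (*input_grad)[mask[i]] += output_grad[i];  // += because windows may overlap
  }
}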
@@ -1637,13 +1668,16 @@ template class MaxPool2dWithIndexGradFunctor -class MaxPool3dWithIndexFunctor { +class MaxPool3dWithIndexFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const CPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* output, framework::Tensor* mask) { + const std::vector& paddings, + bool adaptive, + DenseTensor* output, + DenseTensor* mask) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; @@ -1665,8 +1699,8 @@ class MaxPool3dWithIndexFunctor { const int output_stride = output_depth * output_height * output_width; const T1* input_data = input.data(); - T1* output_data = output->mutable_data(context.GetPlace()); - T2* mask_data = mask->mutable_data(context.GetPlace()); + T1* output_data = context.template Alloc(output); + T2* mask_data = context.template Alloc(mask); int dstart, dend; int hstart, hend; @@ -1735,14 +1769,16 @@ class MaxPool3dWithIndexFunctor { * depth, height and width, respectively. */ template -class MaxPool3dWithIndexGradFunctor { +class MaxPool3dWithIndexGradFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& output_grad, - const framework::Tensor& mask, const std::vector& ksize, + void operator()(const CPUContext& context, + const DenseTensor& output_grad, + const DenseTensor& mask, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* input_grad) { + const std::vector& paddings, + bool adaptive, + DenseTensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_depth = input_grad->dims()[2]; const int input_height = input_grad->dims()[3]; @@ -1756,7 +1792,7 @@ class MaxPool3dWithIndexGradFunctor { const T2* mask_data = mask.data(); const T1* output_grad_data = output_grad.data(); - T1* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T1* input_grad_data = context.template Alloc(input_grad); for (int n = 0; n < batch_size; ++n) { for (int c = 0; c < output_channels; ++c) { @@ -1779,14 +1815,9 @@ class MaxPool3dWithIndexGradFunctor { } }; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; -} // namespace math -} // namespace operators -} // namespace paddle +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/phi/kernels/funcs/pooling.cu similarity index 54% rename from paddle/fluid/operators/math/pooling.cu rename to paddle/phi/kernels/funcs/pooling.cu index 9d96345eb1f6dca6fc5eb6cf5847baaf1a9019da..417c1cd234754f994383988c63ff44ba06794822 100644 --- a/paddle/fluid/operators/math/pooling.cu +++ b/paddle/phi/kernels/funcs/pooling.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 paddlepaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
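The CUDA side of the same migration starts here. Every __global__ kernel in pooling.cu iterates with the same grid-stride loop, so one launch covers `nthreads` work items regardless of the grid size the launch config picks. A stripped-down sketch of the pattern (hypothetical kernel; the real ones decode `index` into n/c/h/w with the FastDivMod helpers defined just below):

// Minimal grid-stride loop, the skeleton shared by the pooling kernels in this file.
__global__ void GridStrideCopy(int nthreads, const float* in, float* out) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
       index += blockDim.x * gridDim.x) {
    out[index] = in[index];
  }
}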
@@ -12,63 +12,72 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/phi/kernels/funcs/pooling.h" + #include #include - -#include "paddle/fluid/operators/math/pooling.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/fast_divmod.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { struct FastDivModForPooling { public: - platform::FastDivMod channel; - platform::FastDivMod width; - platform::FastDivMod height; + paddle::platform::FastDivMod channel; + paddle::platform::FastDivMod width; + paddle::platform::FastDivMod height; explicit HOSTDEVICE FastDivModForPooling(const int channels, const int output_width, const int output_height) { - channel = platform::FastDivMod(channels); - width = platform::FastDivMod(output_width); - height = platform::FastDivMod(output_height); + channel = paddle::platform::FastDivMod(channels); + width = paddle::platform::FastDivMod(output_width); + height = paddle::platform::FastDivMod(output_height); } }; struct FastDivModForPoolingWithMoreStaff { public: - platform::FastDivMod channel; - platform::FastDivMod width; - platform::FastDivMod height; - platform::FastDivMod ksize_w; - platform::FastDivMod ksize_h; - platform::FastDivMod stride_w; - platform::FastDivMod stride_h; + paddle::platform::FastDivMod channel; + paddle::platform::FastDivMod width; + paddle::platform::FastDivMod height; + paddle::platform::FastDivMod ksize_w; + paddle::platform::FastDivMod ksize_h; + paddle::platform::FastDivMod stride_w; + paddle::platform::FastDivMod stride_h; explicit HOSTDEVICE FastDivModForPoolingWithMoreStaff( - const int channels, const int input_width, const int input_height, - const int ksize_width, const int ksize_height, const int stride_width, + const int channels, + const int input_width, + const int input_height, + const int ksize_width, + const int ksize_height, + const int stride_width, const int stride_height) { - channel = platform::FastDivMod(channels); - width = platform::FastDivMod(input_width); - height = platform::FastDivMod(input_height); - ksize_w = platform::FastDivMod(ksize_width); - ksize_h = platform::FastDivMod(ksize_height); - stride_w = platform::FastDivMod(stride_width); - stride_h = platform::FastDivMod(stride_height); + channel = paddle::platform::FastDivMod(channels); + width = paddle::platform::FastDivMod(input_width); + height = paddle::platform::FastDivMod(input_height); + ksize_w = paddle::platform::FastDivMod(ksize_width); + ksize_h = paddle::platform::FastDivMod(ksize_height); + stride_w = paddle::platform::FastDivMod(stride_width); + stride_h = paddle::platform::FastDivMod(stride_height); } }; template -__device__ void OffsetPreparationFor4Dimension( - int index, bool channel_last, FastDivModForPooling divmods, - const int pad_width, const int pad_height, const int aux_width, - const int aux_height, int* w_offset, int* h_offset, int* c_offset, - int* stride) { +__device__ void OffsetPreparationFor4Dimension(int index, + bool channel_last, + FastDivModForPooling divmods, + const int pad_width, + const int pad_height, + const int aux_width, + const int aux_height, + int* w_offset, + int* h_offset, + int* c_offset, + int* stride) { if (!channel_last) { /* NCHW */ auto input_width_divmod 
= divmods.width.Divmod(index); auto input_height_divmod = divmods.height.Divmod(input_width_divmod.val[0]); @@ -91,21 +100,40 @@ __device__ void OffsetPreparationFor4Dimension( } template -__global__ void KernelPool2D( - const int nthreads, const T* input_data, const int channels, - const int input_height, const int input_width, const int output_height, - const int output_width, const int ksize_height, const int ksize_width, - const int stride_height, const int stride_width, const int padding_height, - const int padding_width, FastDivModForPooling divmods, - PoolProcess pool_process, bool exclusive, bool adaptive, T* output_data, - bool channel_last = false) { +__global__ void KernelPool2D(const int nthreads, + const T* input_data, + const int channels, + const int input_height, + const int input_width, + const int output_height, + const int output_width, + const int ksize_height, + const int ksize_width, + const int stride_height, + const int stride_width, + const int padding_height, + const int padding_width, + FastDivModForPooling divmods, + PoolProcess pool_process, + bool exclusive, + bool adaptive, + T* output_data, + bool channel_last = false) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int hstart, hend, wstart, wend; int w_offset, h_offset, c_offset, input_offset; - OffsetPreparationFor4Dimension( - index, channel_last, divmods, 0, 0, input_width, input_height, - &w_offset, &h_offset, &c_offset, &input_offset); + OffsetPreparationFor4Dimension(index, + channel_last, + divmods, + 0, + 0, + input_width, + input_height, + &w_offset, + &h_offset, + &c_offset, + &input_offset); input_data += input_offset; if (adaptive) { @@ -139,25 +167,43 @@ __global__ void KernelPool2D( } template -__global__ void KernelPool2DGrad( - const int nthreads, const T* __restrict__ input_data, - const T* __restrict__ output_data, const const T* __restrict__ output_grad, - const int output_width, const int output_height, const int input_width, - const int input_height, const int ksize_width, const int ksize_height, - const int stride_width, const int stride_height, const int padding_width, - const int padding_height, FastDivModForPoolingWithMoreStaff divmods, - PoolProcess pool_process, bool exclusive, bool adaptive, - T* __restrict__ input_grad, bool channel_last = false) { +__global__ void KernelPool2DGrad(const int nthreads, + const T* __restrict__ input_data, + const T* __restrict__ output_data, + const const T* __restrict__ output_grad, + const int output_width, + const int output_height, + const int input_width, + const int input_height, + const int ksize_width, + const int ksize_height, + const int stride_width, + const int stride_height, + const int padding_width, + const int padding_height, + FastDivModForPoolingWithMoreStaff divmods, + PoolProcess pool_process, + bool exclusive, + bool adaptive, + T* __restrict__ input_grad, + bool channel_last = false) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { T input = static_cast(0); T input_grad_data = static_cast(0); int phstart, phend, pwstart, pwend; int w_offset, h_offset, c_offset, output_offset; - OffsetPreparationFor4Dimension<>(index, channel_last, divmods, - padding_width, padding_height, - output_width, output_height, &w_offset, - &h_offset, &c_offset, &output_offset); + OffsetPreparationFor4Dimension<>(index, + channel_last, + divmods, + padding_width, + padding_height, + output_width, + output_height, + &w_offset, + 
&h_offset, + &c_offset, + &output_offset); if (pool_process.use_x) { input = input_data[index]; output_data += output_offset; @@ -188,7 +234,9 @@ __global__ void KernelPool2DGrad( : tmp_idx; T ouput_value = pool_process.use_x ? output_data[output_sub_idx] : static_cast(0); - pool_process.compute(input, ouput_value, output_grad[output_sub_idx], + pool_process.compute(input, + ouput_value, + output_grad[output_sub_idx], static_cast(1.0 / pool_size), &input_grad_data); } @@ -217,9 +265,11 @@ __global__ void KernelPool2DGrad( : tmp_idx; T ouput_value = pool_process.use_x ? output_data[output_sub_idx] : static_cast(0); - pool_process.compute( - input, ouput_value, output_grad[output_sub_idx], - static_cast(1.0 / pool_size), &input_grad_data); + pool_process.compute(input, + ouput_value, + output_grad[output_sub_idx], + static_cast(1.0 / pool_size), + &input_grad_data); } } } else { @@ -232,9 +282,11 @@ __global__ void KernelPool2DGrad( : tmp_idx; T ouput_value = pool_process.use_x ? output_data[output_sub_idx] : static_cast(0); - pool_process.compute( - input, ouput_value, output_grad[output_sub_idx], - static_cast(1.0 / pool_size), &input_grad_data); + pool_process.compute(input, + ouput_value, + output_grad[output_sub_idx], + static_cast(1.0 / pool_size), + &input_grad_data); } } } @@ -244,19 +296,38 @@ __global__ void KernelPool2DGrad( } template -__global__ void KernelMaxPool2DGrad( - const int nthreads, const T* input_data, const T* output_data, - const T* output_grad, const int channels, const int input_height, - const int input_width, const int output_height, const int output_width, - const int ksize_height, const int ksize_width, const int stride_height, - const int stride_width, const int padding_height, const int padding_width, - T* input_grad, FastDivModForPooling divmods, bool channel_last = false) { +__global__ void KernelMaxPool2DGrad(const int nthreads, + const T* input_data, + const T* output_data, + const T* output_grad, + const int channels, + const int input_height, + const int input_width, + const int output_height, + const int output_width, + const int ksize_height, + const int ksize_width, + const int stride_height, + const int stride_width, + const int padding_height, + const int padding_width, + T* input_grad, + FastDivModForPooling divmods, + bool channel_last = false) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int w_offset, h_offset, c_offset, input_offset; - OffsetPreparationFor4Dimension( - index, channel_last, divmods, 0, 0, input_width, input_height, - &w_offset, &h_offset, &c_offset, &input_offset); + OffsetPreparationFor4Dimension(index, + channel_last, + divmods, + 0, + 0, + input_width, + input_height, + &w_offset, + &h_offset, + &c_offset, + &input_offset); input_data += input_offset; input_grad += input_offset; @@ -285,17 +356,24 @@ __global__ void KernelMaxPool2DGrad( if (maxIndex != -1) { // atomic add - platform::CudaAtomicAdd(input_grad + maxIndex, output_grad[index]); + paddle::platform::CudaAtomicAdd(input_grad + maxIndex, + output_grad[index]); } } } template void Pool2dDirectCUDAFunctor::operator()( - const T* input, const std::vector& input_shape, - const std::vector& output_shape, const std::vector& ksize, - const std::vector& strides, const std::vector& paddings, - bool exclusive, bool adaptive, T* output, gpuStream_t stream, + const T* input, + const std::vector& input_shape, + const std::vector& output_shape, + const std::vector& ksize, + const std::vector& strides, + const 
std::vector& paddings, + bool exclusive, + bool adaptive, + T* output, + gpuStream_t stream, PoolProcess pool_compute) { const int batch_size = input_shape[0]; const int input_channels = input_shape[1]; @@ -314,7 +392,7 @@ void Pool2dDirectCUDAFunctor::operator()( int nthreads = batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - // platform::ChangeThreadNum(context, &thread_num); + // backends::gpu::ChangeThreadNum(context, &thread_num); thread_num = 512; #endif int blocks = (nthreads + thread_num - 1) / thread_num; @@ -323,11 +401,24 @@ void Pool2dDirectCUDAFunctor::operator()( auto pool_divmods = FastDivModForPooling(input_channels, output_width, output_height); - KernelPool2D<<>>( - nthreads, input, input_channels, input_height, input_width, output_height, - output_width, ksize_height, ksize_width, stride_height, stride_width, - padding_height, padding_width, pool_divmods, pool_compute, exclusive, - adaptive, output); + KernelPool2D<<>>(nthreads, + input, + input_channels, + input_height, + input_width, + output_height, + output_width, + ksize_height, + ksize_width, + stride_height, + stride_width, + padding_height, + padding_width, + pool_divmods, + pool_compute, + exclusive, + adaptive, + output); } /* @@ -338,13 +429,16 @@ void Pool2dDirectCUDAFunctor::operator()( * height_down, width_left and width_right, respectively. */ template -class Pool2dFunctor { +class Pool2dFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* output, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* output, PoolProcess pool_process) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -361,12 +455,12 @@ class Pool2dFunctor { const int padding_width = paddings[1]; const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); + T* output_data = context.template Alloc(output); int nthreads = batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - platform::ChangeThreadNum(context, &thread_num); + backends::gpu::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); @@ -375,17 +469,35 @@ class Pool2dFunctor { auto pool_divmods = FastDivModForPooling(input_channels, output_width, output_height); KernelPool2D<<>>( - nthreads, input_data, input_channels, input_height, input_width, - output_height, output_width, ksize_height, ksize_width, stride_height, - stride_width, padding_height, padding_width, pool_divmods, pool_process, - exclusive, adaptive, output_data); + nthreads, + input_data, + input_channels, + input_height, + input_width, + output_height, + output_width, + ksize_height, + ksize_width, + stride_height, + stride_width, + padding_height, + padding_width, + pool_divmods, + pool_process, + exclusive, + adaptive, + output_data); } - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, const std::vector& paddings, - const 
std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* output, PoolProcess pool_process) { + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* output, + PoolProcess pool_process) { bool channel_last = (data_format == "NHWC"); const int batch_size = input.dims()[0]; @@ -410,12 +522,12 @@ class Pool2dFunctor { const int padding_width = paddings[1]; const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); + T* output_data = context.template Alloc(output); int nthreads = batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - platform::ChangeThreadNum(context, &thread_num); + backends::gpu::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); @@ -424,10 +536,25 @@ class Pool2dFunctor { auto pool_divmods = FastDivModForPooling(input_channels, output_width, output_height); KernelPool2D<<>>( - nthreads, input_data, input_channels, input_height, input_width, - output_height, output_width, ksize_height, ksize_width, stride_height, - stride_width, padding_height, padding_width, pool_divmods, pool_process, - exclusive, adaptive, output_data, channel_last); + nthreads, + input_data, + input_channels, + input_height, + input_width, + output_height, + output_width, + ksize_height, + ksize_width, + stride_height, + stride_width, + padding_height, + padding_width, + pool_divmods, + pool_process, + exclusive, + adaptive, + output_data, + channel_last); } }; /* @@ -438,16 +565,18 @@ class Pool2dFunctor { * height_down, width_left and width_right, respectively. */ template -class Pool2dGradFunctor { +class Pool2dGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* input_grad, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, PoolProcess pool_process) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -465,30 +594,53 @@ class Pool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * input_channels * input_height * input_width; - auto pool_divmods = FastDivModForPoolingWithMoreStaff( - input_channels, input_width, input_height, ksize_width, ksize_height, - stride_width, stride_height); - - auto config = GetGpuLaunchConfig1D(context, nthreads); - KernelPool2DGrad<<< - config.block_per_grid, config.thread_per_block, 0, context.stream()>>>( - nthreads, input_data, output_data, output_grad_data, output_width, - output_height, input_width, input_height, ksize_width, ksize_height, - stride_width, stride_height, padding_width, padding_height, - pool_divmods, pool_process, exclusive, adaptive, input_grad_data); + auto pool_divmods = FastDivModForPoolingWithMoreStaff(input_channels, + input_width, + input_height, + ksize_width, + ksize_height, + stride_width, + stride_height); 
+ + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(context, nthreads); + KernelPool2DGrad<<>>(nthreads, + input_data, + output_data, + output_grad_data, + output_width, + output_height, + input_width, + input_height, + ksize_width, + ksize_height, + stride_width, + stride_height, + padding_width, + padding_height, + pool_divmods, + pool_process, + exclusive, + adaptive, + input_grad_data); } - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* input_grad, PoolProcess pool_process) { + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_process) { bool channel_last = (data_format == "NHWC"); const int batch_size = input.dims()[0]; @@ -514,21 +666,41 @@ class Pool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * input_channels * input_height * input_width; - auto pool_divmods = FastDivModForPoolingWithMoreStaff( - input_channels, input_width, input_height, ksize_width, ksize_height, - stride_width, stride_height); - - auto config = GetGpuLaunchConfig1D(context, nthreads); - KernelPool2DGrad<<< - config.block_per_grid, config.thread_per_block, 0, context.stream()>>>( - nthreads, input_data, output_data, output_grad_data, output_width, - output_height, input_width, input_height, ksize_width, ksize_height, - stride_width, stride_height, padding_width, padding_height, - pool_divmods, pool_process, exclusive, adaptive, input_grad_data, - channel_last); + auto pool_divmods = FastDivModForPoolingWithMoreStaff(input_channels, + input_width, + input_height, + ksize_width, + ksize_height, + stride_width, + stride_height); + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(context, nthreads); + KernelPool2DGrad<<>>(nthreads, + input_data, + output_data, + output_grad_data, + output_width, + output_height, + input_width, + input_height, + ksize_width, + ksize_height, + stride_width, + stride_height, + padding_width, + padding_height, + pool_divmods, + pool_process, + exclusive, + adaptive, + input_grad_data, + channel_last); } }; @@ -540,16 +712,16 @@ class Pool2dGradFunctor { * height_down, width_left and width_right, respectively. 
*/ template -class MaxPool2dGradFunctor { +class MaxPool2dGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, - framework::Tensor* input_grad) { + DenseTensor* input_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_height = input.dims()[2]; @@ -567,7 +739,7 @@ class MaxPool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * output_channels * output_height * output_width; int blocks = (nthreads + 1024 - 1) / 1024; @@ -577,17 +749,33 @@ class MaxPool2dGradFunctor { auto pool_divmods = FastDivModForPooling(input_channels, output_width, output_height); KernelMaxPool2DGrad<<>>( - nthreads, input_data, output_data, output_grad_data, input_channels, - input_height, input_width, output_height, output_width, ksize_height, - ksize_width, stride_height, stride_width, padding_height, padding_width, - input_grad_data, pool_divmods); + nthreads, + input_data, + output_data, + output_grad_data, + input_channels, + input_height, + input_width, + output_height, + output_width, + ksize_height, + ksize_width, + stride_height, + stride_width, + padding_height, + padding_width, + input_grad_data, + pool_divmods); } - void operator()( - const platform::CUDADeviceContext& context, - const framework::Tensor& input, const framework::Tensor& output, - const framework::Tensor& output_grad, const std::vector& ksize, - const std::vector& strides, const std::vector& paddings, - const std::string data_format, framework::Tensor* input_grad) { + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + DenseTensor* input_grad) { bool channel_last = (data_format == "NHWC"); const int batch_size = input.dims()[0]; @@ -614,7 +802,7 @@ class MaxPool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * output_channels * output_height * output_width; int blocks = (nthreads + 1024 - 1) / 1024; @@ -625,71 +813,80 @@ class MaxPool2dGradFunctor { FastDivModForPooling(input_channels, output_width, output_height); KernelMaxPool2DGrad<<>>( - nthreads, input_data, output_data, output_grad_data, input_channels, - input_height, input_width, output_height, output_width, ksize_height, - ksize_width, stride_height, stride_width, padding_height, padding_width, - input_grad_data, pool_divmods, channel_last); + nthreads, + input_data, + output_data, + output_grad_data, + input_channels, + input_height, + input_width, + output_height, + output_width, + ksize_height, + ksize_width, + stride_height, + stride_width, + padding_height, + padding_width, + 
input_grad_data, + pool_divmods, + channel_last); } }; -template class Pool2dDirectCUDAFunctor, - float>; -template class Pool2dDirectCUDAFunctor, - float>; - -template class MaxPool2dGradFunctor; -template class MaxPool2dGradFunctor; -template class MaxPool2dGradFunctor; - -template class Pool2dFunctor, float>; -template class Pool2dFunctor, float>; -template class Pool2dGradFunctor, - float>; -template class Pool2dGradFunctor, - float>; -template class Pool2dFunctor, double>; -template class Pool2dFunctor, double>; -template class Pool2dGradFunctor, - double>; -template class Pool2dGradFunctor, - double>; - -template class Pool2dFunctor< - platform::CUDADeviceContext, - paddle::operators::math::MaxPool, - paddle::platform::float16>; -template class Pool2dFunctor< - platform::CUDADeviceContext, - paddle::operators::math::AvgPool, - paddle::platform::float16>; -template class Pool2dGradFunctor< - platform::CUDADeviceContext, - paddle::operators::math::MaxPoolGrad, - paddle::platform::float16>; -template class Pool2dGradFunctor< - platform::CUDADeviceContext, - paddle::operators::math::AvgPoolGrad, - paddle::platform::float16>; +template class Pool2dDirectCUDAFunctor, float>; +template class Pool2dDirectCUDAFunctor, float>; + +template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; + +template class Pool2dFunctor, float>; +template class Pool2dFunctor, float>; +template class Pool2dGradFunctor, float>; +template class Pool2dGradFunctor, float>; +template class Pool2dFunctor, double>; +template class Pool2dFunctor, double>; +template class Pool2dGradFunctor, double>; +template class Pool2dGradFunctor, double>; + +template class Pool2dFunctor, + dtype::float16>; +template class Pool2dFunctor, + dtype::float16>; +template class Pool2dGradFunctor, + dtype::float16>; +template class Pool2dGradFunctor, + dtype::float16>; template -__global__ void KernelPool3D( - const int nthreads, const T* input_data, const int channels, - const int input_depth, const int input_height, const int input_width, - const int output_depth, const int output_height, const int output_width, - const int ksize_depth, const int ksize_height, const int ksize_width, - const int stride_depth, const int stride_height, const int stride_width, - const int padding_depth, const int padding_height, const int padding_width, - PoolProcess pool_process, bool exclusive, bool adaptive, T* output_data, - bool channel_last = false) { +__global__ void KernelPool3D(const int nthreads, + const T* input_data, + const int channels, + const int input_depth, + const int input_height, + const int input_width, + const int output_depth, + const int output_height, + const int output_width, + const int ksize_depth, + const int ksize_height, + const int ksize_width, + const int stride_depth, + const int stride_height, + const int stride_width, + const int padding_depth, + const int padding_height, + const int padding_width, + PoolProcess pool_process, + bool exclusive, + bool adaptive, + T* output_data, + bool channel_last = false) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int pw, ph, pd, c, batch_idx; @@ -764,16 +961,31 @@ __global__ void KernelPool3D( } template -__global__ void KernelPool3DGrad( - const int nthreads, const T* __restrict__ input_data, - const T* __restrict__ output_data, const T* __restrict__ output_grad, - const int channels, const int input_depth, const int input_height, - const int input_width, const int 
output_depth, const int output_height, - const int output_width, const int ksize_depth, const int ksize_height, - const int ksize_width, const int stride_depth, const int stride_height, - const int stride_width, const int padding_depth, const int padding_height, - const int padding_width, PoolProcess pool_process, bool exclusive, - bool adaptive, T* input_grad, bool channel_last = false) { +__global__ void KernelPool3DGrad(const int nthreads, + const T* __restrict__ input_data, + const T* __restrict__ output_data, + const T* __restrict__ output_grad, + const int channels, + const int input_depth, + const int input_height, + const int input_width, + const int output_depth, + const int output_height, + const int output_width, + const int ksize_depth, + const int ksize_height, + const int ksize_width, + const int stride_depth, + const int stride_height, + const int stride_width, + const int padding_depth, + const int padding_height, + const int padding_width, + PoolProcess pool_process, + bool exclusive, + bool adaptive, + T* input_grad, + bool channel_last = false) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int w_offset, h_offset, d_offset, c_offset, batch_idx, output_stride; @@ -867,7 +1079,9 @@ __global__ void KernelPool3DGrad( : (pd * output_height + ph) * output_width + pw; T ouput_value = pool_process.use_x ? output_data[output_sub_idx] : static_cast(0); - pool_process.compute(input, ouput_value, output_grad[output_sub_idx], + pool_process.compute(input, + ouput_value, + output_grad[output_sub_idx], static_cast(1.0 / pool_size), &input_grad_data); } @@ -878,15 +1092,28 @@ __global__ void KernelPool3DGrad( } template -__global__ void KernelMaxPool3DGrad( - const int nthreads, const T* input_data, const T* output_data, - const T* output_grad, const int channels, const int input_depth, - const int input_height, const int input_width, const int output_depth, - const int output_height, const int output_width, const int ksize_depth, - const int ksize_height, const int ksize_width, const int stride_depth, - const int stride_height, const int stride_width, const int padding_depth, - const int padding_height, const int padding_width, T* input_grad, - bool channel_last = false) { +__global__ void KernelMaxPool3DGrad(const int nthreads, + const T* input_data, + const T* output_data, + const T* output_grad, + const int channels, + const int input_depth, + const int input_height, + const int input_width, + const int output_depth, + const int output_height, + const int output_width, + const int ksize_depth, + const int ksize_height, + const int ksize_width, + const int stride_depth, + const int stride_height, + const int stride_width, + const int padding_depth, + const int padding_height, + const int padding_width, + T* input_grad, + bool channel_last = false) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int pw, ph, pd, c, batch_idx; @@ -949,17 +1176,23 @@ __global__ void KernelMaxPool3DGrad( } if (maxIdx != -1) { // atomic add - platform::CudaAtomicAdd(input_grad + maxIdx, output_grad[index]); + paddle::platform::CudaAtomicAdd(input_grad + maxIdx, output_grad[index]); } } } template void Pool3dDirectCUDAFunctor::operator()( - const T* input, const std::vector& input_shape, - const std::vector& output_shape, const std::vector& ksize, - const std::vector& strides, const std::vector& paddings, - bool exclusive, bool adaptive, T* output, gpuStream_t stream, + const T* 
input, + const std::vector& input_shape, + const std::vector& output_shape, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + T* output, + gpuStream_t stream, PoolProcess pool_compute) { const int batch_size = input_shape[0]; const int input_channels = input_shape[1]; @@ -990,11 +1223,28 @@ void Pool3dDirectCUDAFunctor::operator()( dim3 threads(thread_num, 1); dim3 grid(blocks, 1); - KernelPool3D<<>>( - nthreads, input, input_channels, input_depth, input_height, input_width, - output_depth, output_height, output_width, ksize_depth, ksize_height, - ksize_width, stride_depth, stride_height, stride_width, padding_depth, - padding_height, padding_width, pool_compute, exclusive, adaptive, output); + KernelPool3D<<>>(nthreads, + input, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + pool_compute, + exclusive, + adaptive, + output); } /* @@ -1006,13 +1256,16 @@ void Pool3dDirectCUDAFunctor::operator()( * height_up, height_down, width_left and width_right, respectively. */ template -class Pool3dFunctor { +class Pool3dFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* output, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* output, PoolProcess pool_process) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -1034,31 +1287,52 @@ class Pool3dFunctor { const int padding_width = paddings[2]; const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); + T* output_data = context.template Alloc(output); int nthreads = batch_size * output_channels * output_depth * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - platform::ChangeThreadNum(context, &thread_num); + backends::gpu::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); dim3 grid(blocks, 1); KernelPool3D<<>>( - nthreads, input_data, input_channels, input_depth, input_height, - input_width, output_depth, output_height, output_width, ksize_depth, - ksize_height, ksize_width, stride_depth, stride_height, stride_width, - padding_depth, padding_height, padding_width, pool_process, exclusive, - adaptive, output_data); + nthreads, + input_data, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + pool_process, + exclusive, + adaptive, + output_data); } - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* output, PoolProcess pool_process) { + 
const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* output, + PoolProcess pool_process) { bool channel_last = (data_format == "NDHWC"); const int batch_size = input.dims()[0]; @@ -1089,24 +1363,42 @@ class Pool3dFunctor { const int padding_width = paddings[2]; const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); + T* output_data = context.template Alloc(output); int nthreads = batch_size * output_channels * output_depth * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - platform::ChangeThreadNum(context, &thread_num); + backends::gpu::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); dim3 grid(blocks, 1); KernelPool3D<<>>( - nthreads, input_data, input_channels, input_depth, input_height, - input_width, output_depth, output_height, output_width, ksize_depth, - ksize_height, ksize_width, stride_depth, stride_height, stride_width, - padding_depth, padding_height, padding_width, pool_process, exclusive, - adaptive, output_data, channel_last); + nthreads, + input_data, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + pool_process, + exclusive, + adaptive, + output_data, + channel_last); } }; @@ -1119,16 +1411,18 @@ class Pool3dFunctor { * height_up, height_down, width_left and width_right, respectively. */ template -class Pool3dGradFunctor { +class Pool3dGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* input_grad, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, PoolProcess pool_process) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -1152,7 +1446,7 @@ class Pool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * input_channels * input_depth * input_height * input_width; @@ -1161,21 +1455,43 @@ class Pool3dGradFunctor { dim3 grid(blocks, 1); KernelPool3DGrad<<>>( - nthreads, input_data, output_data, output_grad_data, input_channels, - input_depth, input_height, input_width, output_depth, output_height, - output_width, ksize_depth, ksize_height, ksize_width, stride_depth, - stride_height, stride_width, padding_depth, padding_height, - padding_width, pool_process, exclusive, adaptive, input_grad_data); + nthreads, + input_data, + output_data, + output_grad_data, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + pool_process, + exclusive, + adaptive, + 
input_grad_data); } - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* input_grad, PoolProcess pool_process) { + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_process) { bool channel_last = (data_format == "NDHWC"); const int batch_size = input.dims()[0]; @@ -1206,7 +1522,7 @@ class Pool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * input_channels * input_depth * input_height * input_width; @@ -1215,11 +1531,30 @@ class Pool3dGradFunctor { dim3 grid(blocks, 1); KernelPool3DGrad<<>>( - nthreads, input_data, output_data, output_grad_data, input_channels, - input_depth, input_height, input_width, output_depth, output_height, - output_width, ksize_depth, ksize_height, ksize_width, stride_depth, - stride_height, stride_width, padding_depth, padding_height, - padding_width, pool_process, exclusive, adaptive, input_grad_data, + nthreads, + input_data, + output_data, + output_grad_data, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + pool_process, + exclusive, + adaptive, + input_grad_data, channel_last); // add channel_last } }; @@ -1233,16 +1568,16 @@ class Pool3dGradFunctor { * height_up, height_down, width_left and width_right, respectively. 
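// Editor's sketch (standalone CUDA, not part of this patch): the max-pooling
// backward kernels above scatter each output-cell gradient to the input
// position that held the window maximum. Different windows can select the same
// input element, so the scatter uses an atomic add (CudaAtomicAdd in the
// kernels) to avoid lost updates. A minimal version of that step:
__global__ void ScatterMaxGrad(const float* out_grad, const int* max_index,
                               float* in_grad, int n_out) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n_out && max_index[i] >= 0) {
    atomicAdd(in_grad + max_index[i], out_grad[i]);
  }
}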
*/ template -class MaxPool3dGradFunctor { +class MaxPool3dGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, - framework::Tensor* input_grad) { + DenseTensor* input_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_depth = input.dims()[2]; @@ -1265,7 +1600,7 @@ class MaxPool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * output_channels * output_depth * output_height * output_width; @@ -1274,18 +1609,37 @@ class MaxPool3dGradFunctor { dim3 grid(blocks, 1); KernelMaxPool3DGrad<<>>( - nthreads, input_data, output_data, output_grad_data, input_channels, - input_depth, input_height, input_width, output_depth, output_height, - output_width, ksize_depth, ksize_height, ksize_width, stride_depth, - stride_height, stride_width, padding_depth, padding_height, - padding_width, input_grad_data); + nthreads, + input_data, + output_data, + output_grad_data, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + input_grad_data); } - void operator()( - const platform::CUDADeviceContext& context, - const framework::Tensor& input, const framework::Tensor& output, - const framework::Tensor& output_grad, const std::vector& ksize, - const std::vector& strides, const std::vector& paddings, - const std::string data_format, framework::Tensor* input_grad) { + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + DenseTensor* input_grad) { bool channel_last = (data_format == "NDHWC"); const int batch_size = input.dims()[0]; @@ -1316,7 +1670,7 @@ class MaxPool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * output_channels * output_depth * output_height * output_width; @@ -1325,77 +1679,93 @@ class MaxPool3dGradFunctor { dim3 grid(blocks, 1); KernelMaxPool3DGrad<<>>( - nthreads, input_data, output_data, output_grad_data, input_channels, - input_depth, input_height, input_width, output_depth, output_height, - output_width, ksize_depth, ksize_height, ksize_width, stride_depth, - stride_height, stride_width, padding_depth, padding_height, - padding_width, input_grad_data, channel_last); // add channel_last + nthreads, + input_data, + output_data, + output_grad_data, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + 
ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + input_grad_data, + channel_last); // add channel_last } }; -template class Pool3dDirectCUDAFunctor, - float>; -template class Pool3dDirectCUDAFunctor, - float>; - -template class MaxPool3dGradFunctor; -template class MaxPool3dGradFunctor; -template class MaxPool3dGradFunctor; - -template class Pool3dFunctor, float>; -template class Pool3dFunctor, float>; -template class Pool3dGradFunctor, - float>; -template class Pool3dGradFunctor, - float>; -template class Pool3dFunctor, double>; -template class Pool3dFunctor, double>; -template class Pool3dGradFunctor, - double>; -template class Pool3dGradFunctor, - double>; - -template class Pool3dFunctor< - platform::CUDADeviceContext, - paddle::operators::math::MaxPool, - paddle::platform::float16>; -template class Pool3dFunctor< - platform::CUDADeviceContext, - paddle::operators::math::AvgPool, - paddle::platform::float16>; -template class Pool3dGradFunctor< - platform::CUDADeviceContext, - paddle::operators::math::MaxPoolGrad, - paddle::platform::float16>; -template class Pool3dGradFunctor< - platform::CUDADeviceContext, - paddle::operators::math::AvgPoolGrad, - paddle::platform::float16>; +template class Pool3dDirectCUDAFunctor, float>; +template class Pool3dDirectCUDAFunctor, float>; + +template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; + +template class Pool3dFunctor, float>; +template class Pool3dFunctor, float>; +template class Pool3dGradFunctor, float>; +template class Pool3dGradFunctor, float>; +template class Pool3dFunctor, double>; +template class Pool3dFunctor, double>; +template class Pool3dGradFunctor, double>; +template class Pool3dGradFunctor, double>; + +template class Pool3dFunctor, + dtype::float16>; +template class Pool3dFunctor, + dtype::float16>; +template class Pool3dGradFunctor, + dtype::float16>; +template class Pool3dGradFunctor, + dtype::float16>; template -__global__ void KernelMaxPool2dWithIdx( - const int nthreads, const T1* input_data, const int channels, - const int input_height, const int input_width, const int output_height, - const int output_width, const int ksize_height, const int ksize_width, - const int stride_height, const int stride_width, const int padding_height, - const int padding_width, bool adaptive, T1* output_data, T2* mask_data, - FastDivModForPooling divmods) { +__global__ void KernelMaxPool2dWithIdx(const int nthreads, + const T1* input_data, + const int channels, + const int input_height, + const int input_width, + const int output_height, + const int output_width, + const int ksize_height, + const int ksize_width, + const int stride_height, + const int stride_width, + const int padding_height, + const int padding_width, + bool adaptive, + T1* output_data, + T2* mask_data, + FastDivModForPooling divmods) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int hstart, hend, wstart, wend; int w_offset, h_offset, c_offset, input_offset; - OffsetPreparationFor4Dimension( - index, false, divmods, 0, 0, input_width, input_height, &w_offset, - &h_offset, &c_offset, &input_offset); + OffsetPreparationFor4Dimension(index, + false, + divmods, + 0, + 0, + input_width, + input_height, + &w_offset, + &h_offset, + &c_offset, + &input_offset); input_data += input_offset; if (adaptive) { @@ -1431,20 +1801,38 @@ __global__ void KernelMaxPool2dWithIdx( } template -__global__ void 
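// Editor's sketch (plain C++, not part of this patch): the pooling kernels
// above recover (w, h, c, n) from a flat NCHW offset via
// OffsetPreparationFor4Dimension and FastDivModForPooling; the reference
// decomposition is ordinary successive div/mod, illustrative names only.
struct Idx4 {
  int w, h, c, n;
};

inline Idx4 DecomposeNCHW(int index, int width, int height, int channels) {
  Idx4 r;
  r.w = index % width;
  index /= width;
  r.h = index % height;
  index /= height;
  r.c = index % channels;
  index /= channels;
  r.n = index;
  return r;
}

// width = 4, height = 3, channels = 2: index 29 -> {w = 1, h = 1, c = 0, n = 1}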
KernelMaxPool2DWithIdxGrad( - const int nthreads, const T1* output_grad, const T2* mask_data, - const int channels, const int input_height, const int input_width, - const int output_height, const int output_width, const int ksize_height, - const int ksize_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, bool adaptive, - T1* input_grad, FastDivModForPooling divmods) { +__global__ void KernelMaxPool2DWithIdxGrad(const int nthreads, + const T1* output_grad, + const T2* mask_data, + const int channels, + const int input_height, + const int input_width, + const int output_height, + const int output_width, + const int ksize_height, + const int ksize_width, + const int stride_height, + const int stride_width, + const int padding_height, + const int padding_width, + bool adaptive, + T1* input_grad, + FastDivModForPooling divmods) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int phstart, phend, pwstart, pwend; int w_offset, h_offset, c_offset, output_offset; - OffsetPreparationFor4Dimension( - index, false, divmods, 0, 0, output_width, output_height, &w_offset, - &h_offset, &c_offset, &output_offset); + OffsetPreparationFor4Dimension(index, + false, + divmods, + 0, + 0, + output_width, + output_height, + &w_offset, + &h_offset, + &c_offset, + &output_offset); mask_data += output_offset; output_grad += output_offset; @@ -1487,13 +1875,16 @@ __global__ void KernelMaxPool2DWithIdxGrad( * height and width, respectively. */ template -class MaxPool2dWithIndexFunctor { +class MaxPool2dWithIndexFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* output, framework::Tensor* mask) { + const std::vector& paddings, + bool adaptive, + DenseTensor* output, + DenseTensor* mask) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_height = input.dims()[2]; @@ -1509,13 +1900,13 @@ class MaxPool2dWithIndexFunctor { const int padding_width = paddings[1]; const T1* input_data = input.data(); - T1* output_data = output->mutable_data(context.GetPlace()); - T2* mask_data = mask->mutable_data(context.GetPlace()); + T1* output_data = context.template Alloc(output); + T2* mask_data = context.template Alloc(mask); int nthreads = batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - platform::ChangeThreadNum(context, &thread_num); + backends::gpu::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; @@ -1525,10 +1916,23 @@ class MaxPool2dWithIndexFunctor { auto pool_divmods = FastDivModForPooling(input_channels, output_width, output_height); KernelMaxPool2dWithIdx<<>>( - nthreads, input_data, input_channels, input_height, input_width, - output_height, output_width, ksize_height, ksize_width, stride_height, - stride_width, padding_height, padding_width, adaptive, output_data, - mask_data, pool_divmods); + nthreads, + input_data, + input_channels, + input_height, + input_width, + output_height, + output_width, + ksize_height, + ksize_width, + stride_height, + stride_width, + padding_height, + padding_width, + adaptive, + output_data, + mask_data, + pool_divmods); } }; @@ 
-1538,14 +1942,16 @@ class MaxPool2dWithIndexFunctor { * height and width, respectively. */ template -class MaxPool2dWithIndexGradFunctor { +class MaxPool2dWithIndexGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& output_grad, - const framework::Tensor& mask, const std::vector& ksize, + void operator()(const phi::GPUContext& context, + const DenseTensor& output_grad, + const DenseTensor& mask, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* input_grad) { + const std::vector& paddings, + bool adaptive, + DenseTensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_channels = input_grad->dims()[1]; const int input_height = input_grad->dims()[2]; @@ -1561,7 +1967,7 @@ class MaxPool2dWithIndexGradFunctor { const T2* mask_data = mask.data(); const T1* output_grad_data = output_grad.data(); - T1* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T1* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * input_channels * input_height * input_width; int blocks = (nthreads + 1024 - 1) / 1024; @@ -1571,31 +1977,53 @@ class MaxPool2dWithIndexGradFunctor { auto pool_divmods = FastDivModForPooling(input_channels, input_width, input_height); KernelMaxPool2DWithIdxGrad<<>>( - nthreads, output_grad_data, mask_data, input_channels, input_height, - input_width, output_height, output_width, ksize_height, ksize_width, - stride_height, stride_width, padding_height, padding_width, adaptive, - input_grad_data, pool_divmods); + nthreads, + output_grad_data, + mask_data, + input_channels, + input_height, + input_width, + output_height, + output_width, + ksize_height, + ksize_width, + stride_height, + stride_width, + padding_height, + padding_width, + adaptive, + input_grad_data, + pool_divmods); } }; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; template -__global__ void KernelMaxPool3DWithIdx( - const int nthreads, const T1* input_data, const int channels, - const int input_depth, const int input_height, const int input_width, - const int output_depth, const int output_height, const int output_width, - const int ksize_depth, const int ksize_height, const int ksize_width, - const int stride_depth, const int stride_height, const int stride_width, - const int padding_depth, const int padding_height, const int padding_width, - bool adaptive, T1* output_data, T2* mask_data) { +__global__ void KernelMaxPool3DWithIdx(const int nthreads, + const T1* input_data, + const int channels, + const int input_depth, + const int input_height, + const int input_width, + const int output_depth, + const int output_height, + const int output_width, + const int ksize_depth, + const int ksize_height, + const int ksize_width, + const int stride_depth, + const int stride_height, + const int stride_width, + const int padding_depth, + const int padding_height, + const int padding_width, + bool adaptive, + T1* output_data, + T2* mask_data) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int pw = index % output_width; @@ -1650,14 +2078,27 @@ __global__ void 
KernelMaxPool3DWithIdx( } template -__global__ void KernelMaxPool3DWithIdxGrad( - const int nthreads, const T1* output_grad, const T2* mask, - const int channels, const int input_depth, const int input_height, - const int input_width, const int output_depth, const int output_height, - const int output_width, const int ksize_depth, const int ksize_height, - const int ksize_width, const int stride_depth, const int stride_height, - const int stride_width, const int padding_depth, const int padding_height, - const int padding_width, bool adaptive, T1* input_grad) { +__global__ void KernelMaxPool3DWithIdxGrad(const int nthreads, + const T1* output_grad, + const T2* mask, + const int channels, + const int input_depth, + const int input_height, + const int input_width, + const int output_depth, + const int output_height, + const int output_width, + const int ksize_depth, + const int ksize_height, + const int ksize_width, + const int stride_depth, + const int stride_height, + const int stride_width, + const int padding_depth, + const int padding_height, + const int padding_width, + bool adaptive, + T1* input_grad) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int w_offset = index % input_width; @@ -1727,13 +2168,16 @@ __global__ void KernelMaxPool3DWithIdxGrad( * depth, height and width, respectively. */ template -class MaxPool3dWithIndexFunctor { +class MaxPool3dWithIndexFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* output, framework::Tensor* mask) { + const std::vector& paddings, + bool adaptive, + DenseTensor* output, + DenseTensor* mask) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_depth = input.dims()[2]; @@ -1754,14 +2198,14 @@ class MaxPool3dWithIndexFunctor { const int padding_width = paddings[2]; const T1* input_data = input.data(); - T1* output_data = output->mutable_data(context.GetPlace()); - T2* mask_data = mask->mutable_data(context.GetPlace()); + T1* output_data = context.template Alloc(output); + T2* mask_data = context.template Alloc(mask); int nthreads = batch_size * output_channels * output_depth * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - platform::ChangeThreadNum(context, &thread_num); + backends::gpu::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; @@ -1769,10 +2213,26 @@ class MaxPool3dWithIndexFunctor { dim3 grid(blocks, 1); KernelMaxPool3DWithIdx<<>>( - nthreads, input_data, input_channels, input_depth, input_height, - input_width, output_depth, output_height, output_width, ksize_depth, - ksize_height, ksize_width, stride_depth, stride_height, stride_width, - padding_depth, padding_height, padding_width, adaptive, output_data, + nthreads, + input_data, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + adaptive, + output_data, mask_data); } }; @@ -1783,14 +2243,16 @@ class MaxPool3dWithIndexFunctor { * depth, height and width, respectively. 
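// Editor's sketch (plain C++, not part of this patch): the *WithIndex functors
// above emit two tensors per pooling window, the maximum value and a "mask"
// holding the position of that maximum, which the backward pass then uses to
// route gradients. The toy version below returns the index within the window;
// the real kernels store the offset within the input feature map.
#include <vector>

inline void ArgMaxWindow(const std::vector<float>& window,  // precondition: non-empty
                         float* out_value, int* out_index) {
  *out_value = window[0];
  *out_index = 0;
  for (int i = 1; i < static_cast<int>(window.size()); ++i) {
    if (window[i] > *out_value) {  // strict ">" keeps the first maximum on ties
      *out_value = window[i];
      *out_index = i;
    }
  }
}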
*/ template -class MaxPool3dWithIndexGradFunctor { +class MaxPool3dWithIndexGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& output_grad, - const framework::Tensor& mask, const std::vector& ksize, + void operator()(const phi::GPUContext& context, + const DenseTensor& output_grad, + const DenseTensor& mask, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* input_grad) { + const std::vector& paddings, + bool adaptive, + DenseTensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_channels = input_grad->dims()[1]; const int input_depth = input_grad->dims()[2]; @@ -1811,7 +2273,7 @@ class MaxPool3dWithIndexGradFunctor { const T1* output_grad_data = output_grad.data(); const T2* mask_data = mask.data(); - T1* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T1* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * input_channels * input_depth * input_height * input_width; @@ -1820,23 +2282,34 @@ class MaxPool3dWithIndexGradFunctor { dim3 grid(blocks, 1); KernelMaxPool3DWithIdxGrad<<>>( - nthreads, output_grad_data, mask_data, input_channels, input_depth, - input_height, input_width, output_depth, output_height, output_width, - ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, - stride_width, padding_depth, padding_height, padding_width, adaptive, + nthreads, + output_grad_data, + mask_data, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + adaptive, input_grad_data); } }; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/pooling.h b/paddle/phi/kernels/funcs/pooling.h new file mode 100644 index 0000000000000000000000000000000000000000..fa285dc69d1ca552afbd1f41e050ee603be07239 --- /dev/null +++ b/paddle/phi/kernels/funcs/pooling.h @@ -0,0 +1,469 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
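// Editor's note (toy types, not Paddle's): the "template class ..." lines
// above are explicit instantiations. Their template arguments (device context
// and element types) were lost in this rendering of the patch, but the pattern
// itself is the standard one of forcing the compiler to emit the class for
// each listed type so other translation units can link against it:
template <typename T>
struct Scaler {
  T factor;
  T operator()(T x) const { return factor * x; }
};

template struct Scaler<float>;   // Scaler<float> is compiled into this object file
template struct Scaler<double>;  // likewise for double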
*/ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/platform/macros.h" // import FLT_MAX +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/hostdevice.h" + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include "paddle/phi/backends/gpu/gpu_decls.h" +#endif + +namespace phi { +namespace funcs { + +/* + * \brief Extracting simple operations from pooling. + * Both MaxPool and AvgPool need "initial", "compute" and "finalize" + * operation. + * MaxPool initializes temp variable to the negative maximum to find the + * maximum value in the pooling field. + * AvgPool initializes temp variable to the zero to accumulate all values + * in pool pooling, and finally takes the average. + * MaxPoolGrad and AvgPoolGrad are gradient operations respectively. + */ +template +class MaxPool { + public: + DEVICE inline T initial() { return static_cast(-FLT_MAX); } + HOSTDEVICE inline void compute(const T& x, T* y) { *y = *y > x ? *y : x; } + DEVICE inline void finalize(const T& pool_field, T* y) {} +}; + +template +class AvgPool { + using MT = typename dtype::MPTypeTrait::Type; + MT intermediate_res; + + public: + DEVICE inline T initial() { + intermediate_res = static_cast(0.0f); + return static_cast(0); + } + + DEVICE inline void compute(const T& x, T* y) { + intermediate_res += static_cast(x); + } + + DEVICE inline void finalize(const T& pool_field, T* y) { + *y = static_cast(intermediate_res / (static_cast(pool_field))); + } +}; + +template +class MaxPoolGrad { + public: + static constexpr bool use_x = true; + HOSTDEVICE inline void compute( + const T& x, const T& y, const T& dy, T scale, T* dx) { + *dx += dy * static_cast(x == y); + } +}; + +template +class AvgPoolGrad { + public: + static constexpr bool use_x = false; + HOSTDEVICE inline void compute( + const T& x, const T& y, const T& dy, T scale, T* dx) { + *dx += (scale * dy); + } +}; + +/* used for adaptive pool to calculate start and end index of each divided grid + */ +HOSTDEVICE inline int AdaptStartIndex(int ph, int input_size, int output_size) { + return static_cast( + floor(static_cast(ph * input_size) / output_size)); +} + +HOSTDEVICE inline int AdaptEndIndex(int ph, int input_size, int output_size) { + return static_cast( + ceil(static_cast((ph + 1) * input_size) / output_size)); +} + +/* + * \brief Getting pooling results, and calculating gradient. + * + * In pool2d, all Tensors are in NCHW or NHWC format. Where N is batch size, C + * is the number of channels, H and W is the height and width of feature. + * In pool3d, all Tensors are in NCDHW or NDHWC format. Where N is batch size, C + * is the number of channels, D, H and W is the depth, height and width of + * feature. + * + * In max pooling, it is possible that the pooling region has multiple maximum + * elements. In this case, we should compute the gradient of the first maximum + * element. + * This is different from average pooling. So we rewrite the max_pool_grad: + * MaxPool2dGradFunctor, MaxPool3dGradFunctor. 
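// Editor's sketch (plain C++, no Paddle types, not part of this patch): the
// MaxPool / AvgPool classes above follow the initial/compute/finalize protocol
// described in their comment, so one windowed loop serves both poolings.
// AvgPool additionally accumulates in a wider type (via MPTypeTrait) before
// the final division; the toy version below skips that detail.
#include <algorithm>
#include <vector>

struct MaxPoolOp {
  float initial() const { return -3.402823466e38f; }  // plays the role of -FLT_MAX
  void compute(float x, float* y) const { *y = std::max(*y, x); }
  void finalize(float /*pool_size*/, float* /*y*/) const {}
};

struct AvgPoolOp {
  float initial() const { return 0.0f; }
  void compute(float x, float* y) const { *y += x; }
  void finalize(float pool_size, float* y) const { *y /= pool_size; }
};

template <typename PoolOp>
float PoolWindow(const std::vector<float>& window, PoolOp op) {
  float y = op.initial();
  for (float x : window) op.compute(x, &y);
  op.finalize(static_cast<float>(window.size()), &y);
  return y;
}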
+ */ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template +class Pool2dDirectCUDAFunctor { + public: + void operator()(const T* input, + const std::vector& input_shape, + const std::vector& output_shape, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + T* output, + gpuStream_t stream, + PoolProcess pool_compute); +}; +#endif + +template +class Pool2dFunctor { + public: + void operator()(const Context& context, + const DenseTensor& input, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* output, + PoolProcess pool_compute); + + // overload operator() to support argument data_format + void operator()(const Context& context, + const DenseTensor& input, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* output, + PoolProcess pool_compute); +}; + +template +class Pool2dGradFunctor { + public: + void operator()(const Context& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_compute); + // overload operator() to support argument data_format + void operator()(const Context& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_compute); +}; + +template +class MaxPool2dGradFunctor { + public: + void operator()(const Context& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + DenseTensor* input_grad); + // overload operator() to support argument data_format + void operator()(const Context& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + DenseTensor* input_grad); +}; + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template +class Pool3dDirectCUDAFunctor { + public: + void operator()(const T* input, + const std::vector& input_shape, + const std::vector& output_shape, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + T* output, + gpuStream_t stream, + PoolProcess pool_compute); +}; +#endif + +template +class Pool3dFunctor { + public: + void operator()(const Context& context, + const DenseTensor& input, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* output, + PoolProcess pool_compute); + // overload operator() to support argument data_format + void operator()(const Context& context, + const DenseTensor& input, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* output, + PoolProcess pool_compute); +}; + +template +class Pool3dGradFunctor { 
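// Editor's check (standalone, not part of this patch): the adaptive-pooling
// helpers declared earlier in this header, AdaptStartIndex / AdaptEndIndex,
// give output cell ph the half-open input range
// [floor(ph * in / out), ceil((ph + 1) * in / out)). A plain restatement plus
// a small numeric example:
#include <cmath>

inline int AdaptStart(int ph, int in_size, int out_size) {
  return static_cast<int>(
      std::floor(static_cast<double>(ph) * in_size / out_size));
}
inline int AdaptEnd(int ph, int in_size, int out_size) {
  return static_cast<int>(
      std::ceil(static_cast<double>(ph + 1) * in_size / out_size));
}

// in_size = 7, out_size = 3: cell 0 -> [0, 3), cell 1 -> [2, 5), cell 2 -> [4, 7)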
+ public: + void operator()(const Context& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_compute); + // overload operator() to support argument data_format + void operator()(const Context& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_compute); +}; + +template +class MaxPool3dGradFunctor { + public: + void operator()(const Context& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + DenseTensor* input_grad); + // overload operator() to support argument data_format + void operator()(const Context& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + DenseTensor* input_grad); +}; + +/* + * \brief Getting max pooling results and corresponding max index, and + * calculating gradient. + * In up-sampling-pooling, it is necessary to know max element index. + * In pool2d, all tensors are in NCHW format. In pool3d, all tensors are in + * NCDHW format. + */ +template +class MaxPool2dWithIndexFunctor { + public: + void operator()(const Context& context, + const DenseTensor& input, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool adaptive, + DenseTensor* output, + DenseTensor* mask); +}; + +template +class MaxPool2dWithIndexGradFunctor { + public: + void operator()(const Context& context, + const DenseTensor& output_grad, + const DenseTensor& mask, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool adaptive, + DenseTensor* input_grad); +}; + +template +class MaxPool3dWithIndexFunctor { + public: + void operator()(const Context& context, + const DenseTensor& input, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool adaptive, + DenseTensor* output, + DenseTensor* mask); +}; + +template +class MaxPool3dWithIndexGradFunctor { + public: + void operator()(const Context& context, + const DenseTensor& output_grad, + const DenseTensor& mask, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool adaptive, + DenseTensor* input_grad); +}; + +inline int PoolOutputSize(int input_size, + int filter_size, + int padding_1, + int padding_2, + int stride, + bool ceil_mode) { + int output_size; + if (!ceil_mode) { + output_size = + (input_size - filter_size + padding_1 + padding_2) / stride + 1; + } else { + output_size = + (input_size - filter_size + padding_1 + padding_2 + stride - 1) / + stride + + 1; + } + PADDLE_ENFORCE_GT( + output_size, + 0, + errors::InvalidArgument( + "the output size must be greater than 0. But received: " + "output_size = %d due to the settings of input_size(%d), " + "padding(%d,%d), " + "k_size(%d) and stride(%d). 
Please check again!", + output_size, + input_size, + padding_1, + padding_2, + filter_size, + stride)); + return output_size; +} + +inline int MaxPoolOutputSize(int input_size, + int filter_size, + int padding, + int stride) { + int output_size = (input_size - filter_size + 2 * padding) / stride + 1; + return output_size; +} + +template +inline void UpdatePadding(std::vector* paddings, + const bool global_pooling, + const bool adaptive, + const std::string padding_algorithm, + const DDim data_dims, + const std::vector& strides, + const std::vector& kernel_size) { + // set padding size == data_dims.size() * 2 + auto data_shape = vectorize(data_dims); + if (static_cast(paddings->size()) == data_dims.size()) { + for (int i = 0; i < data_dims.size(); ++i) { + T copy_pad = *(paddings->begin() + 2 * i); + paddings->insert(paddings->begin() + 2 * i + 1, copy_pad); + } + } else { + PADDLE_ENFORCE_EQ(data_dims.size() * 2, + paddings->size(), + errors::InvalidArgument( + "Paddings size %d should be the same or twice as the " + "pooling size %d.", + paddings->size(), + data_dims.size() * 2)); + } + + // when padding_algorithm is "VALID" or "SAME" + if (padding_algorithm == "SAME") { + for (int i = 0; i < data_dims.size(); ++i) { + T out_size = (data_dims[i] + strides[i] - 1) / strides[i]; + T pad_sum = + std::max((out_size - 1) * strides[i] + kernel_size[i] - data_shape[i], + static_cast(0)); + T pad_0 = pad_sum / 2; + T pad_1 = pad_sum - pad_0; + *(paddings->begin() + i * 2) = pad_0; + *(paddings->begin() + i * 2 + 1) = pad_1; + } + } else if (padding_algorithm == "VALID") { + for (auto it = paddings->begin(); it != paddings->end(); it++) { + *it = 0; + } + } + + // if global_pooling == true or adaptive == true, padding will be ignore + if (global_pooling || adaptive) { + for (auto it = paddings->begin(); it != paddings->end(); it++) { + *it = 0; + } + } +} + +template +inline void UpdateKernelSize(std::vector* kernel_size, + const DDim data_dims) { + kernel_size->resize(static_cast(data_dims.size())); + for (size_t i = 0; i < kernel_size->size(); ++i) { + *(kernel_size->begin() + i) = static_cast(data_dims[i]); + } +} + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h index 5834f091d9a4de02afe7488ededc0189ae6f21d0..85c371e9f9d450c55741b901eff6f102fa6c3f6f 100644 --- a/paddle/phi/kernels/funcs/reduce_function.h +++ b/paddle/phi/kernels/funcs/reduce_function.h @@ -14,8 +14,8 @@ #pragma once -// CUDA and HIP use same api -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +// CUDA, XPU and HIP use same api +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(__xpu__) #include #include @@ -220,7 +220,7 @@ struct IndexCalculator { phi::Array dims; phi::Array strides; phi::Array reduce_strides; -#ifndef PADDLE_WITH_XPU2 +#ifndef PADDLE_WITH_XPU_KP phi::Array divmoders; #endif }; @@ -231,81 +231,65 @@ struct ReduceIndexMapping { HOSTDEVICE explicit ReduceIndexMapping(const kps::DimConfig& dims) : dim(dims) {} +#ifdef PADDLE_WITH_XPU_KP __device__ __forceinline__ int BlockIdX() { -#ifdef PADDLE_WITH_XPU2 if (ReduceLastDim) { return (cluster_id() / dim.split_num_x % dim.split_num_y); } else { return cluster_id() % dim.split_num_x; } -#else - return blockIdx.x; -#endif } __device__ __forceinline__ int BlockIdY() { -#ifdef PADDLE_WITH_XPU2 if (ReduceLastDim) { return (cluster_id() % dim.split_num_x); } else { return (cluster_id() / dim.split_num_x % dim.split_num_y); } -#else - return 
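// Editor's worked example (standalone, mirrors PoolOutputSize above), assuming
// the usual pooling shape arithmetic:
//   floor mode: out = (in - k + pad1 + pad2) / stride + 1
//   ceil  mode: out = (in - k + pad1 + pad2 + stride - 1) / stride + 1
inline int PoolOut(int in_size, int k, int pad1, int pad2, int stride,
                   bool ceil_mode) {
  int numerator = in_size - k + pad1 + pad2;
  return (ceil_mode ? numerator + stride - 1 : numerator) / stride + 1;
}
// in_size = 6, k = 3, pad1 = pad2 = 0, stride = 2:
//   floor mode -> 3 / 2 + 1 = 2;   ceil mode -> 4 / 2 + 1 = 3.
// For "SAME" padding (UpdatePadding above): out = ceil(in / stride), and the
// total pad max((out - 1) * stride + k - in, 0) is split into pad_0 / pad_1.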
blockIdx.y; -#endif } - __device__ __forceinline__ int BlockDimX() { -#ifdef PADDLE_WITH_XPU2 - return dim.deal_size_x; -#else - return blockDim.x; -#endif - } + __device__ __forceinline__ int BlockDimX() { return dim.deal_size_x; } - __device__ __forceinline__ int BlockDimY() { -#ifdef PADDLE_WITH_XPU2 - return 1; -#else - return blockDim.y; -#endif - } + __device__ __forceinline__ int BlockDimY() { return 1; } __device__ __forceinline__ int GridDimX() { -#ifdef PADDLE_WITH_XPU2 if (ReduceLastDim) { return dim.split_num_y; } else { return dim.split_num_x; } -#else - return gridDim.x; -#endif } __device__ __forceinline__ int GridDimY() { -#ifdef PADDLE_WITH_XPU2 if (ReduceLastDim) { return dim.split_num_x; } else { return dim.split_num_y; } -#else - return gridDim.y; -#endif } __device__ __forceinline__ int GetLoopSize() { -#ifdef PADDLE_WITH_XPU2 if (ReduceLastDim) { return dim.deal_size_y; } else { return dim.deal_size_x; } + } #else - return 1; + __device__ __forceinline__ int BlockIdX() { return blockIdx.x; } + + __device__ __forceinline__ int BlockIdY() { return blockIdx.y; } + + __device__ __forceinline__ int BlockDimX() { return blockDim.x; } + + __device__ __forceinline__ int BlockDimY() { return blockDim.y; } + + __device__ __forceinline__ int GridDimX() { return gridDim.x; } + + __device__ __forceinline__ int GridDimY() { return gridDim.y; } + + __device__ int GetLoopSize() { return 1; } #endif - } }; // when reduce_type == kReduceLastDim this struct will be used @@ -341,7 +325,7 @@ struct ReduceConfig { // when should_reduce_again is true, we need malloc temp space for temp data void SetOutputData(Ty* y_data, - const phi::GPUContext& dev_ctx, + const KPDevice& dev_ctx, phi::DenseTensor* tmp) { if (should_reduce_again) { tmp->Resize(phi::make_ddim( @@ -640,9 +624,7 @@ struct ReduceConfig { int blocking_size; bool should_reduce_again; bool reduce_last_dim; - Ty* output_data; - dim3 block; dim3 grid; }; @@ -770,9 +752,10 @@ __global__ void ReduceAnyKernel(const Tx* x, kps::Reduce( &reduce_var, &reduce_var, reducer, reduce_last_dim); - if (need_store) { - y[store_offset + i] = static_cast(reduce_var); - } + + Ty result = static_cast(reduce_var); + kps::details::WriteData( + y + store_offset + i, &result, static_cast(need_store)); } } @@ -882,30 +865,18 @@ static void LaunchReduceKernel(const Tx* x_data, dim.SetRem(config.reduce_num % config.block.x, 0, 0); #ifdef PADDLE_WITH_XPU_KP - ReduceAnyKernel<<<8, 64, 0, stream>>>( - x_data, - config.output_data, - reducer, - transform, - init, - config.reduce_num, - config.left_num, - config.reduce_last_dim, - reduce_index_calculator, - left_index_calculator, - dim); + auto grid_num = 8; + auto block_num = 64; #else + auto grid_num = config.grid; + auto block_num = config.block; +#endif ReduceAnyKernel<<>>( + OneDimIndexCal><<>>( x_data, config.output_data, reducer, @@ -917,7 +888,6 @@ static void LaunchReduceKernel(const Tx* x_data, reduce_index_calculator, left_index_calculator, dim); -#endif } else { int reduce_rank = config.reduce_strides.size(); @@ -938,30 +908,18 @@ static void LaunchReduceKernel(const Tx* x_data, dim.SetRem(config.reduce_num % config.block.x, 0, 0); #ifdef PADDLE_WITH_XPU_KP - ReduceAnyKernel<<<8, 64, 0, stream>>>( - x_data, - config.output_data, - reducer, - transform, - init, - config.reduce_num, - config.left_num, - config.reduce_last_dim, - reduce_index_calculator, - left_index_calculator, - dim); + auto grid_num = 8; + auto block_num = 64; #else + auto grid_num = config.grid; + auto block_num = config.block; 
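// Editor's note (standalone, hypothetical helper, not part of this patch): the
// ReduceAnyKernel change above swaps an `if (need_store)` branch for a store
// routine whose element count is the guard, so passing 0 or 1 turns the write
// into a no-op without extra control flow at the call site. A scalar sketch of
// that pattern:
template <typename T>
inline void GuardedWrite(T* dst, const T* src, int n) {
  for (int i = 0; i < n; ++i) dst[i] = src[i];
}
// GuardedWrite(y + store_offset + i, &result, need_store ? 1 : 0);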
+#endif ReduceAnyKernel<<>>( + IndexCalculator><<>>( x_data, config.output_data, reducer, @@ -973,7 +931,6 @@ static void LaunchReduceKernel(const Tx* x_data, reduce_index_calculator, left_index_calculator, dim); -#endif } if (config.should_reduce_again) { @@ -993,22 +950,9 @@ static void LaunchReduceKernel(const Tx* x_data, kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0); dim.SetRem(config.left_num % block.x, 0, 0); #ifdef PADDLE_WITH_XPU_KP - ReduceHigherDimKernel< - Ty, - Ty, - MPType, - ReduceOp, - kps::IdentityFunctor><<<8, 64, 0, stream>>>( - config.output_data, - y_data, - reducer, - kps::IdentityFunctor(), - init, - config.grid.y, - config.left_num, - config.grid.y, - dim); -#else + grid = 8; + block = 64; +#endif ReduceHigherDimKernel< Ty, Ty, @@ -1024,7 +968,6 @@ static void LaunchReduceKernel(const Tx* x_data, config.left_num, config.grid.y, dim); -#endif } } @@ -1038,7 +981,7 @@ CubTensorReduceImpl(const Tx* x_data, Ty* y_data, const TransformOp& transform, int reduce_num, - const phi::GPUContext& dev_ctx, + const KPDevice& dev_ctx, KPStream stream) { auto reducer = ReduceOp(); cub::TransformInputIterator trans_x(x_data, @@ -1077,7 +1020,7 @@ CubTensorReduceImpl(const Tx* x_data, Ty* y_data, const TransformOp& transform, int reduce_num, - const phi::GPUContext& dev_ctx, + const KPDevice& dev_ctx, KPStream stream) { PADDLE_THROW(phi::errors::InvalidArgument( "Tx should not be float16 when using cub::DeviceReduce::Reduce().")); @@ -1087,12 +1030,16 @@ template class ReduceOp, typename TransformOp> -void ReduceKernel(const phi::GPUContext& dev_ctx, +void ReduceKernel(const KPDevice& dev_ctx, const phi::DenseTensor& x, phi::DenseTensor* y, const TransformOp& transform, const std::vector& origin_reduce_dims) { +#ifdef PADDLE_WITH_XPU_KP + auto stream = dev_ctx.x_context()->xpu_stream; +#else auto stream = dev_ctx.stream(); +#endif dev_ctx.Alloc(y); auto x_dim = phi::vectorize(x.dims()); @@ -1149,11 +1096,17 @@ void ReduceKernel(const phi::GPUContext& dev_ctx, 0); #ifdef PADDLE_WITH_XPU_KP + auto grid_num = 8; + auto block_num = 64; +#else + auto grid_num = config.grid; + auto block_num = config.block; +#endif ReduceHigherDimKernel, - TransformOp><<<8, 64, 0, stream>>>( + TransformOp><<>>( x_data, config.output_data, reducer, @@ -1163,23 +1116,6 @@ void ReduceKernel(const phi::GPUContext& dev_ctx, config.left_num, config.blocking_size, dim); -#else - ReduceHigherDimKernel< - Tx, - Ty, - MPType, - ReduceOp, - TransformOp><<>>( - x_data, - config.output_data, - reducer, - transform, - reducer.initial(), - config.reduce_num, - config.left_num, - config.blocking_size, - dim); -#endif if (config.should_reduce_again) { dim3 block = dim3(config.block.x, 1, 1); @@ -1189,22 +1125,9 @@ void ReduceKernel(const phi::GPUContext& dev_ctx, dim2.SetRem(config.left_num % config.block.x, 0, 0); #ifdef PADDLE_WITH_XPU_KP - ReduceHigherDimKernel< - Ty, - Ty, - MPType, - ReduceOp, - kps::IdentityFunctor><<<8, 64, 0, stream>>>( - config.output_data, - y_data, - reducer, - kps::IdentityFunctor(config.grid.y), - reducer.initial(), - config.grid.y, - config.left_num, - config.grid.y, - dim2); -#else + grid = 8; + block = 64; +#endif ReduceHigherDimKernel< Ty, Ty, @@ -1220,7 +1143,6 @@ void ReduceKernel(const phi::GPUContext& dev_ctx, config.left_num, config.grid.y, dim2); -#endif } return; } diff --git a/paddle/phi/kernels/funcs/reduce_functor.h b/paddle/phi/kernels/funcs/reduce_functor.h index 4e83d0fa3710324f5fddd729d10cb8a541791562..b793afb63b1dca9bbd8ad09b83461567de6371ad 100644 --- 
a/paddle/phi/kernels/funcs/reduce_functor.h +++ b/paddle/phi/kernels/funcs/reduce_functor.h @@ -49,5 +49,106 @@ struct MaxFunctor { } }; +//////// Min Functor /////// +struct MinFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->minimum(dim); + } +}; + +//////// All Functor /////// +struct AllFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->all(dim); + } +}; + +//////// Any Functor /////// +struct AnyFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->any(dim); + } +}; + +struct MeanGradFunctor { + template + void operator()(const DeviceContext& place, + X* x, + Y* y, + DX* dx, + DY* dy, + const Dim& dim, + int size) { + dx->device(place) = dy->broadcast(dim) / dx->constant(size); + } +}; + +struct SumGradFunctor { + template + void operator()(const DeviceContext& place, + X* x, + Y* y, + DX* dx, + DY* dy, + const Dim& dim, + int size) { + dx->device(place) = dy->broadcast(dim); + } +}; + +struct ProdGradFunctor { + template + void operator()(const DeviceContext& place, + X* x, + Y* y, + DX* dx, + DY* dy, + const Dim& dim, + int size) { + dx->device(place) = dy->broadcast(dim) * y->broadcast(dim) * x->inverse(); + } +}; + +struct MaxOrMinGradFunctor { + template + void operator()(const DeviceContext& place, + X* x, + Y* y, + DX* dx, + DY* dy, + const Dim& dim, + int size) { + auto equals = (*x) == y->broadcast(dim); + auto ones = dx->constant(1); + auto zeros = dx->constant(0); + // If there are multiple minimum or maximum elements, the subgradient of + // each is the set [0, 1], and we pass gradient to all of them here. + dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros); + } +}; + } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/reduce_grad_functions.h b/paddle/phi/kernels/funcs/reduce_grad_functions.h index 3488b6f2f86b20e0b758f3aa75a6739c40cd81db..11197a52261d7d0fd7618d2c7c0de09b57abe0d8 100644 --- a/paddle/phi/kernels/funcs/reduce_grad_functions.h +++ b/paddle/phi/kernels/funcs/reduce_grad_functions.h @@ -41,14 +41,14 @@ void ReduceGradFunctor(const Context& dev_ctx, Eigen::array broadcast_dim; for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1; - int broad_cats_times = 1; + int broad_cast_times = 1; for (size_t i = 0; i < dims_ref.size(); ++i) { if (dims_ref[i] < 0) { dims_ref[i] = x_rank + dims_ref[i]; } reduced_dims_v[dims_ref[i]] = 1; broadcast_dim[dims_ref[i]] = x_dims[dims_ref[i]]; - broad_cats_times *= x_dims[dims_ref[i]]; + broad_cast_times *= x_dims[dims_ref[i]]; } auto reduced_dims = phi::make_ddim(reduced_dims_v); auto x_reduce = EigenTensor::From(input1, reduced_dims); @@ -62,7 +62,7 @@ void ReduceGradFunctor(const Context& dev_ctx, &x_grad, &x_reduce_grad, broadcast_dim, - broad_cats_times); + broad_cast_times); } inline void GetOriginDimFromShuffled(const DDim& src_dim, diff --git a/paddle/phi/kernels/funcs/segment_pooling.cc b/paddle/phi/kernels/funcs/segment_pooling.cc index bf4a21f37223dab5a67649406496e9828b0bcf3f..fbd744430aa11ab1a5a17c76b6d37c10c3085556 100644 --- a/paddle/phi/kernels/funcs/segment_pooling.cc +++ b/paddle/phi/kernels/funcs/segment_pooling.cc @@ -149,10 +149,19 @@ template class SegmentPoolFunctor; template class SegmentPoolFunctor; template class SegmentPoolFunctor; template class SegmentPoolFunctor; +template class SegmentPoolFunctor; +template class SegmentPoolFunctor; +template class 
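// Editor's illustration (standalone, not part of this patch): a scalar version
// of MaxOrMinGradFunctor above. The gradient of a max/min reduction is routed
// to every element equal to the reduced value; ties all receive the upstream
// gradient, as the subgradient comment in the functor notes.
#include <vector>

inline std::vector<float> MaxReduceGrad(const std::vector<float>& x, float y,
                                        float dy) {
  std::vector<float> dx(x.size(), 0.0f);
  for (size_t i = 0; i < x.size(); ++i) {
    if (x[i] == y) dx[i] = dy;  // equals.select(ones, zeros) * dy
  }
  return dx;
}
// x = {1, 3, 3, 2}, y = max(x) = 3, dy = 1  ->  dx = {0, 1, 1, 0}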
SegmentPoolFunctor; +template class SegmentPoolFunctor; + template class SegmentPoolGradFunctor; template class SegmentPoolGradFunctor; template class SegmentPoolGradFunctor; template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/segment_pooling.cu b/paddle/phi/kernels/funcs/segment_pooling.cu index 305cd39f077bc359543b399a8775b5a92a2eb00d..95606b152672916116813c97cbbc0856d33e49a7 100644 --- a/paddle/phi/kernels/funcs/segment_pooling.cu +++ b/paddle/phi/kernels/funcs/segment_pooling.cu @@ -453,10 +453,19 @@ template class SegmentPoolFunctor; template class SegmentPoolFunctor; template class SegmentPoolFunctor; template class SegmentPoolFunctor; +template class SegmentPoolFunctor; +template class SegmentPoolFunctor; +template class SegmentPoolFunctor; +template class SegmentPoolFunctor; + template class SegmentPoolGradFunctor; template class SegmentPoolGradFunctor; template class SegmentPoolGradFunctor; template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/select_impl.cu.h b/paddle/phi/kernels/funcs/select_impl.cu.h new file mode 100644 index 0000000000000000000000000000000000000000..3a1d9b8ea7a7a36c31f31f7fc60dffc1f827d34e --- /dev/null +++ b/paddle/phi/kernels/funcs/select_impl.cu.h @@ -0,0 +1,447 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +// CUDA and HIP use same api +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +#include +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/primitive/kernel_primitives.h" + +namespace kps = phi::kps; + +namespace phi { +namespace funcs { +using Mode = kps::details::ReduceMode; + +/* +* Count how many of the data being processed by the current block are true +* 1. Load data from global memory and cast from bool to int64_t +* 2. Get result of this thread according to thread reduce +* 3. Get result of this block according to block reduce +* 4. 
first block store 0 and current result +*/ +template +struct NonZeroFunctor { + HOSTDEVICE NonZeroFunctor() {} + HOSTDEVICE inline T operator()(const T in) { + if (in) { + return static_cast(1); + } else { + return static_cast(0); + } + } +}; + +template +__device__ void GetBlockCountImpl(const InT *in, + OutT *out, + int num, + int repeat) { + InT in_data[VecSize]; + OutT temp[VecSize]; + OutT result = static_cast(0.0f); + using Add = kps::AddFunctor; + using Cast = NonZeroFunctor; + int store_fix = BLOCK_ID_X + repeat * GRID_NUM_X; + + kps::Init(&in_data[0], static_cast(0.0f)); + kps::ReadData(&in_data[0], in, num); + kps::ElementwiseUnary( + &temp[0], &in_data[0], Cast()); + kps::Reduce( + &result, &temp[0], Add(), true); + kps::Reduce( + &result, &result, Add(), true); + if (store_fix == 0) { + // first block's fix_size = 0; + OutT tmp = static_cast(0.0f); + kps::WriteData(out + store_fix, &tmp, 1); + } + + // store num of this block + kps::WriteData(out + store_fix + 1, &result, 1); +} + +// Count how many data is not zero in current block +template +__global__ void GetBlockCountKernel(const InT *in, + OutT *out, + int64_t numel, + int64_t main_offset) { + int data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; + int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; + int repeat = 0; + for (; data_offset < main_offset; data_offset += stride) { + GetBlockCountImpl( + in + data_offset, out, BLOCK_NUM_X * VecSize, repeat); + repeat++; // to get the real blockIdx + } + + int num = numel - data_offset; + if (num > 0) { + GetBlockCountImpl( + in + data_offset, out, num, repeat); + } +} + +/* +* Get block num prefix us one block, VecSize must be 2 +* 1. Each thread load 2 data : threadIdx.x and threadIdx.x + blockDimx.x +* 2. Cumsum limitation is blockDim.x must be less than 512 +*/ + +template +__device__ void CumsumImpl( + const InT *in, OutT *out, OutT *pre_cumsum, int num, Functor func) { + __shared__ OutT max_thread_data; + OutT temp[VecSize]; + InT arg[VecSize]; + OutT result[VecSize]; + // init data_pr + kps::Init(&arg[0], static_cast(0.0f)); + // set pre_cumsum + kps::Init(&temp[0], *pre_cumsum); + // load data to arg + kps::ReadData( + &arg[0], in, num, 1, BLOCK_NUM_X, 1); + // block cumsum + kps::Cumsum(&result[0], &arg[0], func); + // result = cumsum_result + pre_cumsum + kps::ElementwiseBinary( + &result[0], &result[0], &temp[0], func); + // get the last prefix sum + if ((THREAD_ID_X == BLOCK_NUM_X - 1) && !IsBoundary) { + max_thread_data = result[VecSize - 1]; + } + __syncthreads(); + // update pre_cumsum + *pre_cumsum = max_thread_data; + kps::WriteData( + out, &result[0], num, 1, BLOCK_NUM_X, 1); +} + +// Compute this store_offset of this block +template +__global__ void CumsumOneBlock( + const InT *in, OutT *out, int numel, int main_offset, Functor func) { + int stride = BLOCK_NUM_X * VecSize; + int offset = 0; + OutT pre_cumsum = static_cast(0); + for (; offset < main_offset; offset += stride) { + CumsumImpl( + in + offset, out + offset, &pre_cumsum, BLOCK_NUM_X * VecSize, func); + } + + int num = numel - offset; + if (num > 0) { + CumsumImpl( + in + offset, out + offset, &pre_cumsum, num, func); + } +} + +template +struct SelectCaller { + __device__ void inline operator()(OutT *store_data, + const MT *mask_data, + const InT *in, + Functor func, + int num, + int data_offset) { + // where_index op + IdT index_reg[VecSize]; + // Set data index of global + kps::InitWithDataIndex(&index_reg[0], data_offset); + // Get store data according to mask_idt + kps::OperatorTernary( + 
store_data, mask_data, &index_reg[0], func, VecSize); + } +}; + +template +struct SelectCaller { // masked_select + __device__ void inline operator()(OutT *store_data, + const MT *mask_data, + const InT *in, + Functor func, + int num, + int data_offset) { + InT in_data[VecSize]; + kps::ReadData(&in_data[0], in, num); + // Get store data according to mask_idt + kps::OperatorTernary( + store_data, mask_data, &in_data[0], func, VecSize); + } +}; + +/** +* Get mask's index if mask == true +*/ +template // SelectType = 1 Mask_select else where_index +__device__ void +SelectKernelImpl(OutT *out, + const MT *mask, + const InT *in, + Functor func, + int num, + int data_offset, + int store_rank) { + const int kCVecSize = 2; + // each thread cumsum 2 data + using IdT = int64_t; + // Set index data type + using Add = kps::AddFunctor; // for cumsum + using Cast = NonZeroFunctor; // for mask + + IdT init_idx = static_cast(0.0f); + MT init_mask = static_cast(0.0f); + + IdT num_thread[kCVecSize]; + IdT cumsum_thread[kCVecSize]; + + OutT store_data[VecSize * phi::DDim::kMaxRank]; + MT mask_data[VecSize]; + IdT mask_idt[VecSize]; + // init data_pr + kps::Init(&cumsum_thread[0], init_idx); + kps::Init(&num_thread[0], init_idx); + kps::Init(&mask_data[0], init_mask); + // Load mask + kps::ReadData(&mask_data[0], mask, num); + // Cast from MT to int + kps::ElementwiseUnary( + &mask_idt[0], &mask_data[0], Cast()); + // Get the num of thread only num_thread[1] has data + kps::Reduce( + &num_thread[0], &mask_idt[0], Add(), true); + // Get cumsum_thread cumsum from 0 to num_thread cumsum_thread[0] is the + // thread_fix + kps::Cumsum(&cumsum_thread[0], &num_thread[0], Add()); + // Get store data(index) according to mask_idt + SelectCaller + compute; + compute(&store_data[0], &mask_data[0], in, func, num, data_offset); + // get thread_fix + int thread_fix = + (static_cast(cumsum_thread[0] - num_thread[0]) * store_rank); + // get how many data need to store + int store_num = static_cast(num_thread[0]) * store_rank; + // thread store num data, each thread may has different num + kps::details::WriteData(out + thread_fix, &store_data[0], store_num); +} + +template +__global__ void SelectKernel(OutT *out, + const MT *mask, + const InT *in, + CT *cumsum, + Functor func, + const int64_t numel, + int64_t main_offset, + int store_rank) { + int data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; + int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; + int repeat = 0; + int size = VecSize * BLOCK_ID_X; + for (; data_offset < main_offset; data_offset += stride) { + // Cumsum index + int idx_cumsum = repeat * GRID_NUM_X + BLOCK_ID_X; + // niuliling todo: us ReadData API + int block_store_offset = cumsum[idx_cumsum]; + SelectKernelImpl( + out + block_store_offset * store_rank, + mask + data_offset, + in + data_offset, + func, + size, + data_offset, + store_rank); + repeat++; + } + + int num = numel - data_offset; + if (num > 0) { + // Cumsum index + int idx_cumsum = repeat * GRID_NUM_X + BLOCK_ID_X; + // niuliling todo: us ReadData API + int block_store_offset = static_cast(cumsum[idx_cumsum]); + SelectKernelImpl( + out + block_store_offset * store_rank, + mask + data_offset, + in + data_offset, + func, + num, + data_offset, + store_rank); + } +} + +inline int64_t Floor(int64_t in, int64_t div) { return in / div * div; } + +// SelectData = 1 then masked_select; SelectData = 0 then where_index +template +void SelectKernel(const KPDevice &dev_ctx, + const DenseTensor &condition, + const DenseTensor &in_data, + DenseTensor *out, + 
Functor func) { + const MT *cond_data = condition.data(); + const int64_t numel = condition.numel(); + auto dims = condition.dims(); + int rank = SelectData ? 1 : dims.size(); + const InT *in_data_ptr = SelectData ? in_data.data() : nullptr; + // calculate the inclusive prefix sum of "true_num_array" + // to get the index of "out" tensor, + // and the total number of cond_data[i]==true. + // Example: + // condition: F T T F F F T T + // before: 0 1 1 0 0 0 1 1 + // after: 0 1 2 2 2 2 3 4 + // out: 1 2 6 7 + // alloc for cpu + using CT = int64_t; // set Count_data Type + const int t_size = sizeof(CT); + + const paddle::platform::CUDAPlace &cuda_place = dev_ctx.GetPlace(); + paddle::platform::CPUPlace cpu_place = paddle::platform::CPUPlace(); + + // 1.1 get stored data num of per block + int total_true_num = 0; // init + const int kVecSize = 4; +#ifdef PADDLE_WITH_XPU_KP + int block = 64; + auto stream = dev_ctx.x_context()->xpu_stream; + const int num_per_block = kVecSize * block; + const int need_grids = (numel + num_per_block - 1) / num_per_block; + const int grid = std::min(need_grids, 8); +#else + const int block = 256; + const int num_per_block = kVecSize * block; + const int need_grids = (numel + num_per_block - 1) / num_per_block; + const int grid = std::min(need_grids, 256); + auto stream = dev_ctx.stream(); +#endif + const int64_t main_offset = Floor(numel, num_per_block); + // 1.2 alloc tmp data for CoutBlock + const int size_count_block = need_grids + 1; + std::vector dims_vec = {size_count_block * 2}; + ScalarArray dims_array(dims_vec); + DenseTensor count_mem = phi::Empty(dev_ctx, dims_array); + CT *count_data = count_mem.data(); + // 1.3 launch CountKernl + GetBlockCountKernel<<>>( + cond_data, count_data, numel, main_offset); + // 2.1 alloc cumsum data for CoutBlock prefix + DenseTensor cumsum_mem = phi::Empty(dev_ctx, dims_array); + CT *cumsum_data = cumsum_mem.data(); + // 2.2 get prefix of count_data for real out_index + const int kCumVesize = 2; + const int block_c = 256; + const int main_offset_c = Floor(size_count_block, (kCumVesize * block_c)); + using Add = kps::AddFunctor; + CumsumOneBlock<<<1, block_c, 0, stream>>>( + count_data, cumsum_data, size_count_block, main_offset_c, Add()); + // 3.1 set temp ptr for in; + // 3.1 alloc for out + // 3.1.1 get true_num for gpu place the last cumsum is the true_num + paddle::memory::Copy(cpu_place, + &total_true_num, + cuda_place, + cumsum_data + need_grids, + t_size, + dev_ctx.stream()); + + dev_ctx.Wait(); + // 3.1.2 allock for out with total_true_num + std::vector out_dim = {static_cast(total_true_num)}; + if (SelectData == 0) { // where_index + out_dim.push_back(rank); + } + out->Resize(phi::make_ddim(out_dim)); + auto out_data = out->mutable_data(cuda_place); + // 3.2 get true data's index according to cond_data and cumsum_data + if (total_true_num <= 0) return; + SelectKernel<<>>(out_data, + cond_data, + in_data_ptr, + cumsum_data, + func, + numel, + main_offset, + rank); +} + +} // namespace funcs +} // namespace phi + +#endif diff --git a/paddle/phi/kernels/funcs/sparse/common_shape.h b/paddle/phi/kernels/funcs/sparse/common_shape.h new file mode 100644 index 0000000000000000000000000000000000000000..3617e3cd2f406d889c0b79ecfc34a68d19259a17 --- /dev/null +++ b/paddle/phi/kernels/funcs/sparse/common_shape.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
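
For reference, the count / prefix-sum / scatter scheme used by the SelectKernel host function above (GetBlockCountKernel, then CumsumOneBlock, then the scatter kernel) can be written serially as the sketch below. This is illustrative only and not part of the patch: the function name, the tiny block size of 4, and the serial loops are assumptions that stand in for the CUDA/XPU launches.

#include <algorithm>
#include <cstdint>
#include <vector>

// Serial sketch of where_index-style compaction: count per block,
// exclusive prefix sum of the counts, then scatter each block's indices
// starting at its offset.
std::vector<int64_t> WhereIndexSketch(const std::vector<bool>& cond) {
  const int64_t n = static_cast<int64_t>(cond.size());
  const int64_t block = 4;  // stands in for BLOCK_NUM_X * VecSize
  const int64_t num_blocks = (n + block - 1) / block;
  std::vector<int64_t> count(num_blocks + 1, 0);
  for (int64_t i = 0; i < n; ++i) {
    if (cond[i]) ++count[i / block + 1];
  }
  for (int64_t b = 1; b <= num_blocks; ++b) {
    count[b] += count[b - 1];  // count[b] is now block b's write offset
  }
  std::vector<int64_t> out(count[num_blocks]);  // total number of trues
  for (int64_t b = 0; b < num_blocks; ++b) {
    int64_t write = count[b];
    const int64_t end = std::min(n, (b + 1) * block);
    for (int64_t i = b * block; i < end; ++i) {
      if (cond[i]) out[write++] = i;
    }
  }
  return out;  // e.g. {F,T,T,F,F,F,T,T} -> {1, 2, 6, 7}, matching the comment above
}
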
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+
+#include "paddle/phi/core/ddim.h"
+
+namespace phi {
+namespace funcs {
+namespace sparse {
+
+inline const DDim InferDenseDims(const DDim& x_dims,
+                                 const int64_t sparse_dim,
+                                 const int64_t non_zero_num) {
+  auto dense_dim = x_dims.size() - sparse_dim;
+  DDim values_dims;
+  if (dense_dim > 0) {
+    std::vector<int64_t> dense_dim_vec(dense_dim + 1);
+    dense_dim_vec[0] = non_zero_num;
+    memcpy(&dense_dim_vec[1],
+           x_dims.Get() + sparse_dim,
+           dense_dim * sizeof(x_dims[0]));
+    values_dims = phi::make_ddim(dense_dim_vec);
+  } else {
+    values_dims = phi::make_ddim({non_zero_num});
+  }
+  return values_dims;
+}
+
+}  // namespace sparse
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/sparse/convolution.h b/paddle/phi/kernels/funcs/sparse/convolution.h
new file mode 100644
index 0000000000000000000000000000000000000000..19f1f3d3cd2fadff918da25cc873944e927a473a
--- /dev/null
+++ b/paddle/phi/kernels/funcs/sparse/convolution.h
@@ -0,0 +1,190 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#pragma once + +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" + +namespace phi { +namespace funcs { +namespace sparse { + +struct Dims4D { + int dims[4]; + Dims4D(const int batch, const int x, const int y, const int z) { + dims[0] = batch; + dims[1] = z; + dims[2] = y; + dims[3] = x; + } + HOSTDEVICE const int& operator[](int i) const { return dims[i]; } +}; + +// Judge whether the current position x is in (lower, upper) +inline HOSTDEVICE bool Check(const int& x, + const int& kx, + const int& pad, + const int& stride, + const int dilation, + const int kdim, + const int xdim) { + const int lower = x - dilation * kx + pad; + const int uper = x + (kdim - kx - 1) * dilation - pad; + return (lower >= 0 && lower % stride == 0 && uper < xdim); +} + +// Check whether the current position(x, y, z) is legal: +// Judge the minimum and maximum values at each latitude +inline HOSTDEVICE bool Check(const Dims4D& dims, + const Dims4D& kernel_dims, + const Dims4D& paddings, + const Dims4D& dilations, + const Dims4D& strides, + const int x, + const int y, + const int z, + const int kx, + const int ky, + const int kz) { + bool x_valid = Check( + x, kx, paddings[3], strides[3], dilations[3], kernel_dims[3], dims[3]); + bool y_valid = Check( + y, ky, paddings[2], strides[2], dilations[2], kernel_dims[2], dims[2]); + bool z_valid = Check( + z, kz, paddings[1], strides[1], dilations[1], kernel_dims[1], dims[1]); + return (x_valid && y_valid && z_valid); +} + +template +inline HOSTDEVICE int PointToIndex(const int& batch, + const int& x, + const int& y, + const int& z, + const Dim& dims) { + return batch * dims[1] * dims[2] * dims[3] + z * dims[2] * dims[3] + + y * dims[3] + x; +} + +// TODO(zhangkaihuo): use division and multiply to optimize +// modulo operation +template +inline HOSTDEVICE void IndexToPoint( + const int index, const Dim& dims, int* batch, int* x, int* y, int* z) { + int n = index; + *x = n % dims[3]; + n /= dims[3]; + *y = n % dims[2]; + n /= dims[2]; + *z = n % dims[1]; + n /= dims[1]; + *batch = n; +} + +inline void GetOutShape(const DDim& x_dims, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + DDim* out_dims) { + PADDLE_ENFORCE_EQ( + x_dims.size(), + 5, + phi::errors::InvalidArgument("the shape of x should be (N, D, H, W, C)")); + PADDLE_ENFORCE_EQ(kernel_sizes.size(), + 5, + phi::errors::InvalidArgument( + "the shape of kernel should be (D, H, W, C, OC)")); + + // infer out shape + (*out_dims)[0] = x_dims[0]; + (*out_dims)[4] = kernel_sizes[4]; + for (int i = 1; i < 4; i++) { + (*out_dims)[i] = (x_dims[i] + 2 * paddings[i - 1] - + dilations[i - 1] * (kernel_sizes[i - 1] - 1) - 1) / + strides[i - 1] + + 1; + } +} + +inline void ResetSubmKernelSizeAndStrides(const DDim& kernel_dims, + std::vector* paddings, + std::vector* strides) { + for (uint64_t i = 0; i < paddings->size(); i++) { + (*paddings)[i] = kernel_dims[i] / 2; + (*strides)[i] = 1; + } +} + +template +inline void SubmPreProcess(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const DenseTensor& out_grad, + const int in_channels, + const int out_channels, + const int half_kernel_size, + DenseTensor* kernel_grad, + DenseTensor* x_grad) { + auto blas = phi::funcs::GetBlas(dev_ctx); + T* d_kernel_ptr = kernel_grad->data(); + blas.GEMM(CblasTrans, + CblasNoTrans, + x.non_zero_elements().dims()[1], + out_grad.dims()[1], + x.non_zero_elements().dims()[0], + static_cast(1), + 
x.non_zero_elements().data(), + out_grad.data(), + static_cast(0), + d_kernel_ptr + half_kernel_size * in_channels * out_channels); + + // call gemm: d_x = out_grad * transpose(kernel) + // (n, out_channels) * (out_channels, in_channels) + T* x_grad_ptr = x_grad->data(); + blas.GEMM(CblasNoTrans, + CblasTrans, + out_grad.dims()[0], + in_channels, + out_grad.dims()[1], + static_cast(1), + out_grad.data(), + kernel.data() + half_kernel_size * in_channels * out_channels, + static_cast(0), + x_grad_ptr); +} + +inline const std::vector PoolResetKernel( + const std::vector& kernel_sizes, + const int in_channels, + const int out_channels) { + std::vector res(kernel_sizes); + res.resize(5); + res[3] = in_channels; + res[4] = out_channels; + return res; +} + +inline void PrefixSum(const int* counter, int* offsets, const int n) { + int offset = 0; + for (int i = 0; i < n; i++) { + offsets[i] = offset; + offset += counter[i]; + } + offsets[n] = offset; +} + +} // namespace sparse +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/tril_triu_compute.h b/paddle/phi/kernels/funcs/tril_triu_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..d2b6f1e559d2b5bfc333b60359f5b1e56e9aaadb --- /dev/null +++ b/paddle/phi/kernels/funcs/tril_triu_compute.h @@ -0,0 +1,48 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace phi { +namespace funcs { + +template +class TrilTriuCompute { + public: + HOSTDEVICE TrilTriuCompute(const T* in, + const int diagonal, + const bool lower, + const int64_t H, + const int64_t W, + T* out) + : in_(in), diagonal_(diagonal), lower_(lower), H_(H), W_(W), out_(out) {} + + HOSTDEVICE void operator()(int64_t idx) { + const int64_t row = (idx / W_) % H_; + const int64_t col = idx % W_; + const bool mask = + lower_ ? (col - row > diagonal_) : (col - row < diagonal_); + out_[idx] = mask ? 
static_cast(0) : in_[idx]; + } + + private: + const T* in_; + const int diagonal_; + const bool lower_; + const int64_t H_; + const int64_t W_; + T* out_; +}; +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/values_vectors_functor.h b/paddle/phi/kernels/funcs/values_vectors_functor.h index b3189fc5cc3c307f04758663250098f384c2c8fc..336e9c809427c68be79bc8eaddd98193462f5405 100644 --- a/paddle/phi/kernels/funcs/values_vectors_functor.h +++ b/paddle/phi/kernels/funcs/values_vectors_functor.h @@ -20,7 +20,6 @@ #endif // PADDLE_WITH_CUDA #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/device_context.h" #include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" #include "paddle/phi/kernels/transpose_kernel.h" diff --git a/paddle/phi/kernels/gather_grad_kernel.h b/paddle/phi/kernels/gather_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..e53da7b471c7b82efef2319915cc57537ee824b5 --- /dev/null +++ b/paddle/phi/kernels/gather_grad_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void GatherGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& out_grad, + const Scalar& axis, + bool overwrite, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/gather_kernel.h b/paddle/phi/kernels/gather_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..78ac09125b69298c59622fc69469ba8d28cae919 --- /dev/null +++ b/paddle/phi/kernels/gather_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
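
The gather pair declared in these new headers follows the usual index-select contract: the forward kernel copies the slices of x selected by index along axis, and the grad kernel routes out_grad back onto those slices. Below is a minimal CPU sketch of the axis-0 case; it is illustrative only (the real kernels are templated over dtype and axis, and the overwrite flag in GatherGradKernel presumably chooses between overwriting and accumulating when an index repeats).

#include <cstdint>
#include <vector>

using Matrix = std::vector<std::vector<float>>;

// Forward (axis = 0): out[i] = x[index[i]].
Matrix GatherAxis0(const Matrix& x, const std::vector<int64_t>& index) {
  Matrix out;
  out.reserve(index.size());
  for (int64_t idx : index) out.push_back(x[idx]);
  return out;
}

// Backward (axis = 0): x_grad[index[i]] += out_grad[i]; rows that are
// gathered more than once accumulate their gradients.
void GatherGradAxis0(const Matrix& out_grad,
                     const std::vector<int64_t>& index,
                     Matrix* x_grad) {
  for (size_t i = 0; i < index.size(); ++i) {
    for (size_t j = 0; j < out_grad[i].size(); ++j) {
      (*x_grad)[index[i]][j] += out_grad[i][j];
    }
  }
}
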
+ +#pragma once + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void GatherKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + const Scalar& axis, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/gather_tree_kernel.h b/paddle/phi/kernels/gather_tree_kernel.h index e5a1a684daef099b5da8e7d9b8469b2857c29a6b..b3e6ffbc4297a2ae6a067e6b1ec5f2f88f7ef2ba 100644 --- a/paddle/phi/kernels/gather_tree_kernel.h +++ b/paddle/phi/kernels/gather_tree_kernel.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/phi/core/dense_tensor.h" + namespace phi { template diff --git a/paddle/phi/kernels/gelu_grad_kernel.h b/paddle/phi/kernels/gelu_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..fd70e8d54bc8d004373efd1874f4b07a9ebde6a8 --- /dev/null +++ b/paddle/phi/kernels/gelu_grad_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES // use M_2_SQRTPI on Windows +#endif + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void GeluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + bool approximate, + DenseTensor* x_grad); +} // namespace phi diff --git a/paddle/phi/kernels/gelu_kernel.h b/paddle/phi/kernels/gelu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..bc106a04031fbcc2a96209e170d60eda8cc7b5e1 --- /dev/null +++ b/paddle/phi/kernels/gelu_kernel.h @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
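
The approximate flag on the GELU kernels declared above selects between the exact erf form and the tanh approximation that uses GELU_CONSTANT (0.044715); the _USE_MATH_DEFINES guard exists so the math constants are available on Windows. A reference sketch of the two formulas, using the standard definitions rather than this patch's CUDA functors:

#ifndef _USE_MATH_DEFINES
#define _USE_MATH_DEFINES  // M_PI / M_SQRT1_2 on Windows, as in the header
#endif
#include <cmath>

// Exact GELU: 0.5 * x * (1 + erf(x / sqrt(2)))
double GeluExact(double x) {
  return 0.5 * x * (1.0 + std::erf(x * M_SQRT1_2));
}

// Tanh approximation: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
double GeluTanhApprox(double x) {
  const double kBeta = std::sqrt(2.0 / M_PI);
  return 0.5 * x * (1.0 + std::tanh(kBeta * (x + 0.044715 * x * x * x)));
}
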
+ +#pragma once + +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES // use M_2_SQRTPI on Windows +#endif + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +#define GELU_CONSTANT 0.044715 + +template +void GeluKernel(const Context& dev_ctx, + const DenseTensor& x, + bool approximate, + DenseTensor* out); +} // namespace phi diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu index c2995c79a7e8c2651ed4aa16d75d59c8f24c96dc..c912d0c4686ff3fee88925f4d7121f38f24a5485 100644 --- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu @@ -73,119 +73,158 @@ void ActivationGradGPUImpl(const Context& dev_ctx, } } -#define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(name, functor_class) \ +#define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(name, functor_class) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& x, \ const DenseTensor& dout, \ DenseTensor* dx) { \ - functor_class functor; \ - ActivationGradGPUImpl( \ + funcs::functor_class functor; \ + ActivationGradGPUImpl>( \ dev_ctx, &x, nullptr, &dout, dx, functor); \ } -#define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(name, functor_class) \ +#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX( \ + name, functor_class, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationGradGPUImpl>( \ + dev_ctx, &x, nullptr, &dout, dx, functor); \ + } + +#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX( \ + name, functor_class, attr1, attr2) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + float attr1, \ + float attr2, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationGradGPUImpl>( \ + dev_ctx, &x, nullptr, &dout, dx, functor); \ + } + +#define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(name, functor_class) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& out, \ const DenseTensor& dout, \ DenseTensor* dx) { \ - functor_class functor; \ - ActivationGradGPUImpl( \ + funcs::functor_class functor; \ + ActivationGradGPUImpl>( \ dev_ctx, nullptr, &out, &dout, dx, functor); \ } -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, funcs::CudaReluGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, funcs::CudaCosGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, funcs::CudaTanGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, funcs::CudaAcosGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, funcs::CudaSinGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, funcs::CudaAsinGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, funcs::CudaAtanGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, funcs::CudaSinhGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, funcs::CudaCoshGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, funcs::CudaAsinhGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, funcs::CudaAcoshGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, funcs::CudaAtanhGradFunctor); +#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT( \ + name, functor_class, attr) \ + template \ + void 
name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationGradGPUImpl>( \ + dev_ctx, nullptr, &out, &dout, dx, functor); \ + } + +#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT( \ + name, functor_class, attr1, attr2) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + float attr1, \ + float attr2, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationGradGPUImpl>( \ + dev_ctx, nullptr, &out, &dout, dx, functor); \ + } + +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, CudaReluGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, CudaTanhGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid, CudaSigmoidGradFunctor); + +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Cos, CudaCosGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Tan, CudaTanGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Acos, CudaAcosGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Sin, CudaSinGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Asin, CudaAsinGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Atan, CudaAtanGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Sinh, CudaSinhGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Cosh, CudaCoshGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Asinh, CudaAsinhGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Acosh, CudaAcoshGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, CudaAtanhGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, CudaTanhShrinkGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, CudaSiluGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, CudaLogSigmoidGradFunctor); + +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu, + CudaLeakyReluGradFunctor, + alpha); +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu, + CudaThresholdedReluGradFunctor, + threshold); +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink, + CudaSoftShrinkGradFunctor, + lambda); +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink, + CudaHardShrinkGradFunctor, + threshold); + +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, + CudaBReluGradFunctor, + t_min, + t_max); + +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid, + CudaHardSigmoidGradFunctor, + slope, + offset); + +template +void EluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + float alpha, + DenseTensor* dx) { + dev_ctx.template Alloc(dx); + std::vector ins = {&dout, &out}; + std::vector outs = {dx}; + if (alpha > 0) { + funcs::CudaELUGradFunctor functor; + functor.alpha = alpha; + funcs::ElementwiseKernel(dev_ctx, ins, &outs, functor); + } else { + funcs::CudaELUGradNegativeAlphaFunctor functor; + functor.alpha = alpha; + ins.push_back(&x); + funcs::ElementwiseKernel(dev_ctx, ins, &outs, functor); + } +} } // namespace phi -PD_REGISTER_KERNEL(cos_grad, - GPU, - ALL_LAYOUT, - phi::CosGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(tan_grad, - GPU, - ALL_LAYOUT, - phi::TanGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(acos_grad, - GPU, - ALL_LAYOUT, - phi::AcosGradKernel, - float, 
- double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(sin_grad, - GPU, - ALL_LAYOUT, - phi::SinGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(asin_grad, - GPU, - ALL_LAYOUT, - phi::AsinGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(atan_grad, - GPU, - ALL_LAYOUT, - phi::AtanGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(sinh_grad, - GPU, - ALL_LAYOUT, - phi::SinhGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(cosh_grad, - GPU, - ALL_LAYOUT, - phi::CoshGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(asinh_grad, - GPU, - ALL_LAYOUT, - phi::AsinhGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(acosh_grad, - GPU, - ALL_LAYOUT, - phi::AcoshGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(atanh_grad, - GPU, - ALL_LAYOUT, - phi::AtanhGradKernel, - float, - double, - phi::dtype::float16) {} + #ifdef PADDLE_WITH_HIP PD_REGISTER_KERNEL(relu_grad, GPU, @@ -219,3 +258,45 @@ PD_REGISTER_KERNEL(relu_double_grad, phi::dtype::float16, phi::dtype::bfloat16) {} #endif + +#define PD_REGISTER_ACTIVATION_GRAD_KERNEL(name, func) \ + PD_REGISTER_KERNEL(name, \ + GPU, \ + ALL_LAYOUT, \ + phi::func, \ + float, \ + double, \ + phi::dtype::float16, \ + phi::dtype::bfloat16) {} + +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sin_grad, SinGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(cos_grad, CosGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(tan_grad, TanGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(acos_grad, AcosGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(asin_grad, AsinGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(atan_grad, AtanGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sinh_grad, SinhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(cosh_grad, CoshGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(asinh_grad, AsinhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(acosh_grad, AcoshGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(atanh_grad, AtanhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_grad, TanhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_double_grad, TanhDoubleGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_triple_grad, TanhTripleGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(brelu_grad, BReluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_grad, LeakyReluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_double_grad, + LeakyReluDoubleGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad, + ThresholdedReluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(soft_shrink_grad, SoftShrinkGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_shrink_grad, HardShrinkGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_shrink_grad, TanhShrinkGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(silu_grad, SiluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_double_grad, EluDoubleGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_sigmoid_grad, HardSigmoidGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(logsigmoid_grad, LogSigmoidGradKernel) diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu index 
26752b89e7c345f88cdbe2000b119c07507d2c37..6b598c764debb059072ba3ae3ac90e6985479133 100644 --- a/paddle/phi/kernels/gpu/activation_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_kernel.cu @@ -38,26 +38,77 @@ void ActivationGPUImpl(const Context& dev_ctx, funcs::ElementwiseKernel(dev_ctx, ins, &outs, functor); } -#define DEFINE_GPU_ACTIVATION_KERNEL(name, functor_class) \ - template \ - void name##Kernel( \ - const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { \ - functor_class functor; \ - ActivationGPUImpl(dev_ctx, x, out, functor); \ +#define DEFINE_GPU_ACTIVATION_KERNEL(name, functor_class) \ + template \ + void name##Kernel( \ + const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { \ + funcs::functor_class functor; \ + ActivationGPUImpl>( \ + dev_ctx, x, out, functor); \ } -DEFINE_GPU_ACTIVATION_KERNEL(Cos, funcs::CudaCosFunctor) -DEFINE_GPU_ACTIVATION_KERNEL(Tan, funcs::CudaTanFunctor) -DEFINE_GPU_ACTIVATION_KERNEL(Acos, funcs::CudaAcosFunctor) -DEFINE_GPU_ACTIVATION_KERNEL(Sin, funcs::CudaSinFunctor) -DEFINE_GPU_ACTIVATION_KERNEL(Asin, funcs::CudaAsinFunctor) -DEFINE_GPU_ACTIVATION_KERNEL(Atan, funcs::CudaAtanFunctor) -DEFINE_GPU_ACTIVATION_KERNEL(Sinh, funcs::CudaSinhFunctor) -DEFINE_GPU_ACTIVATION_KERNEL(Cosh, funcs::CudaCoshFunctor) -DEFINE_GPU_ACTIVATION_KERNEL(Asinh, funcs::CudaAsinhFunctor) -DEFINE_GPU_ACTIVATION_KERNEL(Acosh, funcs::CudaAcoshFunctor) -DEFINE_GPU_ACTIVATION_KERNEL(Atanh, funcs::CudaAtanhFunctor) -DEFINE_GPU_ACTIVATION_KERNEL(Relu, funcs::CudaReluFunctor) +#define DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + float attr, \ + DenseTensor* out) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationGPUImpl>( \ + dev_ctx, x, out, functor); \ + } + +#define DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS( \ + name, functor_class, attr1, attr2) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + float attr1, \ + float attr2, \ + DenseTensor* out) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationGPUImpl>( \ + dev_ctx, x, out, functor); \ + } + +DEFINE_GPU_ACTIVATION_KERNEL(Cos, CudaCosFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Tan, CudaTanFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Acos, CudaAcosFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Sin, CudaSinFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Asin, CudaAsinFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Atan, CudaAtanFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Sinh, CudaSinhFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Cosh, CudaCoshFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Asinh, CudaAsinhFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Acosh, CudaAcoshFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Atanh, CudaAtanhFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Relu, CudaReluFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Tanh, CudaTanhFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(TanhShrink, CudaTanhShrinkFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Silu, CudaSiluFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Sigmoid, CudaSigmoidFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(LogSigmoid, CudaLogSigmoidFunctor) + +DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, CudaLeakyReluFunctor, alpha) +DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, + CudaThresholdedReluFunctor, + threshold) +DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(HardShrink, + CudaHardShrinkFunctor, + threshold) 
+DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, CudaSoftShrinkFunctor, lambda) +DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, CudaELUFunctor, alpha) + +DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, CudaBReluFunctor, t_min, t_max) +DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid, + CudaHardSigmoidFunctor, + slope, + offset) } // namespace phi @@ -79,65 +130,37 @@ PD_REGISTER_KERNEL(relu, phi::dtype::float16, phi::dtype::bfloat16) {} #endif -PD_REGISTER_KERNEL( - sin, GPU, ALL_LAYOUT, phi::SinKernel, float, double, phi::dtype::float16) {} -PD_REGISTER_KERNEL( - cos, GPU, ALL_LAYOUT, phi::CosKernel, float, double, phi::dtype::float16) {} -PD_REGISTER_KERNEL( - tan, GPU, ALL_LAYOUT, phi::TanKernel, float, double, phi::dtype::float16) {} -PD_REGISTER_KERNEL(acos, - GPU, - ALL_LAYOUT, - phi::AcosKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(asin, - GPU, - ALL_LAYOUT, - phi::AsinKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(atan, - GPU, - ALL_LAYOUT, - phi::AtanKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(sinh, - GPU, - ALL_LAYOUT, - phi::SinhKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(cosh, - GPU, - ALL_LAYOUT, - phi::CoshKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(asinh, - GPU, - ALL_LAYOUT, - phi::AsinhKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(acosh, - GPU, - ALL_LAYOUT, - phi::AcoshKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(atanh, - GPU, - ALL_LAYOUT, - phi::AtanhKernel, - float, - double, - phi::dtype::float16) {} + +#define PD_REGISTER_ACTIVATION_KERNEL(name, func) \ + PD_REGISTER_KERNEL(name, \ + GPU, \ + ALL_LAYOUT, \ + phi::func, \ + float, \ + double, \ + phi::dtype::float16, \ + phi::dtype::bfloat16) {} + +PD_REGISTER_ACTIVATION_KERNEL(sin, SinKernel) +PD_REGISTER_ACTIVATION_KERNEL(cos, CosKernel) +PD_REGISTER_ACTIVATION_KERNEL(tan, TanKernel) +PD_REGISTER_ACTIVATION_KERNEL(acos, AcosKernel) +PD_REGISTER_ACTIVATION_KERNEL(asin, AsinKernel) +PD_REGISTER_ACTIVATION_KERNEL(atan, AtanKernel) +PD_REGISTER_ACTIVATION_KERNEL(sinh, SinhKernel) +PD_REGISTER_ACTIVATION_KERNEL(cosh, CoshKernel) +PD_REGISTER_ACTIVATION_KERNEL(asinh, AsinhKernel) +PD_REGISTER_ACTIVATION_KERNEL(acosh, AcoshKernel) +PD_REGISTER_ACTIVATION_KERNEL(atanh, AtanhKernel) +PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel) +PD_REGISTER_ACTIVATION_KERNEL(brelu, BReluKernel) +PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel) +PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel) +PD_REGISTER_ACTIVATION_KERNEL(hard_shrink, HardShrinkKernel) +PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel) +PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel) +PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel) +PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel) +PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel) +PD_REGISTER_ACTIVATION_KERNEL(logsigmoid, LogSigmoidKernel) +PD_REGISTER_ACTIVATION_KERNEL(hard_sigmoid, HardSigmoidKernel) diff --git a/paddle/phi/kernels/gpu/allclose_kernel.cu b/paddle/phi/kernels/gpu/allclose_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..af2612bb10c9fe108a471253ff87f2a686059c2a --- /dev/null +++ b/paddle/phi/kernels/gpu/allclose_kernel.cu @@ -0,0 +1,89 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
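
Each PD_REGISTER_ACTIVATION_KERNEL(name, func) line above is shorthand for the full registration spelled out by the macro defined in this file; for example (expansion shown for illustration only, not additional patch code), the sin registration is equivalent to:

PD_REGISTER_KERNEL(sin,
                   GPU,
                   ALL_LAYOUT,
                   phi::SinKernel,
                   float,
                   double,
                   phi::dtype::float16,
                   phi::dtype::bfloat16) {}
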
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/allclose_kernel.h" + +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +__global__ void AllcloseCUDAKernel(const T* in_data, + const T* other_data, + const double rtol, + const double atol, + bool equal_nan, + int num, + bool* out_data) { + unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; + bool val; + for (int i = idx; i < num; i += blockDim.x * gridDim.x) { + const T a = in_data[i], b = other_data[i]; + if (isnan(a) || isnan(b)) { + val = equal_nan && isnan(a) == isnan(b); + } else { + T left = (a > b ? a - b : b - a); + T right = atol + (b > 0 ? rtol * b : (-rtol) * b); + T diff = (left > right ? left - right : right - left); + val = a == b || left <= right || diff <= 1e-15; + } + if (!val) *out_data = false; + } +} + +template +void AllCloseKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const Scalar& rtol, + const Scalar& atol, + bool equal_nan, + DenseTensor* out) { + PADDLE_ENFORCE_EQ( + rtol.dtype(), + DataType::FLOAT64, + phi::errors::InvalidArgument( + "Input (Rtol) type must be double, but get %s.", rtol.dtype())); + PADDLE_ENFORCE_EQ( + atol.dtype(), + DataType::FLOAT64, + phi::errors::InvalidArgument( + "Input (Atol) type must be double, but get %s.", atol.dtype())); + + const T* in_data = x.data(); + const T* other_data = y.data(); + auto rtol_v = rtol.to(); + auto atol_v = atol.to(); + bool* out_data = dev_ctx.template Alloc(out); + + int num = x.numel(); + int block = 1024; + int grid = (block - 1 + num) / block; + grid = (grid > block) ? block : grid; +#ifdef PADDLE_WITH_HIP + hipMemset(out_data, true, sizeof(bool)); +#else + cudaMemset(out_data, true, sizeof(bool)); +#endif + AllcloseCUDAKernel<<>>( + in_data, other_data, rtol_v, atol_v, equal_nan, num, out_data); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + allclose, GPU, ALL_LAYOUT, phi::AllCloseKernel, float, double) { + kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); +} diff --git a/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu b/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..6e8712462928d43077766a2fd03aee51d9a4cd8c --- /dev/null +++ b/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu @@ -0,0 +1,320 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
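
The allclose kernel added above evaluates, per element, the usual NumPy-style closeness test. A host-side sketch of that predicate (ignoring the kernel's extra 1e-15 slack term), shown only to document the semantics:

#include <cmath>

// Two values compare "close" when |a - b| <= atol + rtol * |b|;
// NaNs only match each other when equal_nan is set.
bool IsClose(double a, double b, double rtol, double atol, bool equal_nan) {
  if (std::isnan(a) || std::isnan(b)) {
    return equal_nan && std::isnan(a) && std::isnan(b);
  }
  return std::fabs(a - b) <= atol + rtol * std::fabs(b);
}
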
+ +#include "paddle/phi/kernels/cumprod_grad_kernel.h" + +#include +#include "paddle/fluid/operators/math/inclusive_scan.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/cumprod.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/funcs/for_range.h" +// NOTE(@xiongkun): use of IsComplex<> +#include "paddle/fluid/framework/data_type.h" + +namespace phi { + +template +struct CumprodGradFunctorExceptFirstZero { + HOSTDEVICE CumprodGradFunctorExceptFirstZero( + const T *x, + const T *y, + const T *dy_mul_y_reversed_cumsum, + const uint8_t *zero_mask, + size_t mid_dim, + size_t inner_dim, + T *dx, + int64_t *first_zero_idx, + T *x_filled_one) + : x_(x), + y_(y), + dy_mul_y_reversed_cumsum_(dy_mul_y_reversed_cumsum), + zero_mask_(zero_mask), + mid_dim_(mid_dim), + inner_dim_(inner_dim), + dx_(dx), + first_zero_idx_(first_zero_idx), + x_filled_one_(x_filled_one) {} + + HOSTDEVICE void operator()(size_t idx) const { + auto inner_idx = idx % inner_dim_; + auto outer_idx = idx / (mid_dim_ * inner_dim_); + auto mid_idx = (idx - inner_idx) / inner_dim_ % mid_dim_; + auto mask = zero_mask_[idx]; + bool should_fill_one = true; + + if (mask == 0) { + dx_[idx] = dy_mul_y_reversed_cumsum_[idx] / x_[idx]; + if (mid_idx == mid_dim_ - 1) { + // record first zero position as -1, i.e., no zero + first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = -1; + } + } else if (mid_idx > 0) { // mask > 0 + if (zero_mask_[idx - inner_dim_] > 0) { // not first zero + dx_[idx] = 0; + should_fill_one = false; + } else { + // idx is the first zero position, it should be recorded + dx_[idx] = y_[idx - inner_dim_]; + first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = mid_idx; + } + } else { // the first zero position is index 0 + dx_[idx] = 1; + first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = 0; + } + + x_filled_one_[idx] = should_fill_one ? 
1 : x_[idx]; + } + + private: + const T *x_; + const T *y_; + const T *dy_mul_y_reversed_cumsum_; + const uint8_t *zero_mask_; + size_t mid_dim_; + size_t inner_dim_; + T *dx_; + int64_t *first_zero_idx_; + T *x_filled_one_; +}; + +template +struct FillFirstZeroPositionGradFunctor { + HOSTDEVICE FillFirstZeroPositionGradFunctor(const int64_t *first_zero_idx, + const T *grad_value, + size_t mid_dim, + size_t inner_dim, + T *dx) + : first_zero_idx_(first_zero_idx), + grad_value_(grad_value), + mid_dim_(mid_dim), + inner_dim_(inner_dim), + dx_(dx) {} + + HOSTDEVICE void operator()(size_t idx) const { + auto outer_idx = idx / inner_dim_; + auto inner_idx = idx % inner_dim_; + auto mid_idx = first_zero_idx_[idx]; + if (mid_idx >= 0) { + auto full_idx = + outer_idx * mid_dim_ * inner_dim_ + mid_idx * inner_dim_ + inner_idx; + dx_[full_idx] *= grad_value_[full_idx]; + } + } + + private: + const int64_t *first_zero_idx_; + const T *grad_value_; + size_t mid_dim_; + size_t inner_dim_; + T *dx_; +}; + +template +void CumprodGradKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &out, + const DenseTensor &dout, + int dim, + DenseTensor *dx) { + const auto *y = &out; + const auto *dy = &dout; + + size_t outer_dim, mid_dim, inner_dim; + GetCumprodDimInfo(x.dims(), dim, &outer_dim, &mid_dim, &inner_dim); + if (outer_dim == 0 || mid_dim == 0 || inner_dim == 0) return; + + size_t numel = outer_dim * mid_dim * inner_dim; + + const auto *x_data = x.data(); + const auto *y_data = y->data(); + const auto *dy_data = dy->data(); + + auto place = dev_ctx.GetPlace(); + auto *dx_data = dev_ctx.template Alloc(dx); + + // deal with complex + const T *x_data_deal; + const T *y_data_deal; + Allocator::AllocationPtr x_conj; + Allocator::AllocationPtr y_conj; + if (paddle::framework::IsComplex::value) { + x_conj = const_cast(dev_ctx.GetAllocator()) + .Allocate(numel * sizeof(T)); + auto *x_data_conj = reinterpret_cast(x_conj->ptr()); + y_conj = const_cast(dev_ctx.GetAllocator()) + .Allocate(numel * sizeof(T)); + auto *y_data_conj = reinterpret_cast(y_conj->ptr()); + + phi::funcs::ForRange for_range_x(dev_ctx, numel); + phi::funcs::ConjFunctor functor_x(x_data, numel, x_data_conj); + for_range_x(functor_x); + + phi::funcs::ForRange for_range_y(dev_ctx, numel); + phi::funcs::ConjFunctor functor_y(y_data, numel, y_data_conj); + for_range_y(functor_y); + x_data_deal = x_data_conj; + y_data_deal = y_data_conj; + } else { + x_data_deal = x_data; + y_data_deal = y_data; + } + +// Step 1: find cummax-ed zero mask of x +#ifdef PADDLE_WITH_CUDA + const auto &exec_policy = thrust::cuda::par.on(dev_ctx.stream()); +#else + const auto &exec_policy = thrust::hip::par.on(dev_ctx.stream()); +#endif + auto zero_mask_without_cummax = + const_cast(dev_ctx.GetAllocator()) + .Allocate(numel * sizeof(uint8_t)); + auto *zero_mask_without_cummax_data = + reinterpret_cast(zero_mask_without_cummax->ptr()); + thrust::transform(exec_policy, + thrust::device_pointer_cast(x_data_deal), + thrust::device_pointer_cast(x_data_deal) + numel, + thrust::device_pointer_cast(zero_mask_without_cummax_data), + funcs::IsZeroFunctor()); + + auto zero_mask = const_cast(dev_ctx.GetAllocator()) + .Allocate(numel * sizeof(uint8_t)); + auto *zero_mask_data = reinterpret_cast(zero_mask->ptr()); + paddle::operators::math::InclusiveScan( + zero_mask_without_cummax_data, + zero_mask_data, + outer_dim, + mid_dim, + inner_dim, + static_cast(0), + cub::Max(), + /*reverse=*/false, + dev_ctx); + zero_mask_without_cummax = nullptr; + + // Step 2: 
calculate reversed cumsum(dy * y) + auto dy_mul_y = const_cast(dev_ctx.GetAllocator()) + .Allocate(numel * sizeof(T)); + auto *dy_mul_y_data = reinterpret_cast(dy_mul_y->ptr()); + thrust::transform(exec_policy, + thrust::device_pointer_cast(dy_data), + thrust::device_pointer_cast(dy_data) + numel, + thrust::device_pointer_cast(y_data_deal), + thrust::device_pointer_cast(dy_mul_y_data), + funcs::MultiplyFunctor()); + + auto dy_mul_y_reversed_cumsum = + const_cast(dev_ctx.GetAllocator()) + .Allocate(numel * sizeof(T)); + auto *dy_mul_y_reversed_cumsum_data = + reinterpret_cast(dy_mul_y_reversed_cumsum->ptr()); + paddle::operators::math::InclusiveScan( + dy_mul_y_data, + dy_mul_y_reversed_cumsum_data, + outer_dim, + mid_dim, + inner_dim, + static_cast(0), + cub::Sum(), + /*reverse=*/true, + dev_ctx); + + // Step 3: calculate the gradient value except the first zero position. + // The gradient value of the first zero position is filled with out[idx-1], + // while the gradient value of the other positions are calculated out + // completely. This functor also: + // (1) find the first zero index, i.e., first_zero_idx_data. + // (2) fill x_filled_one, which satifies + // x_filled_one[i] = x[i], i > pos + // x_filled_one[i] = 1, i <= pos + auto first_zero_idx = const_cast(dev_ctx.GetAllocator()) + .Allocate(numel * sizeof(int64_t)); + auto *first_zero_idx_data = + reinterpret_cast(first_zero_idx->ptr()); + auto *x_filled_one_data = dy_mul_y_data; // reuse former allocated memory + phi::funcs::ForRange for_range(dev_ctx, numel); + CumprodGradFunctorExceptFirstZero functor_except_first_zero( + x_data_deal, + y_data_deal, + dy_mul_y_reversed_cumsum_data, + zero_mask_data, + mid_dim, + inner_dim, + dx_data, + first_zero_idx_data, + x_filled_one_data); + for_range(functor_except_first_zero); + + // Step 4: calculate cumprod of x_filled_one + auto *x_filled_one_cumprod_data = + dy_mul_y_reversed_cumsum_data; // reuse former allocated memory + paddle::operators::math::InclusiveScan>( + x_filled_one_data, + x_filled_one_cumprod_data, + outer_dim, + mid_dim, + inner_dim, + static_cast(1), + funcs::MultiplyFunctor(), + /*reverse=*/false, + dev_ctx); + + // Step 5: calculate reversed cumsum(dy * x_filled_one_cumprod) + auto *dy_mul_x_filled_one_cumprod = + dy_mul_y_data; // reuse former allocated memory + thrust::transform(exec_policy, + thrust::device_pointer_cast(dy_data), + thrust::device_pointer_cast(dy_data) + numel, + thrust::device_pointer_cast(x_filled_one_cumprod_data), + thrust::device_pointer_cast(dy_mul_x_filled_one_cumprod), + funcs::MultiplyFunctor()); + auto *dy_mul_x_filled_one_cumprod_reversed_cumsum = + dy_mul_y_reversed_cumsum_data; // reuse former allocated memory + paddle::operators::math::InclusiveScan( + dy_mul_x_filled_one_cumprod, + dy_mul_x_filled_one_cumprod_reversed_cumsum, + outer_dim, + mid_dim, + inner_dim, + static_cast(0), + cub::Sum(), + /*reverse=*/true, + dev_ctx); + + // Step 6: fill zero pos gradient value + phi::funcs::ForRange for_range_fill_zero_pos_grad( + dev_ctx, outer_dim * inner_dim); + FillFirstZeroPositionGradFunctor fill_first_zero_pos_grad_functor( + first_zero_idx_data, + dy_mul_x_filled_one_cumprod_reversed_cumsum, + mid_dim, + inner_dim, + dx_data); + for_range_fill_zero_pos_grad(fill_first_zero_pos_grad_functor); +} + +} // namespace phi + +PD_REGISTER_KERNEL(cumprod_grad, + GPU, + ALL_LAYOUT, + phi::CumprodGradKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/cumprod_kernel.cu 
b/paddle/phi/kernels/gpu/cumprod_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..1bbf8972a24798e5ddebf4dac2b3745eb1a2aee0 --- /dev/null +++ b/paddle/phi/kernels/gpu/cumprod_kernel.cu @@ -0,0 +1,60 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/cumprod_kernel.h" + +#include "paddle/fluid/operators/math/inclusive_scan.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/cumprod.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" + +namespace phi { + +template +void CumprodKernel(const Context &dev_ctx, + const DenseTensor &input, + int dim, + DenseTensor *out) { + const auto *x = &input; + auto *y = out; + size_t outer_dim, mid_dim, inner_dim; + GetCumprodDimInfo(x->dims(), dim, &outer_dim, &mid_dim, &inner_dim); + + const auto *x_data = x->data(); + auto *y_data = dev_ctx.template Alloc(y); + paddle::operators::math::InclusiveScan(x_data, + y_data, + outer_dim, + mid_dim, + inner_dim, + static_cast(1), + funcs::MultiplyFunctor(), + /*reverse=*/false, + dev_ctx); +} + +} // namespace phi + +PD_REGISTER_KERNEL(cumprod, + GPU, + ALL_LAYOUT, + phi::CumprodKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/deformable_conv_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..1db6e1b7cf73375f2617c727a26e5768922777d4 --- /dev/null +++ b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu @@ -0,0 +1,160 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
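// A minimal host-side reference sketch (illustrative, not part of the original
// files) of what the inclusive multiply-scan in CumprodKernel above computes,
// assuming the usual row-major (outer_dim, mid_dim, inner_dim) decomposition
// produced by GetCumprodDimInfo, with the cumulative product running along the
// mid dimension.
template <typename T>
void CumprodCpuReference(const T* x, T* y, int outer_dim, int mid_dim,
                         int inner_dim) {
  for (int o = 0; o < outer_dim; ++o) {
    for (int i = 0; i < inner_dim; ++i) {
      T acc = static_cast<T>(1);
      for (int m = 0; m < mid_dim; ++m) {
        // element (o, m, i); the scan stride along mid is inner_dim
        const int idx = (o * mid_dim + m) * inner_dim + i;
        acc = acc * x[idx];
        y[idx] = acc;
      }
    }
  }
}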
+ +#include "paddle/phi/kernels/deformable_conv_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/impl/deformable_conv_kernel_impl.h" + +namespace phi { + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaximumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaximumNumBlocks); +} + +template +__global__ void ModulatedDeformableIm2colGpuKernel( + const int nthreads, + const T* data_im, + const T* data_offset, + const T* data_mask, + const int height, + const int width, + const int kernel_h, + const int kernel_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, + const int num_channels, + const int deformable_group, + const int height_col, + const int width_col, + T* data_col) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (size_t i = index; i < nthreads; i += offset) { + const int w_col = i % width_col; + const int h_col = (i / width_col) % height_col; + const int b_col = (i / width_col) / height_col % batch_size; + const int c_im = (i / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + + T* data_col_ptr = + data_col + + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + const T* data_im_ptr = + data_im + (b_col * num_channels + c_im) * height * width; + const T* data_offset_ptr = + data_offset + + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * + kernel_w * height_col * width_col; + const T* data_mask_ptr = + data_mask + + (b_col * deformable_group + deformable_group_index) * kernel_h * + kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + + w_col; + const int data_mask_hw_ptr = + ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; + + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + const T mask = data_mask_ptr[data_mask_hw_ptr]; + T val = static_cast(0); + const T h_im = h_in + i * dilation_h + offset_h; + const T w_im = w_in + j * dilation_w + offset_w; + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { + val = + DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val * mask; + data_col_ptr += batch_size * height_col * width_col; + } + } + } +} + +template +void ModulatedDeformableIm2col(const Context& dev_ctx, + const T* data_im, + const T* data_offset, + const T* data_mask, + const std::vector& im_shape, + const std::vector& col_shape, + const std::vector& filter_shape, + const std::vector& paddings, + const std::vector& strides, + const std::vector& dilations, + const int deformable_groups, + T* data_col) { + int channel_per_deformable_group = im_shape[0] / deformable_groups; + int num_kernels = im_shape[0] * 
col_shape[1] * col_shape[2] * col_shape[3]; + + int blocks = NumBlocks(num_kernels); + int threads = kNumCUDAThreads; + + ModulatedDeformableIm2colGpuKernel< + T><<>>(num_kernels, + data_im, + data_offset, + data_mask, + im_shape[1], + im_shape[2], + filter_shape[2], + filter_shape[3], + paddings[0], + paddings[1], + strides[0], + strides[1], + dilations[0], + dilations[1], + channel_per_deformable_group, + col_shape[1], + im_shape[0], + deformable_groups, + col_shape[2], + col_shape[3], + data_col); +} + +} // namespace phi + +PD_REGISTER_KERNEL(deformable_conv, + GPU, + ALL_LAYOUT, + phi::DeformableConvKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/determinant_grad_kernel.cu b/paddle/phi/kernels/gpu/determinant_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..cce12a87fac72f5ac6edbbeb74de9fe3ae9ede09 --- /dev/null +++ b/paddle/phi/kernels/gpu/determinant_grad_kernel.cu @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/determinant_grad_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/determinant_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(determinant_grad, + GPU, + ALL_LAYOUT, + phi::DeterminantGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/determinant_kernel.cu b/paddle/phi/kernels/gpu/determinant_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..25184083873952638a1f84d8d4b66262363ca9c6 --- /dev/null +++ b/paddle/phi/kernels/gpu/determinant_kernel.cu @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/determinant_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/determinant_kernel_impl.h" + +PD_REGISTER_KERNEL( + determinant, GPU, ALL_LAYOUT, phi::DeterminantKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/diag_grad_kernel.cu b/paddle/phi/kernels/gpu/diag_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..65bf837e6cf8a330fbb744c994311d17a7cc6299 --- /dev/null +++ b/paddle/phi/kernels/gpu/diag_grad_kernel.cu @@ -0,0 +1,139 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/diag_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/diag_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +// Extract the diagonal of a matrix 'dout' to a matrix 'dx' +template +__global__ void ExtractDiagonalKernel(const T* dout, + T* dx, + std::ptrdiff_t start, + std::ptrdiff_t dx_length, + const std::ptrdiff_t sumStride, + const std::ptrdiff_t xStride) { + for (std::ptrdiff_t idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < dx_length; + idx += gridDim.x * blockDim.x) { + const std::ptrdiff_t outOffset = start + sumStride * idx; + dx[xStride * idx] = dout[outOffset]; + } +} + +// Paste a vector 'dout' to the diagonal of a matrix 'dx' +template +__global__ void PasteDiagonalKernel(const T* dout, + T* dx, + std::ptrdiff_t start, + std::ptrdiff_t size, + const std::ptrdiff_t sumStride, + const std::ptrdiff_t outStride) { + for (std::ptrdiff_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; + idx += gridDim.x * blockDim.x) { + std::ptrdiff_t xOffset = start + sumStride * idx; + dx[xOffset] = dout[outStride * idx]; + } +} + +template +void DiagGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + int offset, + DenseTensor* x_grad) { + T* dx_data = dev_ctx.template Alloc(x_grad); + auto* dout_data = out_grad.data(); + auto dx_dims = x_grad->dims(); + auto dout_dims = out_grad.dims(); + + auto GetBlockGridSize = [&dev_ctx](int64_t size) { + const int64_t block_size = + std::min(size, static_cast(dev_ctx.GetMaxThreadsPerBlock())); + int64_t max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int64_t max_blocks = + std::max(((max_threads - 1) / block_size + 1), static_cast(1)); + const int64_t grid_size = + std::min(max_blocks, (size + block_size - 1) / block_size); + return std::tuple{block_size, grid_size}; + }; + + if (dx_dims.size() == 1) { + auto dx_length = dx_dims[0]; + auto size = (offset > 0) ? dx_length + offset : dx_length - offset; + int dx_stride = phi::funcs::ComputeStride(0, dx_dims); + if (size > 0) { + auto dout_stride_0 = phi::funcs::ComputeStride(0, dout_dims); + auto dout_stride_1 = phi::funcs::ComputeStride(1, dout_dims); + auto start = + (offset >= 0 ? 
offset * dout_stride_1 : -offset * dout_stride_0); + + std::tuple block_grid_size = GetBlockGridSize(size); + ExtractDiagonalKernel<<(block_grid_size), + std::get<0>(block_grid_size), + 0, + dev_ctx.stream()>>>( + dout_data, + dx_data, + start, + dx_length, + dout_stride_0 + dout_stride_1, + dx_stride); + } + } else { + phi::funcs::SetConstant set_padding_value; + set_padding_value(dev_ctx, x_grad, static_cast(0)); + + int dx_stride_0 = phi::funcs::ComputeStride(0, dx_dims); + int dx_stride_1 = phi::funcs::ComputeStride(1, dx_dims); + int64_t size; + if (offset > 0) { + size = std::min(dx_dims[0], dx_dims[1] - offset); + } else { + size = std::min(dx_dims[0] + offset, dx_dims[1]); + } + + if (size > 0) { + auto start = (offset >= 0 ? offset * dx_stride_1 : -offset * dx_stride_0); + auto dout_stride_0 = phi::funcs::ComputeStride(0, dout_dims); + std::tuple block_grid_size = GetBlockGridSize(size); + PasteDiagonalKernel<<(block_grid_size), + std::get<0>(block_grid_size), + 0, + dev_ctx.stream()>>>(dout_data, + dx_data, + start, + size, + dx_stride_0 + dx_stride_1, + dout_stride_0); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(diag_grad, + GPU, + ALL_LAYOUT, + phi::DiagGradKernel, + phi::dtype::float16, + int, + int64_t, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/diag_kernel.cu b/paddle/phi/kernels/gpu/diag_kernel.cu index fc70639787173d84b69262245dbb0500aa179a90..95d3d3365d91be61013e2016d06334f0498d866a 100644 --- a/paddle/phi/kernels/gpu/diag_kernel.cu +++ b/paddle/phi/kernels/gpu/diag_kernel.cu @@ -130,5 +130,12 @@ void DiagKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_KERNEL( - diag, GPU, ALL_LAYOUT, phi::DiagKernel, int, int64_t, float, double) {} +PD_REGISTER_KERNEL(diag, + GPU, + ALL_LAYOUT, + phi::DiagKernel, + phi::dtype::float16, + int, + int64_t, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/eigh_grad_kernel.cu b/paddle/phi/kernels/gpu/eigh_grad_kernel.cu index fdf61dc73991d84d4b38ddd214e1abf80cb2798e..5e33966055ea07d9b70227a5ed4760ad3b21e1a8 100644 --- a/paddle/phi/kernels/gpu/eigh_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/eigh_grad_kernel.cu @@ -15,7 +15,6 @@ #include "paddle/phi/kernels/eigh_grad_kernel.h" #include "paddle/phi/kernels/impl/eigh_grad_kernel_impl.h" -#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/complex_functors.h" diff --git a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu index 81f7fac10880325e152f37b5d4ab783ae93a279c..c4481bf6ce3c33ea260d774d0ac240a166856388 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu @@ -282,3 +282,20 @@ PD_REGISTER_KERNEL(multiply_triple_grad, phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} +PD_REGISTER_KERNEL(elementwise_fmax_grad, + GPU, + ALL_LAYOUT, + phi::ElementwiseFMaxGradKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL(elementwise_fmin_grad, + GPU, + ALL_LAYOUT, + phi::ElementwiseFMinGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/math_kernel.cu b/paddle/phi/kernels/gpu/elementwise_kernel.cu similarity index 60% rename from paddle/phi/kernels/gpu/math_kernel.cu rename to paddle/phi/kernels/gpu/elementwise_kernel.cu index af9d5574aa9feaf4d44482bbf0e75f31a5139595..a57d89013f921e3adb5587c70b7bbb12c383de61 100644 --- a/paddle/phi/kernels/gpu/math_kernel.cu +++ 
b/paddle/phi/kernels/gpu/elementwise_kernel.cu @@ -1,37 +1,22 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/kernels/math_kernel.h" +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/kernels/funcs/broadcast_function.h" -#include "paddle/phi/kernels/funcs/elementwise_functor.h" -#include "paddle/phi/kernels/gpu/reduce.h" - -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif - #include "paddle/phi/common/complex.h" #include "paddle/phi/common/float16.h" -#include "paddle/phi/core/compat/convert_utils.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" namespace phi { @@ -56,30 +41,6 @@ namespace phi { * Kernels */ -template -void MeanRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -template -void SumRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType out_dtype, - DenseTensor* out) { - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - // Create the definition of Add DEFINE_CUDA_ELEMENTWISE_OP(Add) // Create the definition of Subtract @@ -96,6 +57,24 @@ using bfloat16 = phi::dtype::bfloat16; using complex64 = ::phi::dtype::complex; using complex128 = ::phi::dtype::complex; +PD_REGISTER_KERNEL(elementwise_fmax, + GPU, + ALL_LAYOUT, + phi::ElementwiseFMaxKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL(elementwise_fmin, + GPU, + ALL_LAYOUT, + phi::ElementwiseFMinKernel, + float, + double, + int, + int64_t) {} + PD_REGISTER_KERNEL(add_raw, GPU, ALL_LAYOUT, @@ -147,30 +126,3 @@ PD_REGISTER_KERNEL(multiply_raw, complex64, complex128, bfloat16) {} -PD_REGISTER_KERNEL(sum_raw, - GPU, - ALL_LAYOUT, - phi::SumRawKernel, - bool, - float, - double, - float16, - bfloat16, - int16_t, - int, - int64_t, - complex64, - complex128) { - kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); -} - -PD_REGISTER_KERNEL(mean_raw, - GPU, - ALL_LAYOUT, - phi::MeanRawKernel, - float, - double, - bool, - 
float16, - int, - int64_t) {} diff --git a/paddle/phi/kernels/gpu/gather_grad_kernel.cu b/paddle/phi/kernels/gpu/gather_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..04149a2f9ee41e797a66eedcb2d797fb87519041 --- /dev/null +++ b/paddle/phi/kernels/gpu/gather_grad_kernel.cu @@ -0,0 +1,73 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/gather_kernel.h" + +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" +#include "paddle/phi/kernels/funcs/scatter.cu.h" + +namespace phi { + +template +void GatherGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& out_grad, + const Scalar& axis, + bool overwrite, + DenseTensor* x_grad) { + const auto& index_type = index.dtype(); + auto axis_v = axis.to(); + + if (axis_v != 0) { + if (index_type == DataType::INT32) { + phi::funcs::GatherV2GradCUDAFunction( + &out_grad, &index, axis_v, x_grad, dev_ctx); + } else if (index_type == DataType::INT64) { + phi::funcs::GatherV2GradCUDAFunction( + &out_grad, &index, axis_v, x_grad, dev_ctx); + } + return; + } + + dev_ctx.template Alloc(x_grad); + auto dxt = EigenVector::Flatten(*x_grad); + auto& place = *dev_ctx.eigen_device(); + dxt.device(place) = dxt.constant(static_cast(0)); + if (out_grad.numel() == 0) return; + if (index_type == DataType::INT32) { + phi::funcs::GPUScatterAssign( + dev_ctx, out_grad, index, x_grad, overwrite); + } else if (index_type == DataType::INT64) { + phi::funcs::GPUScatterAssign( + dev_ctx, out_grad, index, x_grad, overwrite); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(gather_grad, + GPU, + ALL_LAYOUT, + phi::GatherGradKernel, + float, + double, + int64_t, + int, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/gather_kernel.cu b/paddle/phi/kernels/gpu/gather_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..7e0c6cc168564e94c5af2e26a8f9ba4acc0594ed --- /dev/null +++ b/paddle/phi/kernels/gpu/gather_kernel.cu @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
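// A minimal CPU sketch (illustrative, not part of the original files) of the
// axis == 0 backward path in GatherGradKernel above: x_grad is zero-filled and
// every slice of out_grad is scattered back to the row given by index[i]. The
// overwrite/accumulate split is a simplification of what GPUScatterAssign does;
// slice_size is assumed to be the product of all non-indexed dimensions.
template <typename T, typename IndexT>
void GatherGradCpuReference(const T* out_grad, const IndexT* index,
                            int index_num, int slice_size, T* x_grad,
                            int x_rows, bool overwrite) {
  for (int e = 0; e < x_rows * slice_size; ++e) x_grad[e] = static_cast<T>(0);
  for (int i = 0; i < index_num; ++i) {
    const IndexT row = index[i];
    for (int j = 0; j < slice_size; ++j) {
      if (overwrite) {
        x_grad[row * slice_size + j] = out_grad[i * slice_size + j];
      } else {
        x_grad[row * slice_size + j] += out_grad[i * slice_size + j];
      }
    }
  }
}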
+ +#include "paddle/phi/kernels/gather_kernel.h" + +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" + +namespace phi { + +template +void GatherKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + const Scalar& axis, + DenseTensor* out) { + const auto& index_type = index.dtype(); + auto axis_v = axis.to(); + if (axis_v != 0) { + if (index_type == phi::DataType::INT32) { + phi::funcs::GatherV2CUDAFunction( + &x, &index, axis_v, out, dev_ctx); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GatherV2CUDAFunction( + &x, &index, axis_v, out, dev_ctx); + } else if (index_type == phi::DataType::INT16) { + phi::funcs::GatherV2CUDAFunction( + &x, &index, axis_v, out, dev_ctx); + } + return; + } + + dev_ctx.template Alloc(out); + + if (x.numel() == 0) return; + if (index_type == phi::DataType::INT32) { + phi::funcs::GPUGather(dev_ctx, x, index, out); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GPUGather(dev_ctx, x, index, out); + } else if (index_type == phi::DataType::INT16) { + phi::funcs::GPUGather(dev_ctx, x, index, out); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(gather, + GPU, + ALL_LAYOUT, + phi::GatherKernel, + float, + double, + int64_t, + int, + int16_t, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/gather_tree_kernel.cu b/paddle/phi/kernels/gpu/gather_tree_kernel.cu index a9e73ec37c8ed5f064144e27b06ac6304f5694b3..2906b81cb40096855fc990040f8d23b832f4da2e 100644 --- a/paddle/phi/kernels/gpu/gather_tree_kernel.cu +++ b/paddle/phi/kernels/gpu/gather_tree_kernel.cu @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/phi/kernels/gather_tree_kernel.h" + #include -#include "paddle/phi/core/device_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/gather_tree_kernel.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/gelu_funcs.h b/paddle/phi/kernels/gpu/gelu_funcs.h new file mode 100644 index 0000000000000000000000000000000000000000..2b9be7c6154354f7fd20b316610521a02801243f --- /dev/null +++ b/paddle/phi/kernels/gpu/gelu_funcs.h @@ -0,0 +1,176 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/platform/flags.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" + +DECLARE_bool(use_fast_math); + +namespace phi { + +#ifdef __NVCC__ +template +static __device__ __forceinline__ float FP32FastTanh(float x) { +#if __CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000 + if (FastMode) { + float y; + asm("tanh.approx.f32 %0,%1; \n\t" : "=f"(y) : "f"(x)); + return y; + } +#endif + return tanhf(x); +} + +template +static __device__ __forceinline__ float FP32GeluFwd(float x) { + auto tanh_out = + FP32FastTanh(0.79788456f * x * (1.0f + 0.044715f * x * x)); + return x * 0.5f * (1.0f + tanh_out); +} + +template +static __device__ __forceinline__ float FP32GeluBwd(float x, float y_g) { + auto tanh_out = + FP32FastTanh(0.79788456f * x * (1.0f + 0.044715f * x * x)); + auto tmp = 0.5f * x * ((1.0f - tanh_out * tanh_out) * + (0.79788456f + 0.1070322243f * x * x)) + + 0.5f * (1.0f + tanh_out); + return tmp * y_g; +} + +template +static __global__ void FP16FastGeluFwdCUDAKernel(const __half* x, + __half* y, + size_t n) { + size_t offset = + static_cast(threadIdx.x + blockIdx.x * blockDim.x) * VecSize; + size_t stride = static_cast(blockDim.x * gridDim.x) * VecSize; + for (; offset < n; offset += stride) { + using ArrT = phi::AlignedVector<__half, VecSize>; + ArrT in_arr = *reinterpret_cast(x + offset); +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + float tmp = __half2float(in_arr[i]); + in_arr[i] = __float2half(FP32GeluFwd(tmp)); + } + *reinterpret_cast(y + offset) = in_arr; + } +} + +template +static __global__ void FP16FastGeluBwdCUDAKernel(const __half* x, + const __half* y_g, + __half* x_g, + size_t n) { + size_t offset = + static_cast(threadIdx.x + blockIdx.x * blockDim.x) * VecSize; + size_t stride = static_cast(blockDim.x * gridDim.x) * VecSize; + for (; offset < n; offset += stride) { + using ArrT = phi::AlignedVector<__half, VecSize>; + ArrT x_in_arr = *reinterpret_cast(x + offset); + ArrT y_g_in_arr = *reinterpret_cast(y_g + offset); +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + __half2 tmp_fp16_2; + tmp_fp16_2.x = x_in_arr[i]; + tmp_fp16_2.y = y_g_in_arr[i]; + float2 tmp_fp32_2 = __half22float2(tmp_fp16_2); + x_in_arr[i] = + __float2half(FP32GeluBwd(tmp_fp32_2.x, tmp_fp32_2.y)); + } + *reinterpret_cast(x_g + offset) = x_in_arr; + } +} + +static bool TryLaunchFP16FastGeluFwdVectorizeCUDAKernel( + const GPUContext& dev_ctx, const __half* x, __half* y, size_t n) { + auto is_aligned = [](const void* p, size_t alignment) { + return reinterpret_cast(p) % alignment == 0; + }; + +#define PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(__vec_size, __use_fast_math) \ + do { \ + constexpr auto kAlignment = \ + alignof(phi::AlignedVector<__half, __vec_size>); \ + if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \ + is_aligned(y, kAlignment)) { \ + size_t thread = std::min(512, dev_ctx.GetMaxThreadsPerBlock()); \ + size_t block = (n / __vec_size + thread - 1) / thread; \ + block = std::min(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \ + VLOG(10) << "Use FP16 fast gelu fwd kernel, block = " << block \ + << " , thread = " << thread; \ + FP16FastGeluFwdCUDAKernel< \ + __vec_size, \ + __use_fast_math><<>>(x, y, n); \ + return true; \ + } \ + } while (0) + + if (FLAGS_use_fast_math) { + PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(8, true); + } else { + PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(8, false); + } + +#undef 
PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL + return false; +} + +static bool TryLaunchFP16FastGeluBwdVectorizeCUDAKernel( + const GPUContext& dev_ctx, + const __half* x, + const __half* y_g, + __half* x_g, + size_t n) { + auto is_aligned = [](const void* p, size_t alignment) { + return reinterpret_cast(p) % alignment == 0; + }; + +#define PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(__vec_size, __use_fast_math) \ + do { \ + constexpr auto kAlignment = \ + alignof(phi::AlignedVector<__half, __vec_size>); \ + if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \ + is_aligned(x, kAlignment) && is_aligned(y_g, kAlignment) && \ + is_aligned(x_g, kAlignment)) { \ + size_t thread = std::min(512, dev_ctx.GetMaxThreadsPerBlock()); \ + size_t block = (n / __vec_size + thread - 1) / thread; \ + block = std::min(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \ + VLOG(10) << "Use FP16 fast gelu bwd kernel, block = " << block \ + << " , thread = " << thread; \ + FP16FastGeluBwdCUDAKernel< \ + __vec_size, \ + __use_fast_math><<>>( \ + x, y_g, x_g, n); \ + return true; \ + } \ + } while (0) + + if (FLAGS_use_fast_math) { + PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(8, true); + } else { + PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(8, false); + } + +#undef PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL + return false; +} +#endif + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/gelu_grad_kernel.cu b/paddle/phi/kernels/gpu/gelu_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..1e21f8d4267bca5363d58b63e0a37d076b4d06af --- /dev/null +++ b/paddle/phi/kernels/gpu/gelu_grad_kernel.cu @@ -0,0 +1,100 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
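// A scalar reference (illustrative, not part of the original files) of the
// tanh-approximated GELU evaluated by FP32GeluFwd above, where 0.79788456f is
// approximately sqrt(2 / pi) and 0.044715f is the usual GELU approximation
// constant. The vectorized __half kernels above compute this in float
// (optionally with a fast tanh approximation) and convert back to half.
#include <cmath>

inline float GeluTanhApproxReference(float x) {
  const float inner = 0.79788456f * x * (1.0f + 0.044715f * x * x);
  return 0.5f * x * (1.0f + std::tanh(inner));
}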
+ +#include "paddle/phi/kernels/gelu_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/gpu/gelu_funcs.h" + +DECLARE_bool(use_fast_math); + +namespace phi { + +template +struct GeluWithApproximateGradFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + inline HOSTDEVICE T operator()(T arg_x, T arg_dout) { + MPType x = static_cast(arg_x); + MPType dout = static_cast(arg_dout); + MPType one = static_cast(1); + MPType half = static_cast(0.5); + MPType kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); + MPType kBeta = + kAlpha * static_cast(GELU_CONSTANT) * static_cast(3); + auto cube_x = x * x * x; + auto tanh_out = + tanh(kAlpha * ((static_cast(GELU_CONSTANT) * cube_x) + x)); + auto ans = + half * (one + tanh_out + + (one - tanh_out * tanh_out) * (x * kAlpha + kBeta * cube_x)); + return static_cast(ans * dout); + } +}; + +template +struct GeluWithoutApproximateGradFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + inline HOSTDEVICE T operator()(T arg_x, T arg_dout) { + MPType x = static_cast(arg_x); + MPType dout = static_cast(arg_dout); + constexpr MPType kBeta = M_2_SQRTPI * M_SQRT1_2 * static_cast(0.5); + const MPType cdf = normcdf(x); + const MPType pdf = exp(static_cast(-0.5) * x * x) * kBeta; + return static_cast(dout * (cdf + x * pdf)); + } +}; + +template +void GeluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + bool approximate, + DenseTensor* x_grad) { + dev_ctx.template Alloc(x_grad); + std::vector ins = {&x, &out_grad}; + std::vector outs = {x_grad}; + if (approximate) { +#ifdef __NVCC__ + if (std::is_same::value) { + size_t n = x.numel(); + const auto* x_ptr = reinterpret_cast(x.data()); + const auto* y_g_ptr = reinterpret_cast(out_grad.data()); + auto* x_g_ptr = reinterpret_cast<__half*>(x_grad->data()); + if (TryLaunchFP16FastGeluBwdVectorizeCUDAKernel( + dev_ctx, x_ptr, y_g_ptr, x_g_ptr, n)) { + return; + } + } +#endif + phi::funcs::BroadcastKernel( + dev_ctx, ins, &outs, 0, GeluWithApproximateGradFunctor()); + } else { + phi::funcs::BroadcastKernel( + dev_ctx, ins, &outs, 0, GeluWithoutApproximateGradFunctor()); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(gelu_grad, + GPU, + ALL_LAYOUT, + phi::GeluGradKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/gelu_kernel.cu b/paddle/phi/kernels/gpu/gelu_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..ce6dda2d6cc6526853cf563779cfe5ad1a21ffe1 --- /dev/null +++ b/paddle/phi/kernels/gpu/gelu_kernel.cu @@ -0,0 +1,90 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/gelu_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/gpu/gelu_funcs.h" + +DECLARE_bool(use_fast_math); + +namespace phi { + +template +struct GeluWithApproximateFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + inline HOSTDEVICE T operator()(T arg_x) { + // this function is tanh approximation of gelu + MPType x = static_cast(arg_x); + MPType one = static_cast(1); + MPType half = static_cast(0.5); + MPType kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); + auto tanh_out = + tanh(kAlpha * x * (one + static_cast(GELU_CONSTANT) * x * x)); + MPType out = x * half * (one + tanh_out); + return static_cast(out); + } +}; + +template +struct GeluWithoutApproximateFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + inline HOSTDEVICE T operator()(T arg_x) { + // actual gelu with approximation = false + MPType x = static_cast(arg_x); + return static_cast(x * normcdf(x)); + } +}; + +template +void GeluKernel(const Context& dev_ctx, + const DenseTensor& x, + bool approximate, + DenseTensor* out) { + dev_ctx.template Alloc(out); + std::vector ins = {&x}; + std::vector outs = {out}; + if (approximate) { +#ifdef __NVCC__ + if (std::is_same::value) { + size_t n = x.numel(); + const auto* in_ptr = reinterpret_cast(x.data()); + auto* out_ptr = reinterpret_cast<__half*>(out->data()); + if (TryLaunchFP16FastGeluFwdVectorizeCUDAKernel( + dev_ctx, in_ptr, out_ptr, n)) { + return; + } + } +#endif + phi::funcs::BroadcastKernel( + dev_ctx, ins, &outs, 0, GeluWithApproximateFunctor()); + } else { + phi::funcs::BroadcastKernel( + dev_ctx, ins, &outs, 0, GeluWithoutApproximateFunctor()); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(gelu, + GPU, + ALL_LAYOUT, + phi::GeluKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu b/paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..457a348be832b006d9f224e3032c369a7fe4bb62 --- /dev/null +++ b/paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu @@ -0,0 +1,324 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/grid_sample_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/gpu/grid_sample_utils.h" + +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" + +namespace phi { + +template +static __forceinline__ __device__ void AtomicAdd( + T* data, int h, int w, int sH, int sW, int H, int W, T delta) { + if (InBounds(h, w, H, W)) { + paddle::platform::CudaAtomicAdd(data + h * sH + w * sW, delta); + } +} + +template +static __forceinline__ __device__ T +UnnormalizeWithMask(T coord, int size, bool align_corners, T* grad_in) { + if (align_corners) { + *grad_in = static_cast(size - 1) / 2; + return ((coord + 1.f) / 2) * (size - 1); + } else { + *grad_in = static_cast(size) / 2; + return ((coord + 1.f) * size - 1) / 2; + } +} + +template +static __forceinline__ __device__ T ClipIndexesWithMask(T in, + int clip_limit, + T* grad_in) { + if (in <= static_cast(0)) { + *grad_in = static_cast(0); + return static_cast(0); + } else { + T max = static_cast(clip_limit - 1); + if (in >= max) { + *grad_in = static_cast(0); + return max; + } else { + *grad_in = static_cast(1); + return in; + } + } +} + +template +static __forceinline__ __device__ T +ReflectIndexesWithMask(T in, int twice_low, int twice_high, T* grad_in) { + if (twice_low == twice_high) { + *grad_in = static_cast(0); + return static_cast(0); + } + int grad_in_mult_; + T min = static_cast(twice_low) / 2; + T span = static_cast(twice_high - twice_low) / 2; + in = in - min; + if (in < static_cast(0)) { + grad_in_mult_ = -1; + in = -in; + } else { + grad_in_mult_ = 1; + } + T extra = fmod(in, span); + int flips = static_cast(floor(in / span)); + if (flips % 2 == 0) { + *grad_in = static_cast(grad_in_mult_); + return extra + min; + } else { + *grad_in = static_cast(-grad_in_mult_); + return span - extra + min; + } +} + +template +static __forceinline__ __device__ T +ComputePositionsWithMask(T coord, + int size, + PaddingMode padding_mode, + bool align_corners, + T* grad_in) { + T grad_clip, grad_refl; + coord = UnnormalizeWithMask(coord, size, align_corners, grad_in); + if (padding_mode == PaddingMode::border) { + coord = ClipIndexesWithMask(coord, size, &grad_clip); + *grad_in = (*grad_in) * grad_clip; + } else if (padding_mode == PaddingMode::reflect) { + if (align_corners) { + coord = ReflectIndexesWithMask(coord, 0, 2 * (size - 1), &grad_refl); + } else { + coord = ReflectIndexesWithMask(coord, -1, 2 * size - 1, &grad_refl); + } + coord = ClipIndexesWithMask(coord, size, &grad_clip); + *grad_in = (*grad_in) * grad_refl * grad_clip; + } + + return coord; +} + +template +__global__ void GridSamplerCudaBackwardKernel(const int nthreads, + const T* grad_output, + const T* input, + const T* grid, + int n, + int out_c, + int out_h, + int out_w, + int in_h, + int in_w, + T* grad_input, + T* grad_grid, + const Mode mode, + const PaddingMode padding_mode, + bool align_corners) { + int inp_sN = out_c * in_h * in_w; + int inp_sC = in_h * in_w; + int inp_sH = in_w; + int inp_sW = 1; + int grid_sN = out_h * out_w * 2; + int grid_sH = out_w * 2; + int grid_sW = 2; + int grid_sCoor = 1; + + int gOut_sN = out_c * out_h * out_w; + int gOut_sC = out_h * out_w; + int gOut_sH = out_w; + int gOut_sW = 1; + + CUDA_KERNEL_LOOP(index, nthreads) { + const int w = index % out_w; + 
const int h = (index / out_w) % out_h; + const int n = index / (out_h * out_w); + const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; + + T ix = grid[grid_offset]; + T iy = grid[grid_offset + grid_sCoor]; + + T gix_mult, giy_mult; + ix = ComputePositionsWithMask( + ix, in_w, padding_mode, align_corners, &gix_mult); + iy = ComputePositionsWithMask( + iy, in_h, padding_mode, align_corners, &giy_mult); + + if (mode == Mode::bilinear) { + int ix_nw = static_cast(floor(ix)); + int iy_nw = static_cast(floor(iy)); + int ix_ne = ix_nw + 1; + int iy_ne = iy_nw; + int ix_sw = ix_nw; + int iy_sw = iy_nw + 1; + int ix_se = ix_nw + 1; + int iy_se = iy_nw + 1; + + T nw = (ix_se - ix) * (iy_se - iy); + T ne = (ix - ix_sw) * (iy_sw - iy); + T sw = (ix_ne - ix) * (iy - iy_ne); + T se = (ix - ix_nw) * (iy - iy_nw); + + T gix = static_cast(0), giy = static_cast(0); + int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW; + T* gInp_ptr_NC = grad_input + n * inp_sN; + int inp_offset_NC = n * inp_sN; + for (int c = 0; c < out_c; ++c, + inp_offset_NC += inp_sC, + gInp_ptr_NC += inp_sC, + gOut_offset += gOut_sC) { + T gOut = grad_output[gOut_offset]; + + AtomicAdd( + gInp_ptr_NC, iy_nw, ix_nw, inp_sH, inp_sW, in_h, in_w, nw * gOut); + AtomicAdd( + gInp_ptr_NC, iy_ne, ix_ne, inp_sH, inp_sW, in_h, in_w, ne * gOut); + AtomicAdd( + gInp_ptr_NC, iy_sw, ix_sw, inp_sH, inp_sW, in_h, in_w, sw * gOut); + AtomicAdd( + gInp_ptr_NC, iy_se, ix_se, inp_sH, inp_sW, in_h, in_w, se * gOut); + + if (InBounds(iy_nw, ix_nw, in_h, in_w)) { + T nw_val = input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW]; + gix -= nw_val * (iy_se - iy) * gOut; + giy -= nw_val * (ix_se - ix) * gOut; + } + if (InBounds(iy_ne, ix_ne, in_h, in_w)) { + T ne_val = input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW]; + gix += ne_val * (iy_sw - iy) * gOut; + giy -= ne_val * (ix - ix_sw) * gOut; + } + if (InBounds(iy_sw, ix_sw, in_h, in_w)) { + T sw_val = input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW]; + gix -= sw_val * (iy - iy_ne) * gOut; + giy += sw_val * (ix_ne - ix) * gOut; + } + if (InBounds(iy_se, ix_se, in_h, in_w)) { + T se_val = input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW]; + gix += se_val * (iy - iy_nw) * gOut; + giy += se_val * (ix - ix_nw) * gOut; + } + } + + if (grad_grid != nullptr) { + T* gGrid_ptr_NHW = grad_grid + index * grid_sW; + gGrid_ptr_NHW[0] = gix_mult * gix; + gGrid_ptr_NHW[1] = giy_mult * giy; + } + } else if (mode == Mode::nearest) { + int ix_nearest = static_cast(std::nearbyint(ix)); + int iy_nearest = static_cast(std::nearbyint(iy)); + + int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW; + T* gInp_ptr_NC = grad_input + n * inp_sN; + for (int c = 0; c < out_c; + ++c, gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) { + AtomicAdd(gInp_ptr_NC, + iy_nearest, + ix_nearest, + inp_sH, + inp_sW, + in_h, + in_w, + grad_output[gOut_offset]); + } + + if (grad_grid != nullptr) { + T* gGrid_ptr_NHW = grad_grid + index * grid_sW; + gGrid_ptr_NHW[0] = static_cast(0); + gGrid_ptr_NHW[1] = static_cast(0); + } + } + } +} + +template +void GridSampleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& grid, + const DenseTensor& out_grad, + const std::string& mode, + const std::string& padding_mode, + bool align_corners, + DenseTensor* x_grad, + DenseTensor* grid_grad) { + PaddingMode enum_padding_mode; + Mode enum_mode; + if (padding_mode == "border") { + enum_padding_mode = PaddingMode::border; + } else if (padding_mode == "reflection") { + enum_padding_mode = 
PaddingMode::reflect; + } else { + enum_padding_mode = PaddingMode::zeros; + } + + if (mode == "nearest") { + enum_mode = Mode::nearest; + } else { + enum_mode = Mode::bilinear; + } + + const int n = grid.dims()[0]; + const int out_h = grid.dims()[1]; + const int out_w = grid.dims()[2]; + const int c = x.dims()[1]; + const int in_h = x.dims()[2]; + const int in_w = x.dims()[3]; + + dev_ctx.template Alloc(x_grad); + phi::funcs::SetConstant()(dev_ctx, x_grad, static_cast(0)); + + T* grid_grad_data = nullptr; + if (grid_grad != nullptr) { + grid_grad_data = dev_ctx.template Alloc(grid_grad); + } + + int count = static_cast(n * out_h * out_w); + auto cu_stream = dev_ctx.stream(); + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count); + GridSamplerCudaBackwardKernel< + T><<>>( + count, + out_grad.data(), + x.data(), + grid.data(), + n, + c, + out_h, + out_w, + in_h, + in_w, + x_grad->data(), + grid_grad_data, + enum_mode, + enum_padding_mode, + align_corners); +} + +} // namespace phi + +PD_REGISTER_KERNEL(grid_sample_grad, + GPU, + ALL_LAYOUT, + phi::GridSampleGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/grid_sample_kernel.cu b/paddle/phi/kernels/gpu/grid_sample_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..f611b46911c4f1555ad27a538d8918f11ae761cc --- /dev/null +++ b/paddle/phi/kernels/gpu/grid_sample_kernel.cu @@ -0,0 +1,233 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
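// A reference (illustrative, not part of the original files) of the grid
// coordinate unnormalization used by UnnormalizeWithMask above: grid values in
// [-1, 1] are mapped to pixel space, where `size` is the extent of the sampled
// dimension. With align_corners the endpoints land on the first/last pixel
// centers; without it they land on the outer pixel edges.
inline float UnnormalizeReference(float coord, int size, bool align_corners) {
  if (align_corners) {
    return (coord + 1.f) / 2.f * (size - 1);
  }
  return ((coord + 1.f) * size - 1.f) / 2.f;
}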
+ +#include "paddle/phi/kernels/grid_sample_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/grid_sample_utils.h" + +namespace phi { + +template +static __forceinline__ __device__ T Unnormalize(T coord, + int size, + bool align_corners) { + if (align_corners) { + return ((coord + 1.f) / 2) * (size - 1); + } else { + return ((coord + 1.f) * size - 1) / 2; + } +} + +template +static __forceinline__ __device__ T ClipIndexes(T in, int max_value) { + return min(static_cast(max_value), max(in, static_cast(0))); +} + +template +static __forceinline__ __device__ T ReflectIndexes(T in, + int twice_low, + int twice_high) { + if (twice_low == twice_high) { + return static_cast(0); + } + T min = static_cast(twice_low) / 2; + T span = static_cast(twice_high - twice_low) / 2; + in = fabs(in - min); + T extra = fmod(in, span); + int flips = static_cast(floor(in / span)); + if (flips % 2 == 0) { + return extra + min; + } else { + return span - extra + min; + } +} + +template +static __forceinline__ __device__ T ComputePositions(T coord, + int size, + PaddingMode padding_mode, + bool align_corners) { + coord = Unnormalize(coord, size, align_corners); + if (padding_mode == PaddingMode::border) { + coord = ClipIndexes(coord, size - 1); + } else if (padding_mode == PaddingMode::reflect) { + if (align_corners) { + coord = ReflectIndexes(coord, 0, 2 * (size - 1)); + } else { + coord = ReflectIndexes(coord, -1, 2 * size - 1); + } + coord = ClipIndexes(coord, size - 1); + } + return coord; +} + +template +__global__ void GridSampleCudaKernel(const int nthreads, + int n, + int out_c, + int out_h, + int out_w, + int in_h, + int in_w, + const T* input, + const T* grid, + T* output, + const Mode mode, + const PaddingMode padding_mode, + bool align_corners) { + int inp_sN = out_c * in_h * in_w; + + int inp_sC = in_h * in_w; + int inp_sH = in_w; + int inp_sW = 1; + int grid_sN = out_h * out_w * 2; + int grid_sH = out_w * 2; + int grid_sW = 2; + int grid_sCoor = 1; + int out_sN = out_c * out_h * out_w; + int out_sC = out_h * out_w; + int out_sH = out_w; + int out_sW = 1; + CUDA_KERNEL_LOOP(index, nthreads) { + const int w = index % out_w; + const int h = (index / out_w) % out_h; + const int n = index / (out_h * out_w); + const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; + + T ix = grid[grid_offset]; + T iy = grid[grid_offset + grid_sCoor]; + + ix = ComputePositions(ix, in_w, padding_mode, align_corners); + iy = ComputePositions(iy, in_h, padding_mode, align_corners); + if (mode == Mode::bilinear) { + int ix_nw = static_cast(floor(ix)); + int iy_nw = static_cast(floor(iy)); + int ix_ne = ix_nw + 1; + int iy_ne = iy_nw; + int ix_sw = ix_nw; + int iy_sw = iy_nw + 1; + int ix_se = ix_nw + 1; + int iy_se = iy_nw + 1; + + T nw = (ix_se - ix) * (iy_se - iy); + T ne = (ix - ix_sw) * (iy_sw - iy); + T sw = (ix_ne - ix) * (iy - iy_ne); + T se = (ix - ix_nw) * (iy - iy_nw); + + auto inp_offset_NC = n * inp_sN; + + auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW; + for (int c = 0; c < out_c; + ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) { + *out_ptr_NCHW = static_cast(0); + if (InBounds(iy_nw, ix_nw, in_h, in_w)) { + *out_ptr_NCHW += + input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW] * nw; + } + if (InBounds(iy_ne, ix_ne, in_h, in_w)) { + *out_ptr_NCHW += + input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW] * ne; + } + if 
(InBounds(iy_sw, ix_sw, in_h, in_w)) { + *out_ptr_NCHW += + input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW] * sw; + } + if (InBounds(iy_se, ix_se, in_h, in_w)) { + *out_ptr_NCHW += + input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW] * se; + } + } + } else if (mode == Mode::nearest) { + int ix_nearest = static_cast(std::nearbyint(ix)); + int iy_nearest = static_cast(std::nearbyint(iy)); + auto inp_offset_NC = n * inp_sN; + auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW; + for (int c = 0; c < out_c; + ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) { + if (InBounds(iy_nearest, ix_nearest, in_h, in_w)) { + *out_ptr_NCHW = + input[inp_offset_NC + iy_nearest * inp_sH + ix_nearest * inp_sW]; + } else { + *out_ptr_NCHW = static_cast(0); + } + } + } + } +} + +template +void GridSampleKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& grid, + const std::string& mode, + const std::string& padding_mode, + bool align_corners, + DenseTensor* out) { + PaddingMode enum_padding_mode; + Mode enum_mode; + if (padding_mode == "border") { + enum_padding_mode = PaddingMode::border; + } else if (padding_mode == "reflection") { + enum_padding_mode = PaddingMode::reflect; + } else { + enum_padding_mode = PaddingMode::zeros; + } + + if (mode == "nearest") { + enum_mode = Mode::nearest; + } else { + enum_mode = Mode::bilinear; + } + + const int n = grid.dims()[0]; + const int out_h = grid.dims()[1]; + const int out_w = grid.dims()[2]; + const int c = x.dims()[1]; + const int in_h = x.dims()[2]; + const int in_w = x.dims()[3]; + VLOG(3) << "n: " << n << "; c: " << c << "; out_h: " << out_h + << "; out_w: " << out_w; + + auto* output_data = dev_ctx.template Alloc(out); + VLOG(3) << "out dims: " << out->dims()[0] << "; " << out->dims()[1] << "; " + << out->dims()[2] << "; " << out->dims()[3]; + + int count = static_cast(n * out_h * out_w); + auto cu_stream = dev_ctx.stream(); + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count); + GridSampleCudaKernel< + T><<>>( + count, + n, + c, + out_h, + out_w, + in_h, + in_w, + x.data(), + grid.data(), + output_data, + enum_mode, + enum_padding_mode, + align_corners); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + grid_sample, GPU, ALL_LAYOUT, phi::GridSampleKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/grid_sample_utils.h b/paddle/phi/kernels/gpu/grid_sample_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..098eb9defb54904c41f33326b54eabdda657360a --- /dev/null +++ b/paddle/phi/kernels/gpu/grid_sample_utils.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
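// A reference (illustrative, not part of the original files) of the bilinear
// weights computed in GridSampleCudaKernel above: for a sample point (ix, iy)
// whose north-west integer neighbour is (ix_nw, iy_nw), each of the four
// surrounding pixels is weighted by the area of the cell rectangle opposite to
// it, and the four weights sum to one inside the cell.
struct BilinearWeights {
  float nw, ne, sw, se;
};

inline BilinearWeights BilinearWeightsReference(float ix, float iy, int ix_nw,
                                                int iy_nw) {
  const int ix_se = ix_nw + 1;
  const int iy_se = iy_nw + 1;
  BilinearWeights w;
  w.nw = (ix_se - ix) * (iy_se - iy);
  w.ne = (ix - ix_nw) * (iy_se - iy);
  w.sw = (ix_se - ix) * (iy - iy_nw);
  w.se = (ix - ix_nw) * (iy - iy_nw);
  return w;
}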
+ +#pragma once + +namespace phi { + +enum class Mode { + bilinear, + nearest, +}; + +enum class PaddingMode { zeros, border, reflect }; + +static __forceinline__ __device__ bool InBounds(int h, int w, int H, int W) { + return h >= 0 && h < H && w >= 0 && w < W; +} + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..a393eecd51242193fa3b2192ff8e8f1111d350b6 --- /dev/null +++ b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu @@ -0,0 +1,141 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/index_select_grad_kernel.h" + +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/data_type.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void index_select_grad_cuda_kernel(const T* output_grad, + T* input_grad, + const IndexT* index, + int64_t nums, + int64_t N, + int64_t stride, + int64_t size, + int64_t delta) { + int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= N) { + return; + } + + int64_t pre_idx = idx / (stride * size); + int64_t dim_idx = idx % (stride * size) / stride; + IndexT src_dim_idx = index[dim_idx]; + int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride; + paddle::platform::CudaAtomicAdd(&input_grad[input_idx], output_grad[idx]); +} + +template +__global__ void index_select_grad_init(T* input_grad, int64_t N) { + int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= N) { + return; + } + input_grad[idx] = 0.0; +} + +template +void IndexSelectGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& out_grad, + int dim, + DenseTensor* x_grad) { + auto* output_grad_data = out_grad.data(); + auto* in_grad_data = ctx.template Alloc(x_grad); + + auto input_dim = x_grad->dims(); + auto output_dim = out_grad.dims(); + dim = dim >= 0 ? 
dim : dim + input_dim.size(); + auto stride_dim = phi::stride(input_dim); + int64_t stride = stride_dim[dim]; + int64_t size = output_dim[dim]; + int64_t delta = input_dim[dim] - size; + const auto& index_type = index.dtype(); + + bool index_type_match = + index_type == phi::DataType::INT64 || index_type == phi::DataType::INT32; + PADDLE_ENFORCE_EQ(index_type_match, + true, + phi::errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + index_type, + phi::DataType::INT32, + phi::DataType::INT64)); + + int64_t numel = x_grad->numel(); + int64_t index_nums = index.numel(); + int64_t out_nums = out_grad.numel(); + + auto stream = ctx.stream(); + + index_select_grad_init< + T><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, + 0, + stream>>>(in_grad_data, numel); + + if (index_type == phi::DataType::INT64) { + const int64_t* index_data = index.data(); + index_select_grad_cuda_kernel<<< + (out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, + 0, + stream>>>(output_grad_data, + in_grad_data, + index_data, + index_nums, + out_nums, + stride, + size, + delta); + phi::backends::gpu::GpuStreamSync(stream); + } else { + const int* index_data = index.data(); + index_select_grad_cuda_kernel<<< + (out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, + 0, + stream>>>(output_grad_data, + in_grad_data, + index_data, + index_nums, + out_nums, + stride, + size, + delta); + phi::backends::gpu::GpuStreamSync(stream); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(index_select_grad, + GPU, + ALL_LAYOUT, + phi::IndexSelectGradKernel, + float, + double, + phi::dtype::float16, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/index_select_kernel.cu b/paddle/phi/kernels/gpu/index_select_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..f774522318acb8f44798030870886dd1dc7accc1 --- /dev/null +++ b/paddle/phi/kernels/gpu/index_select_kernel.cu @@ -0,0 +1,109 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
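+//
+// Indexing scheme used by index_select_cuda_kernel below: each thread owns one
+// output element with flat index idx. With stride = phi::stride(input_dim)[dim],
+// size = output_dim[dim] and delta = input_dim[dim] - size, the thread recovers
+//   pre_idx = idx / (stride * size)   and   dim_idx = (idx % (stride * size)) / stride,
+// then reads input[idx + (delta * pre_idx + index[dim_idx] - dim_idx) * stride].
+// Worked example (hypothetical shapes): x of shape [2, 4, 3], dim = 1,
+// index = {0, 2} gives stride = 3, size = 2, delta = 2, so output element
+// (1, 1, 2) has idx = 11 and reads input_idx = 11 + (2 * 1 + 2 - 1) * 3 = 20,
+// i.e. x[1, 2, 2] = x[1, index[1], 2].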
+ +#include "paddle/phi/kernels/index_select_kernel.h" + +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/data_type.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void index_select_cuda_kernel(const T* input, + T* output, + const IndexT* index, + int64_t N, + int64_t stride, + int64_t size, + int64_t delta) { + int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= N) { + return; + } + + int64_t pre_idx = idx / (stride * size); + int64_t dim_idx = idx % (stride * size) / stride; + IndexT src_dim_idx = index[dim_idx]; + int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride; + output[idx] = input[input_idx]; +} + +template +void IndexSelectKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& index, + int dim, + DenseTensor* output) { + auto input_dim = x.dims(); + auto output_dim = output->dims(); + dim = dim >= 0 ? dim : dim + input_dim.size(); + auto stride_dim = phi::stride(input_dim); + int64_t stride = stride_dim[dim]; + int64_t size = output_dim[dim]; + int64_t delta = input_dim[dim] - size; + const auto& index_type = index.dtype(); + + bool index_type_match = + index_type == phi::DataType::INT64 || index_type == phi::DataType::INT32; + PADDLE_ENFORCE_EQ(index_type_match, + true, + phi::errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + index_type, + phi::DataType::INT32, + phi::DataType::INT64)); + + auto* in_data = x.data(); + T* out_data = ctx.template Alloc(output); + + int64_t numel = output->numel(); + auto stream = ctx.stream(); + + if (index_type == phi::DataType::INT64) { + const int64_t* index_data = index.data(); + index_select_cuda_kernel<<< + (numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, + 0, + stream>>>(in_data, out_data, index_data, numel, stride, size, delta); + phi::backends::gpu::GpuStreamSync(stream); + } else { + const int* index_data = index.data(); + index_select_cuda_kernel< + T, + int><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, + 0, + stream>>>( + in_data, out_data, index_data, numel, stride, size, delta); + phi::backends::gpu::GpuStreamSync(stream); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(index_select, + GPU, + ALL_LAYOUT, + phi::IndexSelectKernel, + float, + double, + phi::dtype::float16, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/isclose_kernel.cu b/paddle/phi/kernels/gpu/isclose_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..34774ec715c48de953945f94624c2c3cfe742d30 --- /dev/null +++ b/paddle/phi/kernels/gpu/isclose_kernel.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
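+//
+// Thin registration unit: the element-wise isclose comparison itself lives in
+// paddle/phi/kernels/impl/isclose_kernel_impl.h; this file only pulls that
+// implementation in and registers the float/double GPU kernels.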
+ +#include "paddle/phi/kernels/isclose_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/isclose_kernel_impl.h" + +PD_REGISTER_KERNEL( + isclose, GPU, ALL_LAYOUT, phi::IscloseKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/kldiv_loss_grad_kernel.cu b/paddle/phi/kernels/gpu/kldiv_loss_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..8ca53f021f054278b813fedb69db31f4d2c5aaf6 --- /dev/null +++ b/paddle/phi/kernels/gpu/kldiv_loss_grad_kernel.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/kldiv_loss_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/kldiv_loss_grad_kernel_impl.h" +PD_REGISTER_KERNEL( + kldiv_loss_grad, GPU, ALL_LAYOUT, phi::KLDivLossGradKernel, float, double) { +} diff --git a/paddle/phi/kernels/gpu/kldiv_loss_kernel.cu b/paddle/phi/kernels/gpu/kldiv_loss_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..9388ac7071c3197c2a8774ffd5f0d670cdf8a8b2 --- /dev/null +++ b/paddle/phi/kernels/gpu/kldiv_loss_kernel.cu @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/kldiv_loss_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/kldiv_loss_kernel_impl.h" +PD_REGISTER_KERNEL( + kldiv_loss, GPU, ALL_LAYOUT, phi::KLDivLossKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/kron_grad_kernel.cu b/paddle/phi/kernels/gpu/kron_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..13ef2adaab3f32791e5c108b3f12b217e5dcea07 --- /dev/null +++ b/paddle/phi/kernels/gpu/kron_grad_kernel.cu @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/kron_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/kron_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(kron_grad, + GPU, + ALL_LAYOUT, + phi::KronGradKernel, + int, + int64_t, + float, + double, + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/kron_kernel.cu b/paddle/phi/kernels/gpu/kron_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..a2124fd5af7d79cf6d1227a73105dd3e5b729547 --- /dev/null +++ b/paddle/phi/kernels/gpu/kron_kernel.cu @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/kron_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/kron_kernel_impl.h" + +PD_REGISTER_KERNEL(kron, + GPU, + ALL_LAYOUT, + phi::KronKernel, + int, + int64_t, + float, + double, + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/kthvalue_grad_kernel.cu b/paddle/phi/kernels/gpu/kthvalue_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..f6e96046a2bd799f4a6b8d30a239afb505582deb --- /dev/null +++ b/paddle/phi/kernels/gpu/kthvalue_grad_kernel.cu @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
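+//
+// Backward pass for kthvalue: getBlockSize() picks a power-of-two block size
+// from post * k, and AssignGradWithAxis (from top_k_function_cuda.h) scatters
+// each element of d_out back to the position recorded in `indices` along
+// `axis` (k = 1 here, since kthvalue keeps a single value per slice); every
+// other element of x_grad is zero, as required by the gradient of a selection.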
+ +#include "paddle/phi/kernels/kthvalue_grad_kernel.h" + +#include "paddle/fluid/operators/top_k_function_cuda.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +static int getBlockSize(int col) { + if (col > 512) + return 1024; + else if (col > 256 && col <= 512) + return 512; + else if (col > 128 && col <= 256) + return 256; + else if (col > 64 && col <= 128) + return 128; + else + return 64; +} + +template +void KthvalueGradKernel(const Context& dev_ctx, + const DenseTensor& d_out, + const DenseTensor& x, + const DenseTensor& indices, + int k, + int axis, + bool keepdim, + DenseTensor* d_x) { + const auto& in_dims = x.dims(); + auto out_dims = indices.dims(); + if (axis < 0) axis += in_dims.size(); + T* x_grad_data = dev_ctx.template Alloc(d_x); + const T* out_grad_data = d_out.data(); + const int64_t* indices_data = indices.data(); + int pre, n, post; + paddle::operators::GetDims(in_dims, axis, &pre, &n, &post); + int block_size = getBlockSize(post * k); + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); + int grid_size = std::min(max_blocks, pre); + paddle::operators::AssignGradWithAxis< + T><<>>( + out_grad_data, indices_data, x_grad_data, pre, post, n, 1); +} + +} // namespace phi + +PD_REGISTER_KERNEL(kthvalue_grad, + GPU, + ALL_LAYOUT, + phi::KthvalueGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/kthvalue_kernel.cu b/paddle/phi/kernels/gpu/kthvalue_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..4218e153ec29bd1757b2405f0af638040de9bff2 --- /dev/null +++ b/paddle/phi/kernels/gpu/kthvalue_kernel.cu @@ -0,0 +1,252 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
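+//
+// Last-axis path: SortKthvalue() flattens the input to [num_rows, num_cols],
+// runs cub::DeviceSegmentedRadixSort::SortPairs twice (a first call with a null
+// workspace to query temp_storage_bytes, then the actual sort of values paired
+// with their original column indices, one segment per row), and finally takes
+// column k - 1 of the sorted values/indices with an Eigen slice to obtain the
+// k-th smallest element and its index. For any other axis the kernel transposes
+// that axis to the end, reuses the same routine, and transposes the result back.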
+ +#include "paddle/phi/kernels/kthvalue_kernel.h" + +#include "paddle/fluid/operators/top_k_function_cuda.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { +inline int getBlockSize(int col) { + if (col > 512) + return 1024; + else if (col > 256 && col <= 512) + return 512; + else if (col > 128 && col <= 256) + return 256; + else if (col > 64 && col <= 128) + return 128; + else + return 64; +} + +template +bool SortKthvalue(const phi::GPUContext& dev_ctx, + const DenseTensor* input_tensor, + const int64_t num_cols, + const int64_t num_rows, + const int k, + DenseTensor* out_tensor, + DenseTensor* indices_tensor) { + auto cu_stream = dev_ctx.stream(); + DenseTensor input_indices; + const std::vector dims = {num_rows, num_cols}; + auto dim = phi::make_ddim(dims); + input_indices.Resize(dim); + dev_ctx.template Alloc(&input_indices); + size_t temp_storage_bytes = -1; + int block_size = getBlockSize(num_cols); + unsigned int maxGridDimX = dev_ctx.GetCUDAMaxGridDimSize()[0]; + unsigned int grid_size = num_rows < maxGridDimX + ? static_cast(num_rows) + : maxGridDimX; + paddle::operators::InitIndex< + int64_t><<>>( + input_indices.data(), num_rows, num_cols); + cub::CountingInputIterator counting_iter(0); + cub::TransformInputIterator> + segment_offsets_t(counting_iter, + paddle::operators::SegmentOffsetIter(num_cols)); + T* sorted_values_ptr; + int64_t* sorted_indices_ptr; + DenseTensor temp_values, temp_indices; + const T* input = input_tensor->data(); + T* values = out_tensor->data(); + int64_t* indices = indices_tensor->mutable_data(dev_ctx.GetPlace()); + temp_values.Resize(dim); + temp_indices.Resize(dim); + sorted_values_ptr = dev_ctx.template Alloc(&temp_values); + sorted_indices_ptr = dev_ctx.template Alloc(&temp_indices); + auto err = + cub::DeviceSegmentedRadixSort::SortPairs(nullptr, + temp_storage_bytes, + input, + sorted_values_ptr, + input_indices.data(), + sorted_indices_ptr, + num_cols * num_rows, + num_rows, + segment_offsets_t, + segment_offsets_t + 1, + 0, + sizeof(T) * 8, + cu_stream); +#ifdef __HIPCC__ + if (err != hipSuccess) { + LOG(ERROR) << "KthvalueOP failed as could not launch " + "hipcub::DeviceSegmentedRadixSort::SortPairs, status: " + << hipGetErrorString(err); + return false; + } +#else + if (err != cudaSuccess) { + LOG(ERROR) << "KthvalueOP failed as could not launch " + "cub::DeviceSegmentedRadixSort::SortPairs, status: " + << cudaGetErrorString(err); + return false; + } +#endif + DenseTensor temp_storage; + temp_storage.Resize({static_cast(temp_storage_bytes / sizeof(uint8_t))}); + uint8_t* temp_storage_data = dev_ctx.template Alloc(&temp_storage); + + err = cub::DeviceSegmentedRadixSort::SortPairs(temp_storage_data, + temp_storage_bytes, + input, + sorted_values_ptr, + input_indices.data(), + sorted_indices_ptr, + num_cols * num_rows, + num_rows, + segment_offsets_t, + segment_offsets_t + 1, + 0, + sizeof(T) * 8, + cu_stream); +#ifdef __HIPCC__ + if (err != hipSuccess) { + LOG(ERROR) << "KthvalueOP failed as could not launch " + "hipcub::DeviceSegmentedRadixSort::SortPairs, " + << temp_storage_bytes << ", status: " << hipGetErrorString(err); + return false; + } +#else + if (err != cudaSuccess) { + LOG(ERROR) << "KthvalueOP failed as could not launch " + "cub::DeviceSegmentedRadixSort::SortPairs, " + << temp_storage_bytes << ", 
status: " << cudaGetErrorString(err); + return false; + } +#endif + auto& dev = *dev_ctx.eigen_device(); + const Eigen::DSizes slice_indices{0, k - 1}; + const Eigen::DSizes slice_sizes{num_rows, 1}; + auto e_indices = EigenMatrix::From(*indices_tensor, dim); + auto e_tmp_indices = + EigenMatrix::From(static_cast(temp_indices)); + std::vector odims = {static_cast(num_rows), static_cast(1)}; + dim = phi::make_ddim(odims); + auto e_values = EigenMatrix::From(*out_tensor, dim); + auto e_tmp_values = + EigenMatrix::From(static_cast(temp_values)); + + funcs::EigenSlice, int64_t, 2>::Eval( + dev, e_indices, e_tmp_indices, slice_indices, slice_sizes); + funcs::EigenSlice, T, 2>::Eval( + dev, e_values, e_tmp_values, slice_indices, slice_sizes); + return true; +} + +template +void KthvalueKernel(const Context& dev_ctx, + const DenseTensor& x, + int k, + int axis, + bool keepdim, + DenseTensor* output, + DenseTensor* indices) { + const auto& in_dims = x.dims(); + if (axis < 0) axis += in_dims.size(); + auto out_dims = output->dims(); + const T* input_data = x.data(); + T* output_data = dev_ctx.template Alloc(output); + int64_t* indices_data = dev_ctx.template Alloc(indices); + + if (axis == in_dims.size() - 1) { + const int64_t& input_height = + phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t& input_width = in_dims[in_dims.size() - 1]; + PADDLE_ENFORCE_EQ( + SortKthvalue( + dev_ctx, &x, input_width, input_height, k, output, indices), + true, + phi::errors::External("KthvalueOP: Error when use cub sorting")); + return; + } else { + std::vector trans; + for (int i = 0; i < axis; i++) { + trans.emplace_back(i); + } + trans.emplace_back(in_dims.size() - 1); + for (int i = axis + 1; i < in_dims.size() - 1; i++) { + trans.emplace_back(i); + } + trans.emplace_back(axis); + if (!keepdim) { + std::vector tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + DDim tmp_out_dims = phi::make_ddim(tmp_out_shape); + output->Resize(tmp_out_dims); + indices->Resize(tmp_out_dims); + } + DDim trans_dims(in_dims); + DDim trans_out_dims(in_dims); + for (int i = 0; i < trans.size(); i++) { + trans_dims[i] = in_dims[trans[i]]; + trans_out_dims[i] = in_dims[trans[i]]; + } + trans_out_dims[in_dims.size() - 1] = 1; + DenseTensor trans_input; + trans_input.mutable_data(trans_dims, dev_ctx.GetPlace()); + int ndims = trans.size(); + funcs::TransCompute( + ndims, dev_ctx, x, &trans_input, trans); + DenseTensor trans_ind, trans_out; + trans_ind.mutable_data(trans_out_dims, dev_ctx.GetPlace()); + trans_out.mutable_data(trans_out_dims, dev_ctx.GetPlace()); + const int64_t input_height = + phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); + const int64_t input_width = trans_dims[trans_dims.size() - 1]; + PADDLE_ENFORCE_EQ( + SortKthvalue(dev_ctx, + &trans_input, + input_width, + input_height, + k, + &trans_out, + &trans_ind), + true, + phi::errors::External("KthvalueOP: Error when use cub sorting")); + funcs::TransCompute( + ndims, dev_ctx, trans_ind, indices, trans); + funcs::TransCompute( + ndims, dev_ctx, trans_out, output, trans); + if (!keepdim) { + output->Resize(out_dims); + indices->Resize(out_dims); + } + } +} +} // namespace phi + +PD_REGISTER_KERNEL(kthvalue, + GPU, + ALL_LAYOUT, + phi::KthvalueKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu 
b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..c3f7a5261712a1d33bb4ad47dd080a489b303717 --- /dev/null +++ b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu @@ -0,0 +1,139 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/layer_norm_grad_kernel.h" + +#include "paddle/fluid/operators/layer_norm_kernel.cu.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/layer_norm_util.h" + +namespace phi { + +template +void LayerNormGradKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &mean, + const DenseTensor &variance, + paddle::optional scale_opt, + paddle::optional bias_opt, + const DenseTensor &out_grad, + float epsilon, + int begin_norm_axis, + bool is_test, + DenseTensor *x_grad, + DenseTensor *scale_grad, + DenseTensor *bias_grad) { + using U = paddle::operators::LayerNormParamType; + // d_x, d_scale, d_bias may be nullptr + auto *d_x = x_grad; + auto *d_scale = scale_grad; + auto *d_bias = bias_grad; + + auto *scale = scale_opt.get_ptr(); + auto *bias = bias_opt.get_ptr(); + auto *d_y = &out_grad; + + const auto &x_dims = x.dims(); + auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); + int64_t batch_size = static_cast(matrix_dim[0]); + int64_t feature_size = static_cast(matrix_dim[1]); + + auto *x_data = x.data(); + auto *d_y_data = d_y->data(); + + auto *mean_data = mean.data(); + auto *var_data = variance.data(); + + auto *d_x_data = (d_x == nullptr ? nullptr : dev_ctx.template Alloc(d_x)); + + auto x_dtype = x.dtype(); + + phi::DataType scale_bias_dtype; + if (scale != nullptr) { + scale_bias_dtype = scale->dtype(); + } else { + // FIXME(zengjinle): do not find a better way to get the right + // data type of the d_scale and d_bias if scale == nullptr. + if (bias != nullptr) { + scale_bias_dtype = bias->dtype(); + } else { + scale_bias_dtype = x_dtype; + } + } + +#define PADDLE_LAUNCH_LAYERNORM_BWD(ScaleBiasT, IsScaleBiasSameDTypeWithX) \ + do { \ + auto *scale_data = \ + (scale == nullptr ? nullptr : scale->data()); \ + auto *d_scale_data = \ + (d_scale == nullptr ? nullptr \ + : dev_ctx.template Alloc(d_scale)); \ + auto *d_bias_data = \ + (d_bias == nullptr ? nullptr \ + : dev_ctx.template Alloc(d_bias)); \ + auto *d_x_data = \ + (d_x == nullptr ? 
nullptr : dev_ctx.template Alloc(d_x)); \ + paddle::operators::LayerNormBackward( \ + x_data, \ + d_y_data, \ + scale_data, \ + mean_data, \ + var_data, \ + d_x_data, \ + d_scale_data, \ + d_bias_data, \ + epsilon, \ + batch_size, \ + feature_size, \ + dev_ctx); \ + } while (0) + + if (scale_bias_dtype == x_dtype) { + PADDLE_LAUNCH_LAYERNORM_BWD(T, true); + } else { + PADDLE_LAUNCH_LAYERNORM_BWD(U, false); + } + +#undef PADDLE_LAUNCH_LAYERNORM_BWD +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +// MIOPEN do not support double +PD_REGISTER_KERNEL(layer_norm_grad, + GPU, + ALL_LAYOUT, + phi::LayerNormGradKernel, + float, + phi::dtype::float16) {} +#elif CUDNN_VERSION_MIN(8, 1, 0) +PD_REGISTER_KERNEL(layer_norm_grad, + GPU, + ALL_LAYOUT, + phi::LayerNormGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#else +PD_REGISTER_KERNEL(layer_norm_grad, + GPU, + ALL_LAYOUT, + phi::LayerNormGradKernel, + float, + double, + phi::dtype::float16) {} +#endif diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..d87b7c2193811cd6cf8138d1904c7fce01d3884a --- /dev/null +++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu @@ -0,0 +1,229 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
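+//
+// Forward layer_norm: the input is flattened to a [batch_size, feature_size]
+// matrix at begin_norm_axis. On CUDA, the specialized ln_fwd_1024_kernel is
+// used when feature_size == 1024 and both scale and bias are present
+// (16-byte vectorized loads, 4 warps per CTA); otherwise LayerNormForward is
+// dispatched on a block size chosen from feature_size. Scale and bias may be
+// kept in a different dtype than x (e.g. float32 parameters with float16
+// activations), which is what the IsScaleBiasSameDTypeWithX switch handles.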
+ +#include "paddle/phi/kernels/layer_norm_kernel.h" + +#include "paddle/fluid/operators/layer_norm_kernel.cu.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/layer_norm_util.h" + +namespace phi { + +template +void LayerNormDirectCUDAFunctor::operator()(gpuStream_t stream, + const T *input, + std::vector input_shape, + const T *bias, + const T *scale, + T *output, + T *mean, + T *variance, + int begin_norm_axis, + float eps) { + const auto x_dims = phi::make_ddim(input_shape); + auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); + int64_t batch_size = static_cast(matrix_dim[0]); + int64_t feature_size = static_cast(matrix_dim[1]); + switch (paddle::operators::GetDesiredBlockDim(feature_size)) { + FIXED_BLOCK_DIM_CASE(paddle::operators::LayerNormForward< + T, + T, + kBlockDim><<>>( + input, scale, bias, output, mean, variance, eps, feature_size)); + default: + PADDLE_THROW(phi::errors::InvalidArgument( + "Product from begin_norm_axis to end in layer_norm must be larger " + "than 1")); + break; + } +} + +template class LayerNormDirectCUDAFunctor; + +template +void LayerNormKernel(const Context &dev_ctx, + const DenseTensor &x, + paddle::optional scale_opt, + paddle::optional bias_opt, + float epsilon, + int begin_norm_axis, + bool is_test, + DenseTensor *y, + DenseTensor *mean, + DenseTensor *var) { + using U = paddle::operators::LayerNormParamType; + auto *scale = scale_opt.get_ptr(); + auto *bias = bias_opt.get_ptr(); + + const auto x_dims = x.dims(); + auto *x_data = x.data(); + auto *y_data = dev_ctx.template Alloc(y); + auto *mean_data = dev_ctx.template Alloc(mean); + auto *var_data = dev_ctx.template Alloc(var); + + auto *void_scale_data = (scale == nullptr ? nullptr : scale->data()); + auto *void_bias_data = (bias == nullptr ? nullptr : bias->data()); + + auto x_dtype = x.dtype(); + phi::DataType scale_bias_dtype; + if (void_scale_data != nullptr) { + scale_bias_dtype = scale->dtype(); + if (void_bias_data != nullptr) { + PADDLE_ENFORCE_EQ( + scale->dtype(), + bias->dtype(), + phi::errors::InvalidArgument("Thie Scale and Bias of layer_norm op " + "should have the same data type.")); + } + } else { + scale_bias_dtype = (void_bias_data != nullptr ? 
bias->dtype() : x_dtype); + } + + bool is_scale_bias_same_dtype_with_x = x_dtype == scale_bias_dtype; + if (!is_scale_bias_same_dtype_with_x) { + PADDLE_ENFORCE_EQ(scale_bias_dtype, + paddle::experimental::CppTypeToDataType::Type(), + phi::errors::InvalidArgument( + "Unsupported data type of Scale and Bias")); + } + + auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); + int64_t batch_size = static_cast(matrix_dim[0]); + int64_t feature_size = static_cast(matrix_dim[1]); + + auto stream = dev_ctx.stream(); + +#define PADDLE_LAUNCH_LAYERNORM_FWD(ScaleBiasT, IsScaleBiasSameDTypeWithX) \ + do { \ + switch (paddle::operators::GetDesiredBlockDim(feature_size)) { \ + FIXED_BLOCK_DIM_CASE(paddle::operators::LayerNormForward< \ + T, \ + U, \ + kBlockDim, \ + IsScaleBiasSameDTypeWithX><<>>( \ + x_data, \ + static_cast(void_scale_data), \ + static_cast(void_bias_data), \ + y_data, \ + mean_data, \ + var_data, \ + epsilon, \ + feature_size)); \ + default: \ + PADDLE_THROW(phi::errors::InvalidArgument( \ + "Product from begin_norm_axis to end must be larger than 1")); \ + break; \ + } \ + } while (0) + +#ifdef PADDLE_WITH_CUDA + bool can_call_1024_kernel = false; + if (feature_size == 1024 && scale != nullptr && bias != nullptr) { + can_call_1024_kernel = true; + } + if (can_call_1024_kernel) { + const int WARPS_M = 4; + const int WARPS_N = 1; + const int THREADS_PER_WARP = 32; + const int BYTES_PER_LDG = 16; + const int VecSize = BYTES_PER_LDG / sizeof(T); + + const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M; + const int ROWS_PER_CTA = WARPS_M; + + const int grid = static_cast( + std::ceil(batch_size / static_cast(ROWS_PER_CTA))); + if (is_scale_bias_same_dtype_with_x) { + paddle::operators::ln_fwd_1024_kernel< + T, + U, + T, + VecSize, + WARPS_M, + WARPS_N, + BYTES_PER_LDG><<>>( + batch_size, + feature_size, + epsilon, + x_data, + static_cast(void_scale_data), + static_cast(void_bias_data), + mean_data, + var_data, + y_data); + } else { + paddle::operators::ln_fwd_1024_kernel< + T, + U, + U, + VecSize, + WARPS_M, + WARPS_N, + BYTES_PER_LDG><<>>( + batch_size, + feature_size, + epsilon, + x_data, + static_cast(void_scale_data), + static_cast(void_bias_data), + mean_data, + var_data, + y_data); + } + } else { +#endif + if (is_scale_bias_same_dtype_with_x) { + PADDLE_LAUNCH_LAYERNORM_FWD(T, true); + } else { + PADDLE_LAUNCH_LAYERNORM_FWD(U, false); + } +#ifdef PADDLE_WITH_CUDA + } +#endif + +#undef PADDLE_LAUNCH_LAYERNORM_FWD +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +// MIOPEN do not support double +PD_REGISTER_KERNEL(layer_norm, + GPU, + ALL_LAYOUT, + phi::LayerNormKernel, + float, + phi::dtype::float16) {} +#elif CUDNN_VERSION_MIN(8, 1, 0) +PD_REGISTER_KERNEL(layer_norm, + GPU, + ALL_LAYOUT, + phi::LayerNormKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#else +PD_REGISTER_KERNEL(layer_norm, + GPU, + ALL_LAYOUT, + phi::LayerNormKernel, + float, + double, + phi::dtype::float16) {} +#endif diff --git a/paddle/phi/kernels/gpu/lgamma_grad_kernel.cu b/paddle/phi/kernels/gpu/lgamma_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..3e4cd21a658f103aca9bc611a2d42518245e4401 --- /dev/null +++ b/paddle/phi/kernels/gpu/lgamma_grad_kernel.cu @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/lgamma_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h" +PD_REGISTER_KERNEL( + lgamma_grad, GPU, ALL_LAYOUT, phi::LgammaGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/reduce_max_kernel.cc b/paddle/phi/kernels/gpu/lgamma_kernel.cu similarity index 54% rename from paddle/phi/kernels/cpu/reduce_max_kernel.cc rename to paddle/phi/kernels/gpu/lgamma_kernel.cu index f9ea0aa0faf06918253f9037282b924199e3a314..e94d67f4ce324ad9d8237a377d70a920cdbd30af 100644 --- a/paddle/phi/kernels/cpu/reduce_max_kernel.cc +++ b/paddle/phi/kernels/gpu/lgamma_kernel.cu @@ -12,28 +12,30 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/reduce_max_kernel.h" +#include "paddle/phi/kernels/lgamma_kernel.h" -#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/cpu/reduce.h" -#include "paddle/phi/kernels/funcs/reduce_functor.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" namespace phi { - +template +struct CudaLgammaFunctor { + __device__ __forceinline__ T operator()(const T x) const { + return Eigen::numext::lgamma(x); + } +}; template -void MaxRawKernel(const Context& dev_ctx, +void LgammaKernel(const Context& dev_ctx, const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); + // XKTODO( add gpu kernel implementation. ) + dev_ctx.template Alloc(out); + std::vector ins = {&x}; + std::vector outs = {out}; + auto functor = CudaLgammaFunctor(); + phi::funcs::ElementwiseKernel(dev_ctx, ins, &outs, functor); } - } // namespace phi -PD_REGISTER_KERNEL( - max_raw, CPU, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {} +PD_REGISTER_KERNEL(lgamma, GPU, ALL_LAYOUT, phi::LgammaKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..f7b282536558db524c082de11c7ca92b2bd98edc --- /dev/null +++ b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu @@ -0,0 +1,53 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/log_softmax_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" + +namespace phi { + +template +void LogSoftmaxGradKernel(const Context &dev_ctx, + const DenseTensor &out, + const DenseTensor &out_grad, + int axis, + DenseTensor *x_grad) { + dev_ctx.template Alloc(x_grad); + phi::SoftmaxBackwardCUDAKernelDriver( + dev_ctx, out, out_grad, axis, x_grad); +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(log_softmax_grad, + GPU, + ALL_LAYOUT, + phi::LogSoftmaxGradKernel, + float, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#else +PD_REGISTER_KERNEL(log_softmax_grad, + GPU, + ALL_LAYOUT, + phi::LogSoftmaxGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#endif diff --git a/paddle/phi/kernels/gpu/log_softmax_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..d7e34c6c14e7a49f50c016d888f6fb875dca0776 --- /dev/null +++ b/paddle/phi/kernels/gpu/log_softmax_kernel.cu @@ -0,0 +1,51 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
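+//
+// Both the forward and backward GPU log_softmax kernels are thin wrappers:
+// they allocate the output and delegate to the shared drivers in
+// gpudnn/softmax_gpudnn.h (SoftmaxForwardCUDAKernelDriver /
+// SoftmaxBackwardCUDAKernelDriver). The HIP registrations omit double and
+// only cover float, float16 and bfloat16.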
+ +#include "paddle/phi/kernels/log_softmax_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" + +namespace phi { + +template +void LogSoftmaxKernel(const Context &dev_ctx, + const DenseTensor &x, + int axis, + DenseTensor *out) { + dev_ctx.template Alloc(out); + phi::SoftmaxForwardCUDAKernelDriver(dev_ctx, x, axis, out); +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(log_softmax, + GPU, + ALL_LAYOUT, + phi::LogSoftmaxKernel, + float, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#else +PD_REGISTER_KERNEL(log_softmax, + GPU, + ALL_LAYOUT, + phi::LogSoftmaxKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#endif diff --git a/paddle/phi/kernels/gpu/masked_select_kernel.cu b/paddle/phi/kernels/gpu/masked_select_kernel.cu index fc4adca2f42438f464346ad83bc7e49448826bb2..b443ae6b8fb5e6c3bf5264a50d25205a419f22ad 100644 --- a/paddle/phi/kernels/gpu/masked_select_kernel.cu +++ b/paddle/phi/kernels/gpu/masked_select_kernel.cu @@ -19,34 +19,27 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/select_impl.cu.h" #include "paddle/phi/kernels/masked_select_kernel.h" namespace phi { -__global__ void SetMaskArray(const bool* mask, int32_t* mask_array, int size) { - int idx = blockDim.x * blockIdx.x + threadIdx.x; - for (; idx < size; idx += blockDim.x * gridDim.x) { - if (mask[idx]) - mask_array[idx] = 1; - else - mask_array[idx] = 0; - } -} +template +struct MaskedSelectFunctor { + HOSTDEVICE MaskedSelectFunctor() {} -template -__global__ void SelectWithPrefixMask(const int32_t* mask_prefix_sum, - const bool* mask, - const T* input, - T* out, - int size) { - int idx = blockDim.x * blockIdx.x + threadIdx.x; - for (; idx < size; idx += blockDim.x * gridDim.x) { - if (mask[idx]) { - int index = mask_prefix_sum[idx]; - out[index] = input[idx]; + HOSTDEVICE inline void operator()(OutT* out, + const MT* mask, + const InT* value, + int num) { + int store_fix = 0; + for (int idx = 0; idx < num; idx++) { + if (mask[idx]) { + out[store_fix++] = value[idx]; + } } } -} +}; template void MaskedSelectKernel(const Context& dev_ctx, @@ -68,42 +61,9 @@ void MaskedSelectKernel(const Context& dev_ctx, "value.", input_dim, mask_dim)); - - thrust::device_ptr mask_dev_ptr = - thrust::device_pointer_cast(mask_data); - thrust::device_vector mask_vec(mask_dev_ptr, mask_dev_ptr + mask_size); - auto out_size = thrust::count(mask_vec.begin(), mask_vec.end(), true); - - DDim out_dim{out_size}; - out->Resize(out_dim); - auto out_data = out->mutable_data(dev_ctx.GetPlace()); - - DenseTensor mask_array; - DenseTensor mask_prefix_sum; - mask_array.Resize(mask_dim); - mask_prefix_sum.Resize(mask_dim); - - int32_t* mask_array_data = - mask_array.mutable_data(dev_ctx.GetPlace()); - int32_t* mask_prefix_sum_data = - mask_prefix_sum.mutable_data(dev_ctx.GetPlace()); - int threads = 512; - int grid = (mask_size + threads - 1) / threads; - auto stream = dev_ctx.stream(); - SetMaskArray<<>>( - mask_data, mask_array_data, mask_size); - - thrust::device_ptr mask_array_dev_ptr = - thrust::device_pointer_cast(mask_array_data); - thrust::device_vector mask_array_vec(mask_array_dev_ptr, - mask_array_dev_ptr + mask_size); - thrust::exclusive_scan(thrust::device, - mask_array_vec.begin(), - mask_array_vec.end(), - mask_prefix_sum_data); - - SelectWithPrefixMask<<>>( - mask_prefix_sum_data, mask_data, 
input_data, out_data, mask_size); + using Functor = MaskedSelectFunctor; + phi::funcs::SelectKernel( + dev_ctx, mask, x, out, Functor()); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/matrix_rank_kernel.cu b/paddle/phi/kernels/gpu/matrix_rank_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..9b889a9b4c0069efcbf38a10ce00f20072560a36 --- /dev/null +++ b/paddle/phi/kernels/gpu/matrix_rank_kernel.cu @@ -0,0 +1,52 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef PADDLE_WITH_HIP +// HIP not support cusolver + +#include "paddle/phi/kernels/matrix_rank_kernel.h" +#include "paddle/phi/kernels/matrix_rank_tol_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/full_kernel.h" + +namespace phi { + +template +void MatrixRankKernel(const Context& dev_ctx, + const DenseTensor& x, + float tol, + bool use_default_tol, + bool hermitian, + DenseTensor* out) { + DenseTensor atol_tensor; + if (use_default_tol) { + atol_tensor = phi::Full(dev_ctx, {1}, static_cast(0)); + } else { + atol_tensor = phi::Full(dev_ctx, {1}, static_cast(tol)); + } + MatrixRankTolKernel( + dev_ctx, x, atol_tensor, use_default_tol, hermitian, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(matrix_rank, // cuda_only + GPU, + ALL_LAYOUT, + phi::MatrixRankKernel, + float, + double) {} + +#endif // not PADDLE_WITH_HIP diff --git a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..66ba30f7ce6945693a974733c77a47f0d328e50b --- /dev/null +++ b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu @@ -0,0 +1,433 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
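+//
+// matrix_rank with an explicit tolerance tensor. For each batch, the singular
+// values (GesvdjBatched, general case) or absolute eigenvalues (SyevjBatched
+// followed by AbsKernel, Hermitian case) are computed through cuSOLVER with
+// CUSOLVER_EIG_MODE_NOVECTOR, i.e. no singular/eigen-vectors are formed. The
+// effective tolerance per matrix is max(atol, rtol * largest singular value),
+// where rtol defaults to eps * max(rows, cols) when use_default_tol is set;
+// the rank is the count of values strictly greater than that tolerance,
+// obtained with a GreaterThan compare followed by a sum over the last axis.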
+ +#ifndef PADDLE_WITH_HIP +// HIP not support cusolver + +#include "paddle/phi/kernels/matrix_rank_tol_kernel.h" + +#include +#include +#include "paddle/fluid/memory/memory.h" +#include "paddle/phi/backends/dynload/cusolver.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/abs_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/funcs/compare_functors.h" +#include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h" +#include "paddle/phi/kernels/reduce_kernel.h" + +namespace phi { + +template +void GesvdjBatched(const phi::GPUContext& dev_ctx, + int batchSize, + int m, + int n, + int k, + T* A, + T* U, + T* V, + T* S, + int* info, + int thin_UV = 1); + +template +void SyevjBatched(const phi::GPUContext& dev_ctx, + int batchSize, + int n, + T* A, + T* W, + int* info); + +template <> +void GesvdjBatched(const phi::GPUContext& dev_ctx, + int batchSize, + int m, + int n, + int k, + float* A, + float* U, + float* V, + float* S, + int* info, + int thin_UV) { + // do not compute singular vectors + const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; + gesvdjInfo_t gesvdj_params = NULL; + int lda = m; + int ldu = m; + int ldt = n; + int lwork = 0; + auto handle = dev_ctx.cusolver_dn_handle(); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cusolverDnSgesvdj_bufferSize(handle, + jobz, + thin_UV, + m, + n, + A, + lda, + S, + U, + ldu, + V, + ldt, + &lwork, + gesvdj_params)); + auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(float)); + float* workspace_ptr = reinterpret_cast(workspace->ptr()); + int stride_A = lda * n; + int stride_U = ldu * (thin_UV ? k : m); + int stride_V = ldt * (thin_UV ? k : n); + for (int i = 0; i < batchSize; i++) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSgesvdj(handle, + jobz, + thin_UV, + m, + n, + A + stride_A * i, + lda, + S + k * i, + U + stride_U * i, + ldu, + V + stride_V * i, + ldt, + workspace_ptr, + lwork, + info, + gesvdj_params)); + int error_info; + paddle::memory::Copy(phi::CPUPlace(), + &error_info, + dev_ctx.GetPlace(), + info, + sizeof(int), + dev_ctx.stream()); + PADDLE_ENFORCE_EQ( + error_info, + 0, + phi::errors::PreconditionNotMet( + "For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info)); + } + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params)); +} + +template <> +void GesvdjBatched(const phi::GPUContext& dev_ctx, + int batchSize, + int m, + int n, + int k, + double* A, + double* U, + double* V, + double* S, + int* info, + int thin_UV) { + // do not compute singular vectors + const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; + gesvdjInfo_t gesvdj_params = NULL; + int lda = m; + int ldu = m; + int ldt = n; + int lwork = 0; + auto handle = dev_ctx.cusolver_dn_handle(); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cusolverDnDgesvdj_bufferSize(handle, + jobz, + thin_UV, + m, + n, + A, + lda, + S, + U, + ldu, + V, + ldt, + &lwork, + gesvdj_params)); + auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(double)); + double* workspace_ptr = reinterpret_cast(workspace->ptr()); + int stride_A = lda * n; + int stride_U = ldu * (thin_UV ? k : m); + int stride_V = ldt * (thin_UV ? 
k : n); + for (int i = 0; i < batchSize; ++i) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDgesvdj(handle, + jobz, + thin_UV, + m, + n, + A + stride_A * i, + lda, + S + k * i, + U + stride_U * i, + ldu, + V + stride_V * i, + ldt, + workspace_ptr, + lwork, + info, + gesvdj_params)); + // check the error info + int error_info; + paddle::memory::Copy(phi::CPUPlace(), + &error_info, + dev_ctx.GetPlace(), + info, + sizeof(int), + dev_ctx.stream()); + PADDLE_ENFORCE_EQ( + error_info, + 0, + phi::errors::PreconditionNotMet( + "For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info)); + } + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params)); +} + +template <> +void SyevjBatched(const phi::GPUContext& dev_ctx, + int batchSize, + int n, + float* A, + float* W, + int* info) { + auto handle = dev_ctx.cusolver_dn_handle(); + // Compute eigenvalues only + const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; + // matrix is saved as column-major in cusolver. + // numpy and torch use lower triangle to compute eigenvalues, so here use + // upper triangle + cublasFillMode_t uplo = CUBLAS_FILL_MODE_UPPER; + int lda = n; + int stride_A = lda * n; + int lwork = 0; + syevjInfo_t params = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnCreateSyevjInfo(¶ms)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSsyevj_bufferSize( + handle, jobz, uplo, n, A, lda, W, &lwork, params)); + auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(float)); + float* workspace_ptr = reinterpret_cast(workspace->ptr()); + for (int i = 0; i < batchSize; i++) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSsyevj(handle, + jobz, + uplo, + n, + A + stride_A * i, + lda, + W + n * i, + workspace_ptr, + lwork, + info, + params)); + + int error_info; + paddle::memory::Copy(phi::CPUPlace(), + &error_info, + dev_ctx.GetPlace(), + info, + sizeof(int), + dev_ctx.stream()); + PADDLE_ENFORCE_EQ( + error_info, + 0, + phi::errors::PreconditionNotMet( + "For batch [%d]: CUSolver eigenvalues is not zero. [%d]", + i, + error_info)); + } + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDestroySyevjInfo(params)); +} + +template <> +void SyevjBatched(const phi::GPUContext& dev_ctx, + int batchSize, + int n, + double* A, + double* W, + int* info) { + auto handle = dev_ctx.cusolver_dn_handle(); + // Compute eigenvalues only + const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; + // upper triangle of A is stored + cublasFillMode_t uplo = CUBLAS_FILL_MODE_UPPER; + int lda = n; + int stride_A = lda * n; + int lwork = 0; + syevjInfo_t params = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnCreateSyevjInfo(¶ms)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDsyevj_bufferSize( + handle, jobz, uplo, n, A, lda, W, &lwork, params)); + auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(double)); + double* workspace_ptr = reinterpret_cast(workspace->ptr()); + + for (int i = 0; i < batchSize; i++) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDsyevj(handle, + jobz, + uplo, + n, + A + stride_A * i, + lda, + W + n * i, + workspace_ptr, + lwork, + info, + params)); + int error_info; + paddle::memory::Copy(phi::CPUPlace(), + &error_info, + dev_ctx.GetPlace(), + info, + sizeof(int), + dev_ctx.stream()); + PADDLE_ENFORCE_EQ( + error_info, + 0, + phi::errors::PreconditionNotMet( + "For batch [%d]: CUSolver eigenvalues is not zero. 
[%d]", + i, + error_info)); + } + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDestroySyevjInfo(params)); +} + +template +void MatrixRankTolKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& atol_tensor, + bool use_default_tol, + bool hermitian, + DenseTensor* out) { + auto* x_data = x.data(); + dev_ctx.template Alloc(out); + + auto dim_x = x.dims(); + auto dim_out = out->dims(); + int rows = dim_x[dim_x.size() - 2]; + int cols = dim_x[dim_x.size() - 1]; + int k = std::min(rows, cols); + auto numel = x.numel(); + int batches = numel / (rows * cols); + + T rtol_T = 0; + if (use_default_tol) { + rtol_T = std::numeric_limits::epsilon() * std::max(rows, cols); + } + + // Must Copy X once, because the gesvdj will destory the content when exit. + DenseTensor x_tmp; + paddle::framework::TensorCopy(x, dev_ctx.GetPlace(), &x_tmp); + auto info = paddle::memory::Alloc(dev_ctx, sizeof(int) * batches); + int* info_ptr = reinterpret_cast(info->ptr()); + + DenseTensor eigenvalue_tensor; + eigenvalue_tensor.Resize(detail::GetEigenvalueDim(dim_x, k)); + auto* eigenvalue_data = dev_ctx.template Alloc(&eigenvalue_tensor); + + if (hermitian) { + SyevjBatched( + dev_ctx, batches, rows, x_tmp.data(), eigenvalue_data, info_ptr); + + phi::AbsKernel(dev_ctx, eigenvalue_tensor, &eigenvalue_tensor); + + } else { + DenseTensor U, VH; + U.Resize(detail::GetUDDim(dim_x, k)); + VH.Resize(detail::GetVHDDim(dim_x, k)); + auto* u_data = dev_ctx.template Alloc(&U); + auto* vh_data = dev_ctx.template Alloc(&VH); + GesvdjBatched(dev_ctx, + batches, + cols, + rows, + k, + x_tmp.data(), + vh_data, + u_data, + eigenvalue_data, + info_ptr, + 1); + } + + DenseTensor max_eigenvalue_tensor; + dev_ctx.template Alloc(&max_eigenvalue_tensor); + max_eigenvalue_tensor.Resize(detail::RemoveLastDim(eigenvalue_tensor.dims())); + + phi::MaxKernel(dev_ctx, + eigenvalue_tensor, + std::vector{-1}, + false, + &max_eigenvalue_tensor); + + DenseTensor temp_rtol_tensor; + temp_rtol_tensor = + phi::Full(dev_ctx, {1}, static_cast(rtol_T)); + + DenseTensor rtol_tensor = + phi::Multiply(dev_ctx, temp_rtol_tensor, max_eigenvalue_tensor); + DenseTensor tol_tensor; + tol_tensor.Resize(dim_out); + dev_ctx.template Alloc(&tol_tensor); + + funcs::ElementwiseCompute, T, T>( + dev_ctx, + atol_tensor, + rtol_tensor, + -1, + GreaterElementFunctor(), + &tol_tensor); + + tol_tensor.Resize(detail::NewAxisDim(tol_tensor.dims(), 1)); + + DenseTensor compare_result; + compare_result.Resize(detail::NewAxisDim(dim_out, k)); + dev_ctx.template Alloc(&compare_result); + + int axis = -1; + funcs::ElementwiseCompute, T, int64_t>( + dev_ctx, + eigenvalue_tensor, + tol_tensor, + axis, + funcs::GreaterThanFunctor(), + &compare_result); + + phi::SumKernel(dev_ctx, + compare_result, + std::vector{-1}, + compare_result.dtype(), + false, + out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(matrix_rank_tol, // cuda_only + GPU, + ALL_LAYOUT, + phi::MatrixRankTolKernel, + float, + double) {} + +#endif // not PADDLE_WITH_HIP diff --git a/paddle/phi/kernels/gpu/mode_grad_kernel.cu b/paddle/phi/kernels/gpu/mode_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..43502621c2d3a878a144de1878aa09b8d64b6a47 --- /dev/null +++ b/paddle/phi/kernels/gpu/mode_grad_kernel.cu @@ -0,0 +1,85 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/mode_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/mode.h" + +namespace phi { + +template +__global__ void AssignGradWithAxis(const T* grad_out, + const int64_t* indices, + T* grad_in, + int pre, + int post, + int raw_height, + int k) { + // raw_height is the length of topk axis + for (int i = blockIdx.x; i < pre; i += gridDim.x) { + int base_index = i * post * k; + int base_grad = i * post * raw_height; + for (int j = threadIdx.x; j < raw_height * post; j += blockDim.x) { + grad_in[base_grad + j] = static_cast(0); + } + __syncthreads(); + for (int j = threadIdx.x; j < k * post; j += blockDim.x) { + int64_t idx_ij = indices[base_index + j]; + int64_t in_ij = base_grad + (idx_ij * post) + (j % post); + grad_in[in_ij] = grad_out[base_index + j]; + } + } +} + +template +void ModeGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& indices, + const DenseTensor& out_grad, + int axis, + bool keepdim, + DenseTensor* x_grad) { + const auto& in_dims = x.dims(); + auto out_dims = indices.dims(); + + if (axis < 0) axis += in_dims.size(); + // allocate the cuda memory for the x_grad + T* x_grad_data = dev_ctx.template Alloc(x_grad); + const T* out_grad_data = out_grad.data(); + const int64_t* indices_data = indices.data(); + + int pre, n, post; + funcs::GetDims(in_dims, axis, &pre, &n, &post); + + // calcluate the block and grid num + int block_size = funcs::ComputeBlockSize(post); + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); + int grid_size = std::min(max_blocks, pre); + AssignGradWithAxis<<>>( + out_grad_data, indices_data, x_grad_data, pre, post, n, 1); +} + +} // namespace phi + +PD_REGISTER_KERNEL(mode_grad, + GPU, + ALL_LAYOUT, + phi::ModeGradKernel, + float, + double, + int32_t, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/mode_kernel.cu b/paddle/phi/kernels/gpu/mode_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..629b9722cd6bcfe12d0fb5a7e8be6439f5ea286f --- /dev/null +++ b/paddle/phi/kernels/gpu/mode_kernel.cu @@ -0,0 +1,119 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
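+//
+// Mode along an arbitrary axis: when axis is already the last dimension the
+// kernel calls funcs::GetModebySort directly on the [input_height, input_width]
+// view; otherwise it transposes the target axis to the end with TransCompute,
+// computes the per-row mode and its index there, and transposes both results
+// back. When keepdim is false the outputs are temporarily resized with a
+// singleton dimension at `axis` for the computation and restored to out_dims
+// at the end.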
+ +#include "paddle/phi/kernels/mode_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/mode.h" + +namespace phi { + +template +void ModeKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + bool keepdim, + DenseTensor* out, + DenseTensor* indices) { + // get the input dims + const auto& in_dims = x.dims(); + // calcluate the real axis + if (axis < 0) axis += in_dims.size(); + + auto out_dims = out->dims(); + + const T* input_data = x.data(); + T* output_data = dev_ctx.template Alloc(out); + int64_t* indices_data = dev_ctx.template Alloc(indices); + + if (axis == in_dims.size() - 1) { + const int64_t& input_height = + phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t& input_width = in_dims[in_dims.size() - 1]; + funcs::GetModebySort( + dev_ctx, &x, input_width, input_height, output_data, indices_data); + } else { + std::vector trans_axis; + for (int i = 0; i < axis; i++) { + trans_axis.emplace_back(i); + } + trans_axis.emplace_back(in_dims.size() - 1); + for (int i = axis + 1; i < in_dims.size() - 1; i++) { + trans_axis.emplace_back(i); + } + trans_axis.emplace_back(axis); + + if (!keepdim) { + std::vector tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + DDim tmp_out_dim = phi::make_ddim(tmp_out_shape); + out->Resize(tmp_out_dim); + indices->Resize(tmp_out_dim); + } + + DDim trans_shape(in_dims); + DDim trans_out_shape(in_dims); + for (int i = 0; i < trans_axis.size(); i++) { + trans_shape[i] = in_dims[trans_axis[i]]; + trans_out_shape[i] = in_dims[trans_axis[i]]; + } + trans_out_shape[in_dims.size() - 1] = 1; + + // second step, tranpose the input + DenseTensor trans_input; + trans_input.Resize(trans_shape); + dev_ctx.template Alloc(&trans_input); + + int ndims = trans_axis.size(); + funcs::TransCompute( + ndims, dev_ctx, x, &trans_input, trans_axis); + DenseTensor trans_ind; + trans_ind.Resize(trans_out_shape); + int64_t* trans_ind_data = dev_ctx.template Alloc(&trans_ind); + + DenseTensor trans_out; + trans_out.Resize(trans_out_shape); + T* trans_out_data = dev_ctx.template Alloc(&trans_out); + + const int64_t input_height = + phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1)); + const int64_t input_width = trans_shape[trans_shape.size() - 1]; + funcs::GetModebySort(dev_ctx, + &trans_input, + input_width, + input_height, + trans_out_data, + trans_ind_data); + // last step, tranpose back the indices and output + funcs::TransCompute( + ndims, dev_ctx, trans_ind, indices, trans_axis); + funcs::TransCompute(ndims, dev_ctx, trans_out, out, trans_axis); + if (!keepdim) { + out->Resize(out_dims); + indices->Resize(out_dims); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + mode, GPU, ALL_LAYOUT, phi::ModeKernel, float, double, int32_t, int64_t) {} diff --git a/paddle/phi/kernels/gpu/multiplex_grad_kernel.cu b/paddle/phi/kernels/gpu/multiplex_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..21576ab608d269340322782c8113c6054c791e74 --- /dev/null +++ b/paddle/phi/kernels/gpu/multiplex_grad_kernel.cu @@ -0,0 +1,68 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/multiplex_grad_kernel.h" + +#include "paddle/phi/api/lib/utils/tensor_utils.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +void MultiplexGradKernel(const Context& ctx, + const DenseTensor& ids, + const DenseTensor& out_grad, + std::vector ins_grad) { + size_t idx = -1UL; + for (size_t i = 0; i < ins_grad.size(); i++) { + if (ins_grad[i]) { + ctx.template Alloc(ins_grad[i]); + auto t = phi::EigenVector::Flatten(*ins_grad[i]); + t.device(*ctx.eigen_device()) = t.constant(static_cast(0)); + idx = i; + } + } + if (idx == -1UL) return; + + auto rows = ins_grad[idx]->dims()[0]; + auto cols = ins_grad[idx]->numel() / rows; + DenseTensor index_t_cpu; + paddle::framework::TensorCopySync(ids, phi::CPUPlace(), &index_t_cpu); + auto* index = index_t_cpu.data(); + auto stream = ctx.stream(); + for (auto i = 0; i < rows; i++) { + size_t k = static_cast(index[i]); + if (ins_grad[k]) { + paddle::memory::Copy(ctx.GetPlace(), + ins_grad[k]->data() + i * cols, + ctx.GetPlace(), + out_grad.data() + i * cols, + cols * sizeof(T), + stream); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(multiplex_grad, + GPU, + ALL_LAYOUT, + phi::MultiplexGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/multiplex_kernel.cu b/paddle/phi/kernels/gpu/multiplex_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..743448a46866687cf2ac68be522a306281289252 --- /dev/null +++ b/paddle/phi/kernels/gpu/multiplex_kernel.cu @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/multiplex_kernel.h" + +#include "paddle/phi/api/lib/utils/tensor_utils.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void MultiplexKernel(const Context& ctx, + const std::vector& ins, + const DenseTensor& ids, + DenseTensor* out) { + ctx.template Alloc(out); + for (size_t i = 0; i < ins.size(); ++i) { + PADDLE_ENFORCE_GT( + ins[i]->numel(), + 0, + errors::OutOfRange( + "indexing will be out of bounds with size 0 for the %d-th input.", + i)); + } + + auto rows = ins[0]->dims()[0]; + auto cols = ins[0]->numel() / rows; + DenseTensor index_t_cpu; + paddle::framework::TensorCopySync(ids, phi::CPUPlace(), &index_t_cpu); + auto* index = index_t_cpu.data(); + auto stream = ctx.stream(); + for (auto i = 0; i < rows; i++) { + int32_t k = index[i]; + PADDLE_ENFORCE_GE( + k, 0, errors::PreconditionNotMet("index must be nonnegative.")); + PADDLE_ENFORCE_LT(static_cast(k), + ins.size(), + errors::PreconditionNotMet( + "index exceeds the number of candidate tensors.")); + paddle::memory::Copy(ctx.GetPlace(), + out->data() + i * cols, + ctx.GetPlace(), + ins[k]->data() + i * cols, + cols * sizeof(T), + stream); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(multiplex, + GPU, + ALL_LAYOUT, + phi::MultiplexKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/one_hot_kernel.cu b/paddle/phi/kernels/gpu/one_hot_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..32c7fa1e85d150b99e7a05d169b01cd8727c1a98 --- /dev/null +++ b/paddle/phi/kernels/gpu/one_hot_kernel.cu @@ -0,0 +1,86 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/one_hot_kernel.h" + +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void FillOutputKernel(const InT* p_in_data, + OutT* p_out_data, + const int64_t numel, + const int depth) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < numel && p_in_data[idx] >= 0 && p_in_data[idx] < depth) { + *(p_out_data + (idx * depth) + p_in_data[idx]) = 1.0; + } +} + +template +struct OneHotV2OpCUDAFunctor { + const DenseTensor* in_; + DenseTensor* out_; + const DeviceContext& ctx_; + int depth_; + + OneHotV2OpCUDAFunctor(const DenseTensor* in, + DenseTensor* out, + int depth, + const DeviceContext& ctx) + : in_(in), out_(out), depth_(depth), ctx_(ctx) {} + + template + void apply() const { + auto* p_in_data = in_->data(); + auto numel = in_->numel(); + auto* p_out_data = ctx_.template Alloc(out_); + auto stream = ctx_.stream(); + funcs::set_constant(ctx_, out_, 0.0); + + FillOutputKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / + PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, + 0, + stream>>>(p_in_data, p_out_data, numel, depth_); + } +}; + +template +void OneHotRawKernel(const Context& dev_ctx, + const DenseTensor& x, + int32_t depth, + DataType dtype, + bool allow_out_of_range, + DenseTensor* out) { + auto out_dims = out->dims(); + if (out_dims[out_dims.size() - 1] == -1) { + out_dims[out_dims.size() - 1] = depth; + out->Resize(out_dims); + } + + phi::VisitDataType( + dtype, OneHotV2OpCUDAFunctor(&x, out, depth, dev_ctx)); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + one_hot_raw, GPU, ALL_LAYOUT, phi::OneHotRawKernel, int, int64_t) {} diff --git a/paddle/phi/kernels/gpu/pool_grad_kernel.cu b/paddle/phi/kernels/gpu/pool_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..a5ab6a1ccd49f2a88835bf1dd63c2d874db4e2a7 --- /dev/null +++ b/paddle/phi/kernels/gpu/pool_grad_kernel.cu @@ -0,0 +1,60 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/pool_grad_kernel.h" + +#include "paddle/phi/kernels/impl/pool_grad_kernel_impl.h" + +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(pool2d_grad, + GPU, + ALL_LAYOUT, + phi::Pool2dGradKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(pool2d_double_grad, + GPU, + ALL_LAYOUT, + phi::Pool2dDoubleGradKernel, + float, + double) {} +PD_REGISTER_KERNEL(max_pool2d_with_index_grad, + GPU, + ALL_LAYOUT, + phi::MaxPool2dWithIndexGradKernel, + float, + double) { + kernel->InputAt(1).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} + +PD_REGISTER_KERNEL(pool3d_grad, + GPU, + ALL_LAYOUT, + phi::Pool3dGradKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(max_pool3d_with_index_grad, + GPU, + ALL_LAYOUT, + phi::MaxPool3dWithIndexGradKernel, + float, + double) { + kernel->InputAt(1).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} diff --git a/paddle/phi/kernels/gpu/pool_kernel.cu b/paddle/phi/kernels/gpu/pool_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..e8641395bef927b7e8f7c9ba522af84c0b34680e --- /dev/null +++ b/paddle/phi/kernels/gpu/pool_kernel.cu @@ -0,0 +1,54 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/pool_kernel.h" + +#include "paddle/phi/kernels/impl/pool_kernel_impl.h" + +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(pool2d, + GPU, + ALL_LAYOUT, + phi::Pool2dKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(max_pool2d_with_index, + GPU, + ALL_LAYOUT, + phi::MaxPool2dWithIndexKernel, + float, + double) { + kernel->OutputAt(1).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} + +PD_REGISTER_KERNEL(pool3d, + GPU, + ALL_LAYOUT, + phi::Pool3dKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(max_pool3d_with_index, + GPU, + ALL_LAYOUT, + phi::MaxPool3dWithIndexKernel, + float, + double) { + kernel->OutputAt(1).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} diff --git a/paddle/phi/kernels/gpu/prelu_funcs.h b/paddle/phi/kernels/gpu/prelu_funcs.h new file mode 100644 index 0000000000000000000000000000000000000000..76ee9439a2050b000b5cffd1df47581141a874c7 --- /dev/null +++ b/paddle/phi/kernels/gpu/prelu_funcs.h @@ -0,0 +1,183 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +#define CUDA_NUM_THREADS 1024 + +inline static int PADDLE_GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +template +__global__ void PReluChannelFirstWiseKernel(const T *input, + const T *alpha, + T *output, + size_t channel_num, + size_t plane_size, + size_t numel) { + CUDA_KERNEL_LOOP(index, numel) { + size_t temp = index / plane_size; + size_t channel_index = temp % channel_num; + T scale = alpha[channel_index]; + T x = input[index]; + T zero = static_cast(0); + output[index] = (x > zero) ? x : scale * x; + } +} + +template +__global__ void PReluChannelLastWiseKernel(const T *input, + const T *alpha, + T *output, + size_t channel_num, + size_t numel) { + CUDA_KERNEL_LOOP(index, numel) { + size_t channel_index = index % channel_num; + T scale = alpha[channel_index]; + T x = input[index]; + T zero = static_cast(0); + output[index] = (x > zero) ? x : scale * x; + } +} + +template +__global__ void PReluElementWiseKernel(const T *input, + const T *alpha, + T *output, + size_t spatial_size, + size_t numel) { + CUDA_KERNEL_LOOP(index, numel) { + size_t element_index = index % spatial_size; + T scale = alpha[element_index]; + T x = input[index]; + T zero = static_cast(0); + output[index] = (x > zero) ? x : scale * x; + } +} + +template +__global__ void PReluScalarKernel(const T *input, + const T *alpha, + T *output, + size_t numel) { + T scale = alpha[0]; + CUDA_KERNEL_LOOP(index, numel) { + T x = input[index]; + T zero = static_cast(0); + output[index] = (x > zero) ? x : scale * x; + } +} + +template +class PreluChannelWiseDirectCUDAFunctor { + public: + void operator()(gpuStream_t stream, + const T *input, + const T *alpha, + T *output, + size_t batch_size, + size_t channel, + bool channel_last, + size_t numel); +}; + +template +class PreluElementWiseDirectCUDAFunctor { + public: + void operator()(gpuStream_t stream, + const T *input, + const T *alpha, + T *output, + size_t batch_size, + size_t numel); +}; + +template +class PreluScalarDirectCUDAFunctor { + public: + void operator()(gpuStream_t stream, + const T *input, + const T *alpha, + T *output, + size_t numel); +}; + +template +void PreluChannelWiseDirectCUDAFunctor::operator()(gpuStream_t stream, + const T *input, + const T *alpha, + T *output, + size_t batch_size, + size_t channel, + bool channel_last, + size_t numel) { + if (channel_last) { + PReluChannelLastWiseKernel<<>>( + input, alpha, output, channel, numel); + } else { + PReluChannelFirstWiseKernel<<>>( + input, alpha, output, channel, numel / batch_size / channel, numel); + } +} + +template +void PreluElementWiseDirectCUDAFunctor::operator()(gpuStream_t stream, + const T *input, + const T *alpha, + T *output, + size_t batch_size, + size_t numel) { + PReluElementWiseKernel<<>>( + input, alpha, output, numel / batch_size, numel); +} + +template +void PreluScalarDirectCUDAFunctor::operator()(gpuStream_t stream, + const T *input, + const T *alpha, + T *output, + size_t numel) { + PReluScalarKernel<<>>( + input, alpha, output, numel); +} + +template class PreluChannelWiseDirectCUDAFunctor; +template class PreluChannelWiseDirectCUDAFunctor; +template class PreluChannelWiseDirectCUDAFunctor; + +template class PreluElementWiseDirectCUDAFunctor; +template class PreluElementWiseDirectCUDAFunctor; +template class 
PreluElementWiseDirectCUDAFunctor; + +template class PreluScalarDirectCUDAFunctor; +template class PreluScalarDirectCUDAFunctor; +template class PreluScalarDirectCUDAFunctor; + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/prelu_grad_kernel.cu b/paddle/phi/kernels/gpu/prelu_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..d8661268e82c35f48d9877120574628c4325ae4e --- /dev/null +++ b/paddle/phi/kernels/gpu/prelu_grad_kernel.cu @@ -0,0 +1,183 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/prelu_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" +#include "paddle/phi/kernels/gpu/prelu_funcs.h" +#include "paddle/phi/kernels/primitive/functor_primitives.h" + +namespace phi { + +enum PRELU_MODE { Element, ChannelFirst, ChannelLast, PRELU_Scalar }; + +template +__global__ void PReluOpGradKernel(const T* x_ptr, + const T* alpha_ptr, + const T* out_grad_ptr, + T* x_grad_ptr, + T* alpha_grad_ptr, + size_t channel_num, + size_t plane_size, + size_t spatial_size, + size_t numel, + PRELU_MODE mode) { + CUDA_KERNEL_LOOP(index, numel) { + T scale; + if (mode == Element) { + size_t element_index = index % spatial_size; + scale = alpha_ptr[element_index]; + } else if (mode == ChannelFirst) { + size_t temp = index / plane_size; + size_t channel_index = temp % channel_num; + scale = alpha_ptr[channel_index]; + } else if (mode == ChannelLast) { + size_t channel_index = index % channel_num; + scale = alpha_ptr[channel_index]; + } else { + scale = alpha_ptr[0]; + } + T x = x_ptr[index]; + T out_grad = out_grad_ptr[index]; + T zero = static_cast(0); + if (x_grad_ptr != nullptr) + x_grad_ptr[index] = (x > zero) ? out_grad : scale * out_grad; + if (alpha_grad_ptr != nullptr) + alpha_grad_ptr[index] = (x > zero) ? zero : x * out_grad; + } +} + +template +class PreluOpGradFunctor { + public: + void operator()(gpuStream_t stream, + const T* x, + const T* alpha, + const T* out_grad, + T* x_grad, + T* alpha_grad, + const DDim& input_dims, + PRELU_MODE mode) { + size_t numel = 1; + for (size_t i = 0; i < input_dims.size(); ++i) { + numel *= input_dims[i]; + } + size_t plane_size = numel / input_dims[0] / input_dims[1]; + size_t spatial_size = numel / input_dims[0]; + size_t channel = + mode == ChannelLast ? 
input_dims[input_dims.size() - 1] : input_dims[1]; + + PReluOpGradKernel< + T><<>>( + x, + alpha, + out_grad, + x_grad, + alpha_grad, + channel, + plane_size, + spatial_size, + numel, + mode); + } +}; + +template +void PReluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& alpha, + const DenseTensor& out_grad, + const std::string& mode, + const std::string& data_format, + DenseTensor* x_grad, + DenseTensor* alpha_grad) { + dev_ctx.template Alloc(x_grad); + + const T* x_ptr = x.data(); + const T* alpha_ptr = alpha.data(); + const T* out_grad_ptr = out_grad.data(); + T* x_grad_ptr = x_grad ? dev_ctx.template Alloc(x_grad) : nullptr; + T* alpha_grad_ptr = + alpha_grad ? dev_ctx.template Alloc(alpha_grad) : nullptr; + + if (!x_grad && !alpha_grad) return; + + int numel = x.numel(); + auto dim = x.dims(); + auto x_rank = dim.size(); + std::vector input_shape = phi::vectorize(dim); + auto stream = dev_ctx.stream(); + + T* alpha_grad_tmp_ptr; + DenseTensor alpha_grad_tmp; + if (alpha_grad_ptr == nullptr) { + alpha_grad_tmp_ptr = alpha_grad_ptr; + } else { + DenseTensorMeta alpha_grad_meta( + alpha_grad->dtype(), dim, alpha_grad->layout()); + alpha_grad_tmp = phi::Empty(dev_ctx, std::move(alpha_grad_meta)); + alpha_grad_tmp_ptr = alpha_grad_tmp.data(); + } + + PRELU_MODE m; + bool channel_last = false; + if (mode == "element") { + m = Element; + } else if (mode == "channel") { + channel_last = data_format == "NHWC"; + m = channel_last ? ChannelLast : ChannelFirst; + } else { + m = PRELU_Scalar; + } + PreluOpGradFunctor prelu_grad; + prelu_grad(stream, + x_ptr, + alpha_ptr, + out_grad_ptr, + x_grad_ptr, + alpha_grad_tmp_ptr, + dim, + m); + + if (alpha_grad_tmp_ptr == nullptr) return; + + std::vector reduce_dims; + for (size_t i = 0; i < dim.size(); i++) { + if (mode == "channel" && !channel_last && i == 1) continue; + if (mode == "channel" && channel_last && i == dim.size() - 1) continue; + if (mode == "element" && i != 0) continue; + reduce_dims.push_back(i); + } + + phi::funcs::ReduceKernel>( + static_cast(dev_ctx), + alpha_grad_tmp, + alpha_grad, + kps::IdentityFunctor(), + reduce_dims); +} + +} // namespace phi + +PD_REGISTER_KERNEL(prelu_grad, + GPU, + ALL_LAYOUT, + phi::PReluGradKernel, + float, + phi::dtype::float16, + double) {} diff --git a/paddle/phi/kernels/gpu/prelu_kernel.cu b/paddle/phi/kernels/gpu/prelu_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..8255a7ba2ed96dcdeb8d6e23a4637ce56d636a12 --- /dev/null +++ b/paddle/phi/kernels/gpu/prelu_kernel.cu @@ -0,0 +1,71 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/prelu_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/prelu_funcs.h" + +namespace phi { + +template +void PReluKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& alpha, + const std::string& mode, + const std::string& data_format, + DenseTensor* out) { + const T* x_ptr = x.data(); + T* o_ptr = dev_ctx.template Alloc(out); + + const T* alpha_ptr = alpha.data(); + int numel = x.numel(); + auto dim = x.dims(); + auto x_rank = dim.size(); + + VLOG(4) << "dim[0]:" << dim[0] << ", dim[1]:" << dim[1] << ", dim[" + << x_rank - 1 << "]:" << dim[x_rank - 1] << ", numel:" << numel; + + if (mode == "channel") { + bool channel_last = data_format == "NHWC"; + size_t channel = channel_last ? dim[x_rank - 1] : dim[1]; + PreluChannelWiseDirectCUDAFunctor prelu_channel_wise; + prelu_channel_wise(dev_ctx.stream(), + x_ptr, + alpha_ptr, + o_ptr, + dim[0], + channel, + channel_last, + numel); + } else if (mode == "element") { + PreluElementWiseDirectCUDAFunctor prelu_element_wise; + prelu_element_wise( + dev_ctx.stream(), x_ptr, alpha_ptr, o_ptr, dim[0], numel); + } else { + PreluScalarDirectCUDAFunctor prelu_scalar; + prelu_scalar(dev_ctx.stream(), x_ptr, alpha_ptr, o_ptr, numel); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(prelu, + GPU, + ALL_LAYOUT, + phi::PReluKernel, + float, + phi::dtype::float16, + double) {} diff --git a/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..6745653eba7d175447eb54c319919fd6d87fb5dd --- /dev/null +++ b/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu @@ -0,0 +1,193 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/psroi_pool_kernel.h" + +#include +#include +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaximumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaximumNumBlocks); +} + +template +__global__ void GPUPSROIPoolBackward(const int nthreads, + const T* input_rois, + const T* dout_data, + const float spatial_scale, + const int input_channels, + const int height, + const int width, + const int output_channels, + const int pooled_height, + const int pooled_width, + const int* rois_batch_id_data, + T* dx_data) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + // The output is in order (n, c, ph, pw) + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % output_channels; + int n = i / pooled_width / pooled_height / output_channels; + + // set roi_batch_id + int roi_batch_id = rois_batch_id_data[n]; + int input_channel = (c * pooled_height + ph) * pooled_width + pw; + int input_offset = + (roi_batch_id * input_channels + input_channel) * height * width; + T* offset_dx_data = dx_data + input_offset; + + // [start, end) interval for spatial sampling + const T* offset_input_rois = input_rois + n * 4; + T roi_start_w = static_cast(round(offset_input_rois[0])) * spatial_scale; + T roi_start_h = static_cast(round(offset_input_rois[1])) * spatial_scale; + T roi_end_w = + static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; + T roi_end_h = + static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; + + // Force too small ROIs to be 1x1 + T roi_height = max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 + T roi_width = max(roi_end_w - roi_start_w, (T)0.1); + + // Compute w and h at input feature map + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); + int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); + int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); + int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); + + // Add roi offsets and clip to input boundaries + hstart = min(max(hstart, 0), height); + hend = min(max(hend, 0), height); + wstart = min(max(wstart, 0), width); + wend = min(max(wend, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + // Accumulate diff_val into input data + T bin_area = static_cast((hend - hstart) * (wend - wstart)); + T diff_val = is_empty ? 0. 
: dout_data[i] / bin_area; + for (int ih = hstart; ih < hend; ++ih) { + for (int iw = wstart; iw < wend; ++iw) { + int input_index = ih * width + iw; + paddle::platform::CudaAtomicAdd(offset_dx_data + input_index, diff_val); + } + } + } +} + +template +void PsroiPoolGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& rois, + paddle::optional rois_num, + const DenseTensor& dout, + int pooled_height, + int pooled_width, + int output_channels, + float spatial_scale, + DenseTensor* dx) { + int rois_num_t = rois.dims()[0]; + int input_channels = x.dims()[1]; + int height = x.dims()[2]; + int width = x.dims()[3]; + + if (dx) { + // set roi batch id + DenseTensor rois_batch_id_list; + rois_batch_id_list.Resize({rois_num_t}); + int* rois_batch_id_data = ctx.template HostAlloc(&rois_batch_id_list); + int rois_batch_size; + if (rois_num.get_ptr()) { + rois_batch_size = rois_num->numel(); + std::vector rois_num_list(rois_batch_size); + paddle::memory::Copy(CPUPlace(), + rois_num_list.data(), + ctx.GetPlace(), + rois_num->data(), + sizeof(int) * rois_batch_size, + 0); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_list[n]; ++i) { + rois_batch_id_data[i] = n; + } + start += rois_num_list[n]; + } + } else { + auto rois_lod = rois.lod().back(); + rois_batch_size = rois_lod.size() - 1; + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } + } + } + + DenseTensor rois_batch_id_list_gpu; + Copy(ctx, + rois_batch_id_list, + ctx.GetPlace(), + false, + &rois_batch_id_list_gpu); + + ctx.template Alloc(dx); + funcs::SetConstant set_zero; + set_zero(ctx, dx, static_cast(0)); + + int dout_size = dout.numel(); + int blocks = NumBlocks(dout_size); + int threads = kNumCUDAThreads; + + if (dout_size > 0) { + GPUPSROIPoolBackward<<>>( + dout_size, + rois.data(), + dout.data(), + spatial_scale, + input_channels, + height, + width, + output_channels, + pooled_height, + pooled_width, + rois_batch_id_list_gpu.data(), + ctx.template Alloc(dx)); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + psroi_pool_grad, GPU, ALL_LAYOUT, phi::PsroiPoolGradKernel, float, double) { + kernel->InputAt(2).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} diff --git a/paddle/phi/kernels/gpu/psroi_pool_kernel.cu b/paddle/phi/kernels/gpu/psroi_pool_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..8f9be001ba763d323ad93fdfd4cc06e97e266188 --- /dev/null +++ b/paddle/phi/kernels/gpu/psroi_pool_kernel.cu @@ -0,0 +1,231 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/psroi_pool_kernel.h" + +#include +#include +#include "paddle/fluid/memory/memory.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" + +namespace phi { + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaximumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaximumNumBlocks); +} + +template +__global__ void GPUPSROIPoolForward(const int nthreads, + const T* input_data, + const T* input_rois, + const float spatial_scale, + const int input_channels, + const int height, + const int width, + const int output_channels, + const int pooled_height, + const int pooled_width, + const int* rois_batch_id_data, + T* output_data) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (size_t i = index; i < nthreads; i += offset) { + // The output is in order (n, c, ph, pw) + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % output_channels; + int n = i / pooled_width / pooled_height / output_channels; + + // set roi_batch_id + int roi_batch_id = rois_batch_id_data[n]; + + // [start, end) interval for spatial sampling + const T* offset_input_rois = input_rois + n * 4; + T roi_start_w = static_cast(round(offset_input_rois[0])) * spatial_scale; + T roi_start_h = static_cast(round(offset_input_rois[1])) * spatial_scale; + T roi_end_w = + static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; + T roi_end_h = + static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; + + // Force too small ROIs to be 1x1 + T roi_height = max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 + T roi_width = max(roi_end_w - roi_start_w, (T)0.1); + + // Compute w and h at input feature map + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); + int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); + int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); + int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); + + // Add roi offsets and clip to input boundaries + hstart = min(max(hstart, 0), height); + hend = min(max(hend, 0), height); + wstart = min(max(wstart, 0), width); + wend = min(max(wend, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + int input_channel = (c * pooled_height + ph) * pooled_width + pw; + const T* offset_input_data = + input_data + + (roi_batch_id * input_channels + input_channel) * height * width; + T outsum = 0; + + for (int ih = hstart; ih < hend; ++ih) { + for (int iw = wstart; iw < wend; ++iw) { + int input_index = ih * width + iw; + outsum += offset_input_data[input_index]; + } + } + + T bin_area = static_cast((hend - hstart) * (wend - wstart)); + output_data[i] = is_empty ? 0. 
: outsum / bin_area; + } +} + +template +void PsroiPoolKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& rois, + paddle::optional rois_num, + int pooled_height, + int pooled_width, + int output_channels, + float spatial_scale, + DenseTensor* out) { + auto in_dims = x.dims(); + int batch_size = in_dims[0]; + int input_channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + + PADDLE_ENFORCE_EQ( + input_channels, + output_channels * pooled_height * pooled_width, + errors::InvalidArgument( + "The channels %d of input X should equal the product of " + "output_channels %d x pooled_height %d x pooled_width %d.", + input_channels, + output_channels, + pooled_height, + pooled_width)); + + int rois_num_t = rois.dims()[0]; + if (rois_num_t == 0) return; + int rois_batch_size; + DenseTensor rois_batch_id_list; + rois_batch_id_list.Resize({rois_num_t}); + int* rois_batch_id_data = ctx.template HostAlloc(&rois_batch_id_list); + + if (rois_num.get_ptr()) { + rois_batch_size = rois_num->numel(); + auto* rois_num_data = rois_num->data(); + PADDLE_ENFORCE_EQ(rois_batch_size, + batch_size, + errors::InvalidArgument( + "The batch size of input(ROIs) and input(X) must be " + "the same but received batch size of input(ROIs) and " + "input(X) is %d and %d respectively.", + rois_batch_size, + batch_size)); + std::vector rois_num_list(rois_batch_size); + paddle::memory::Copy(CPUPlace(), + rois_num_list.data(), + ctx.GetPlace(), + rois_num_data, + sizeof(int) * rois_batch_size, + 0); + int rois_num_count = 0; + for (int i = 0; i < rois_batch_size; ++i) { + rois_num_count += rois_num_list[i]; + } + PADDLE_ENFORCE_EQ( + rois_num_count, + rois_num_t, + errors::InvalidArgument( + "the rois_num from input and RoisNum must be the same")); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_list[n]; ++i) { + rois_batch_id_data[i] = n; + } + start += rois_num_list[n]; + } + } else { + auto rois_lod = rois.lod().back(); + rois_batch_size = rois_lod.size() - 1; + PADDLE_ENFORCE_EQ(rois_batch_size, + batch_size, + errors::InvalidArgument( + "The batch size of input(ROIs) and input(X) must be " + "the same but received batch size of input(ROIs) and " + "input(X) is %d and %d respectively.", + rois_batch_size, + batch_size)); + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_ENFORCE_EQ(rois_num_t, + rois_num_with_lod, + errors::InvalidArgument( + "The number of rois from input(ROIs) and its LOD " + "must be the same. 
Received rois %d of input(ROIs) " + "but the number of rois %d from its LOD is %d", + rois_num, + rois_num_with_lod)); + + // set rois batch id + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } + } + } + DenseTensor rois_batch_id_list_gpu; + Copy(ctx, rois_batch_id_list, ctx.GetPlace(), false, &rois_batch_id_list_gpu); + + int output_size = out->numel(); + int blocks = NumBlocks(output_size); + int threads = kNumCUDAThreads; + + // call cuda kernel function + GPUPSROIPoolForward<<>>( + output_size, + x.data(), + rois.data(), + spatial_scale, + input_channels, + height, + width, + output_channels, + pooled_height, + pooled_width, + rois_batch_id_list_gpu.data(), + ctx.template Alloc(out)); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + psroi_pool, GPU, ALL_LAYOUT, phi::PsroiPoolKernel, float, double) { + kernel->InputAt(2).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} diff --git a/paddle/phi/kernels/gpu/randperm_kernel.cu b/paddle/phi/kernels/gpu/randperm_kernel.cu index d4d90cac917a2c35e26eca0d57d1c5349b878599..92948bf47c9345e53d1fea54d2be4fd8efbc6f96 100644 --- a/paddle/phi/kernels/gpu/randperm_kernel.cu +++ b/paddle/phi/kernels/gpu/randperm_kernel.cu @@ -14,37 +14,161 @@ #include "paddle/phi/kernels/randperm_kernel.h" +#ifdef __NVCC__ +#include +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +#include +namespace cub = hipcub; +#endif +#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/randint_kernel.h" // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/memory/memcpy.h" +DECLARE_bool(use_curand); + namespace phi { +template +__global__ void SwapRepeatKernel( + int* key, T* data, int n, uint64_t seed, uint64_t offset) { + size_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (idx < n) return; + + bool first_repeat = false; + if (data[idx] == data[idx + 1]) { + if (idx == 0) { + first_repeat = true; + } else if (data[idx] != data[idx - 1]) { + first_repeat = true; + } + } + + if (!first_repeat) return; + + int repeat_size = 1; + for (int i = idx; i < n; ++i) { + if (data[i] == data[i + 1]) { + ++repeat_size; + } else { + break; + } + } + +#ifdef __NVCC__ + curandStatePhilox4_32_10_t state; + curand_init(seed, idx, offset, &state); + for (int i = repeat_size - 1; i > 0; i--) { + uint32_t r = curand(&state) % (i + 1); +#elif __HIPCC__ + hiprandStatePhilox4_32_10_t state; + hiprand_init(seed, idx, offset, &state); + for (int i = repeat_size - 1; i > 0; i--) { + uint32_t r = hiprand(&state) % (i + 1); +#endif + if (r != i) { + T tmp = data[idx + i]; + data[idx + i] = data[idx + r]; + data[idx + r] = tmp; + } + } +} + template void RandpermRawKernel( const Context& dev_ctx, int n, DataType dtype, int seed, DenseTensor* out) { - DenseTensor tmp; - tmp.Resize(phi::make_ddim({n})); - T* tmp_data = dev_ctx.template HostAlloc(&tmp); - - std::shared_ptr engine; - if (seed) { - engine = std::make_shared(); - engine->seed(seed); + if (FLAGS_use_curand) { + DenseTensor key; + RandintKernel(dev_ctx, + std::numeric_limits::min(), + std::numeric_limits::max(), + ScalarArray({n}), + phi::DataType::INT32, + &key); + DenseTensor key_out = Empty(dev_ctx, ScalarArray({n})); + + DenseTensor range = Empty(dev_ctx, ScalarArray({n})); + T* range_data = range.data(); + 
funcs::ForRange for_range(dev_ctx, n); + for_range([range_data] __device__(size_t idx) { + range_data[idx] = static_cast(idx); + }); + + out->Resize(phi::make_ddim({n})); + T* out_data = dev_ctx.template Alloc(out); + + // Refer to [Algorithm of randperm] https://osf.io/af2hy/ to + // improve performance of radix sort. + double n_d = static_cast(n); + int begin_bit = 0; + int end_bit = + std::ceil(std::log2(n_d - (6 * n_d * n_d + 1) / (12 * std::log(0.9)))); + + size_t temp_storage_bytes = 0; + cub::DeviceRadixSort::SortPairs(nullptr, + temp_storage_bytes, + key.data(), + key_out.data(), + range.data(), + out_data, + n, + begin_bit, + end_bit < 32 ? end_bit : 32, + dev_ctx.stream()); + + auto d_temp_storage = paddle::memory::Alloc(dev_ctx, temp_storage_bytes); + cub::DeviceRadixSort::SortPairs(d_temp_storage->ptr(), + temp_storage_bytes, + key.data(), + key_out.data(), + range.data(), + out_data, + n, + begin_bit, + end_bit < 32 ? end_bit : 32, + dev_ctx.stream()); + + auto gen_cuda = dev_ctx.GetGenerator(); + auto seed_offset = gen_cuda->IncrementOffset(n); + uint64_t seed = seed_offset.first; + uint64_t offset = seed_offset.second; + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n); + SwapRepeatKernel<<>>( + key_out.data(), out_data, n, seed, offset); } else { - engine = dev_ctx.GetHostGenerator()->GetCPUEngine(); - } + DenseTensor tmp; + tmp.Resize(phi::make_ddim({n})); + T* tmp_data = dev_ctx.template HostAlloc(&tmp); - for (int i = 0; i < n; ++i) { - tmp_data[i] = static_cast(i); - } - std::shuffle(tmp_data, tmp_data + n, *engine); + std::shared_ptr engine; + if (seed) { + engine = std::make_shared(); + engine->seed(seed); + } else { + engine = dev_ctx.GetHostGenerator()->GetCPUEngine(); + } - T* out_data = dev_ctx.template Alloc(out); - auto size = out->numel() * paddle::experimental::SizeOf(out->dtype()); - paddle::memory::Copy( - out->place(), out_data, tmp.place(), tmp_data, size, 0); + for (int i = 0; i < n; ++i) { + tmp_data[i] = static_cast(i); + } + std::shuffle(tmp_data, tmp_data + n, *engine); + + T* out_data = dev_ctx.template Alloc(out); + auto size = out->numel() * paddle::experimental::SizeOf(out->dtype()); + paddle::memory::Copy( + out->place(), out_data, tmp.place(), tmp_data, size, 0); + } } template diff --git a/paddle/phi/kernels/gpu/reduce_grad.h b/paddle/phi/kernels/gpu/reduce_grad.h index d21c8a3fa46f81c046c722db50ac62fb57cf64f4..e32101b73728f637da0626d691018842aedd62e7 100644 --- a/paddle/phi/kernels/gpu/reduce_grad.h +++ b/paddle/phi/kernels/gpu/reduce_grad.h @@ -43,5 +43,59 @@ void ReduceGrad(const GPUContext& dev_ctx, })); } +template class TransformOp> +void ReduceGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad) { + auto* in_x = &x; + auto* d_out = &out_grad; + auto* d_x = x_grad; + + auto pt_out_dtype = in_dtype; + + // get reduce_dim and reduce_num for reduce_mean_grad + int dim_size = in_x->dims().size(); + std::vector reduce_dims = + funcs::details::GetReduceDim(dims, dim_size, reduce_all); + + auto update_dims = vectorize(d_x->dims()); + int reduce_num = 1; + for (auto i : reduce_dims) { + reduce_num *= (in_x->dims())[i]; + update_dims[i] = 1; + } + // make new tensor + DenseTensor new_d_out(d_out->dtype()); + new_d_out.ShareDataWith(*d_out); + new_d_out.Resize(phi::make_ddim(update_dims)); + if (in_dtype != DataType::UNDEFINED) { + dev_ctx.Alloc(d_x, in_dtype); + } 
else { + dev_ctx.Alloc(d_x, d_out->dtype()); + } + + auto pt_d_out = new_d_out; + auto pt_d_x = *d_x; + if (in_dtype == DataType::UNDEFINED) { + pt_out_dtype = d_out->dtype(); + } + using MPType = typename kps::details::MPTypeTrait::Type; + + phi::ReduceGrad>( + dev_ctx, + &pt_d_out, + &pt_d_x, + pt_out_dtype, + TransformOp(reduce_num)); +} + } // namespace phi #endif diff --git a/paddle/phi/kernels/gpu/reduce_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..5256048267ea19a4cb12387ebbc582a2df1bd1b1 --- /dev/null +++ b/paddle/phi/kernels/gpu/reduce_grad_kernel.cu @@ -0,0 +1,119 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" +#include "paddle/phi/kernels/gpu/reduce_grad.h" +#include "paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h" +#include "paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h" +#include "paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h" + +namespace phi { + +template +void ReduceSumGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad) { + ReduceGradKernel(dev_ctx, + x, + out_grad, + dims, + keep_dim, + reduce_all, + in_dtype, + out_dtype, + x_grad); +} + +template +void ReduceMeanGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad) { + ReduceGradKernel(dev_ctx, + x, + out_grad, + dims, + keep_dim, + reduce_all, + in_dtype, + out_dtype, + x_grad); +} + +} // namespace phi + +PD_REGISTER_KERNEL(sum_grad, + GPU, + ALL_LAYOUT, + phi::ReduceSumGradKernel, + bool, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(mean_grad, + GPU, + ALL_LAYOUT, + phi::ReduceMeanGradKernel, + bool, + float, + double, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(prod_grad, + GPU, + ALL_LAYOUT, + phi::ReduceProdGradKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL(max_grad, + GPU, + ALL_LAYOUT, + phi::ReduceMaxGradKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL(min_grad, + GPU, + ALL_LAYOUT, + phi::ReduceMinGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/reduce_kernel.cu b/paddle/phi/kernels/gpu/reduce_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..6cbe699e8e05831b049536b06b1fdadcc145537d --- /dev/null +++ b/paddle/phi/kernels/gpu/reduce_kernel.cu @@ -0,0 +1,158 @@ +// Copyright (c) 
2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/reduce.h" + +namespace phi { + +template +void MeanRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void SumRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType out_dtype, + DenseTensor* out) { + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void ProdRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void MaxRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void MinRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void AllRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void AnyRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace phi + +using float16 = phi::dtype::float16; +using bfloat16 = phi::dtype::bfloat16; +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +PD_REGISTER_KERNEL(sum_raw, + GPU, + ALL_LAYOUT, + phi::SumRawKernel, + bool, + float, + double, + float16, + bfloat16, + int16_t, + int, + int64_t, + complex64, + complex128) { + kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); +} + +PD_REGISTER_KERNEL(mean_raw, + GPU, + ALL_LAYOUT, + phi::MeanRawKernel, + float, + double, + bool, + float16, + int, + int64_t) {} + +PD_REGISTER_KERNEL(prod_raw, + GPU, + ALL_LAYOUT, + phi::ProdRawKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL( + max_raw, GPU, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {} + +PD_REGISTER_KERNEL( + min_raw, GPU, ALL_LAYOUT, phi::MinRawKernel, float, double, int, 
int64_t) {} + +PD_REGISTER_KERNEL(all_raw, GPU, ALL_LAYOUT, phi::AllRawKernel, bool) {} + +PD_REGISTER_KERNEL(any_raw, GPU, ALL_LAYOUT, phi::AnyRawKernel, bool) {} diff --git a/paddle/phi/kernels/gpu/reduce_max_kernel.cu b/paddle/phi/kernels/gpu/reduce_max_kernel.cu index 98c3986c51dd6829287f5316ae9eb52f328372ab..ddbc08b06c84b0afe42091ddf9a53a928621ef6d 100644 --- a/paddle/phi/kernels/gpu/reduce_max_kernel.cu +++ b/paddle/phi/kernels/gpu/reduce_max_kernel.cu @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/reduce_max_kernel.h" +#include "paddle/phi/kernels/reduce_kernel.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/gpu/reduce.h" diff --git a/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu deleted file mode 100644 index 9f4ddc3cf37a744355f6f79b7cd18b3d06b80062..0000000000000000000000000000000000000000 --- a/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/reduce_sum_grad_kernel.h" - -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/funcs/reduce_function.h" -#include "paddle/phi/kernels/gpu/reduce_grad.h" - -namespace phi { - -template -void ReduceSumGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& out_grad, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType in_dtype, - DataType out_dtype, - DenseTensor* x_grad) { - auto* in_x = &x; - auto* d_out = &out_grad; - auto* d_x = x_grad; - - auto pt_out_dtype = in_dtype; - - // get reduce_dim and reduce_num for reduce_mean_grad - int dim_size = in_x->dims().size(); - std::vector reduce_dims = - funcs::details::GetReduceDim(dims, dim_size, reduce_all); - - auto update_dims = vectorize(d_x->dims()); - int reduce_num = 1; - for (auto i : reduce_dims) { - reduce_num *= (in_x->dims())[i]; - update_dims[i] = 1; - } - // make new tensor - DenseTensor new_d_out(d_out->dtype()); - new_d_out.ShareDataWith(*d_out); - new_d_out.Resize(phi::make_ddim(update_dims)); - if (in_dtype != DataType::UNDEFINED) { - dev_ctx.Alloc(d_x, in_dtype); - } else { - dev_ctx.Alloc(d_x, d_out->dtype()); - } - - auto pt_d_out = new_d_out; - auto pt_d_x = *d_x; - if (in_dtype == DataType::UNDEFINED) { - pt_out_dtype = d_out->dtype(); - } - using MPType = typename kps::details::MPTypeTrait::Type; - - phi::ReduceGrad>( - dev_ctx, - &pt_d_out, - &pt_d_x, - pt_out_dtype, - kps::IdentityFunctor(reduce_num)); -} - -} // namespace phi - -PD_REGISTER_KERNEL(sum_grad, - GPU, - ALL_LAYOUT, - phi::ReduceSumGradKernel, - bool, - float, - double, - phi::dtype::float16, - phi::dtype::bfloat16, - int, - int64_t, - phi::dtype::complex, - phi::dtype::complex) {} diff --git 
a/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu b/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..cf076128b69396196f59a8accd0c282322f8f49a --- /dev/null +++ b/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu @@ -0,0 +1,260 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roi_align_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" + +namespace phi { + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; +static constexpr int kROISize = 4; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +template +__device__ void BilinearInterpolateGradient(const int height, + const int width, + T y, + T x, + T* w1, + T* w2, + T* w3, + T* w4, + int* x_low, + int* x_high, + int* y_low, + int* y_high) { + if (y < -1.0 || y > height || x < -1.0 || x > width) { + return; + } + + y = y <= 0 ? 0 : y; + x = x <= 0 ? 0 : x; + *y_low = static_cast(y); + *x_low = static_cast(x); + if (*y_low >= height - 1) { + *y_high = *y_low = height - 1; + y = static_cast(*y_low); + } else { + *y_high = *y_low + 1; + } + if (*x_low >= width - 1) { + *x_high = *x_low = width - 1; + x = static_cast(*x_low); + } else { + *x_high = *x_low + 1; + } + T ly = y - *y_low, lx = x - *x_low; + T hy = 1. - ly, hx = 1. - lx; + *w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx; + + return; +} + +template +__global__ void GPURoiAlignBackward(const int nthreads, + const T* input_rois, + const T* out_grad, + const int num_rois, + const float spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, + int* roi_batch_id_data, + T* input_grad, + const bool continuous_coordinate) { + CUDA_KERNEL_LOOP(i, nthreads) { + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % channels; + int n = i / pooled_width / pooled_height / channels; + const T* offset_input_rois = input_rois + n * kROISize; + int roi_batch_ind = roi_batch_id_data[n]; + + T roi_offset = continuous_coordinate ? 
T(0.5) : 0; + T roi_xmin = offset_input_rois[0] * spatial_scale - roi_offset; + T roi_ymin = offset_input_rois[1] * spatial_scale - roi_offset; + T roi_xmax = offset_input_rois[2] * spatial_scale - roi_offset; + T roi_ymax = offset_input_rois[3] * spatial_scale - roi_offset; + + T roi_width = roi_xmax - roi_xmin; + T roi_height = roi_ymax - roi_ymin; + if (!continuous_coordinate) { + roi_width = max(roi_width, static_cast(1.)); + roi_height = max(roi_height, static_cast(1.)); + } + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + T* offset_input_grad = + input_grad + (roi_batch_ind * channels + c) * height * width; + + const T* offset_out_grad = + out_grad + (n * channels + c) * pooled_height * pooled_width; + const T out_grad_this_bin = offset_out_grad[ph * pooled_width + pw]; + + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + const T count = roi_bin_grid_h * roi_bin_grid_w; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T y = roi_ymin + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_xmin + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + T w1 = 0, w2 = 0, w3 = 0, w4 = 0; + int x_low = -1, x_high = -1, y_low = -1, y_high = -1; + BilinearInterpolateGradient(height, + width, + y, + x, + &w1, + &w2, + &w3, + &w4, + &x_low, + &x_high, + &y_low, + &y_high); + T diff1 = out_grad_this_bin * w1 / count; + T diff2 = out_grad_this_bin * w2 / count; + T diff3 = out_grad_this_bin * w3 / count; + T diff4 = out_grad_this_bin * w4 / count; + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + paddle::platform::CudaAtomicAdd( + offset_input_grad + y_low * width + x_low, diff1); + paddle::platform::CudaAtomicAdd( + offset_input_grad + y_low * width + x_high, diff2); + paddle::platform::CudaAtomicAdd( + offset_input_grad + y_high * width + x_low, diff3); + paddle::platform::CudaAtomicAdd( + offset_input_grad + y_high * width + x_high, diff4); + } + } + } + } +} + +template +void RoiAlignGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + const DenseTensor& out_grad, + int pooled_height, + int pooled_width, + float spatial_scale, + int sampling_ratio, + bool aligned, + DenseTensor* dx) { + int rois_num = boxes.dims()[0]; + int channels = x.dims()[1]; + int height = x.dims()[2]; + int width = x.dims()[3]; + + if (!dx) { + return; + } + + DenseTensor box_batch_id_list; + box_batch_id_list.Resize({rois_num}); + int* box_batch_size = dev_ctx.template HostAlloc(&box_batch_id_list); + + auto cplace = phi::CPUPlace(); + auto gplace = dev_ctx.GetPlace(); + if (boxes_num) { + int boxes_batch_size = boxes_num->numel(); + std::vector boxes_num_list(boxes_batch_size); + paddle::memory::Copy(cplace, + boxes_num_list.data(), + gplace, + boxes_num->data(), + sizeof(int) * boxes_batch_size, + 0); + int start = 0; + for (int n = 0; n < boxes_batch_size; ++n) { + for (size_t i = start; i < start + boxes_num_list[n]; ++i) { + box_batch_size[i] = n; + } + start += boxes_num_list[n]; + } + } else { + auto boxes_lod = boxes.lod().back(); + int boxes_batch_size = boxes_lod.size() - 1; + for (int n = 0; n < boxes_batch_size; ++n) { + for (size_t i = 
boxes_lod[n]; i < boxes_lod[n + 1]; ++i) { + box_batch_size[i] = n; + } + } + } + auto roi_ptr = + paddle::memory::Alloc(dev_ctx, box_batch_id_list.numel() * sizeof(int)); + int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); + int bytes = box_batch_id_list.numel() * sizeof(int); + paddle::memory::Copy( + gplace, roi_id_data, cplace, box_batch_size, bytes, dev_ctx.stream()); + dev_ctx.template Alloc(dx); + + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, dx, static_cast(0)); + + int output_grad_size = out_grad.numel(); + int blocks = NumBlocks(output_grad_size); + int threads = kNumCUDAThreads; + + if (output_grad_size > 0) { + GPURoiAlignBackward<<>>( + output_grad_size, + boxes.data(), + out_grad.data(), + rois_num, + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + sampling_ratio, + roi_id_data, + dx->data(), + aligned); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + roi_align_grad, GPU, ALL_LAYOUT, phi::RoiAlignGradKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/roi_align_kernel.cu b/paddle/phi/kernels/gpu/roi_align_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..cb3375dee95a5992fd598fdc8ba4f5e176f357a2 --- /dev/null +++ b/paddle/phi/kernels/gpu/roi_align_kernel.cu @@ -0,0 +1,254 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roi_align_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/fluid/memory/memory.h" + +namespace phi { + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; +static constexpr int kROISize = 4; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +template +__device__ T BilinearInterpolate( + const T* input_data, const int height, const int width, T y, T x) { + if (y < -1.0 || y > height || x < -1.0 || x > width) { + return 0; + } + y = y <= 0 ? 0 : y; + x = x <= 0 ? 0 : x; + int y_low = static_cast(y); + int x_low = static_cast(x); + int y_high; + int x_high; + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = static_cast(y_low); + } else { + y_high = y_low + 1; + } + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = static_cast(x_low); + } else { + x_high = x_low + 1; + } + T ly = y - y_low, lx = x - x_low; + T hy = 1. - ly, hx = 1. 
- lx; + + T v1 = input_data[y_low * width + x_low]; + T v2 = input_data[y_low * width + x_high]; + T v3 = input_data[y_high * width + x_low]; + T v4 = input_data[y_high * width + x_high]; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +__global__ void GPURoiAlignForward(const int nthreads, + const T* input_data, + const T* input_rois, + const float spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, + int* roi_batch_id_data, + T* output_data, + const bool continuous_coordinate) { + CUDA_KERNEL_LOOP(i, nthreads) { + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % channels; + int n = i / pooled_width / pooled_height / channels; + + const T* offset_input_rois = input_rois + n * kROISize; + int roi_batch_ind = roi_batch_id_data[n]; + + T roi_offset = continuous_coordinate ? static_cast(0.5) : 0; + T roi_xmin = offset_input_rois[0] * spatial_scale - roi_offset; + T roi_ymin = offset_input_rois[1] * spatial_scale - roi_offset; + T roi_xmax = offset_input_rois[2] * spatial_scale - roi_offset; + T roi_ymax = offset_input_rois[3] * spatial_scale - roi_offset; + + T roi_width = roi_xmax - roi_xmin; + T roi_height = roi_ymax - roi_ymin; + if (!continuous_coordinate) { + roi_width = max(roi_width, static_cast(1.)); + roi_height = max(roi_height, static_cast(1.)); + } + + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + const T* offset_input_data = + input_data + (roi_batch_ind * channels + c) * height * width; + + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); + int roi_bin_grid_w = + (sampling_ratio > 0) ? 
sampling_ratio : ceil(roi_width / pooled_width); + const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); + T output_val = 0; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T y = roi_ymin + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_xmin + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + T val = BilinearInterpolate(offset_input_data, height, width, y, x); + output_val += val; + } + } + output_val /= count; + output_data[i] = output_val; + } +} + +template +void RoiAlignKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + int sampling_ratio, + bool aligned, + DenseTensor* out) { + auto in_dims = x.dims(); + int batch_size = in_dims[0]; + int channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + + int rois_num = boxes.dims()[0]; + + if (rois_num == 0) return; + + int output_size = out->numel(); + int blocks = NumBlocks(output_size); + int threads = kNumCUDAThreads; +#ifdef WITH_NV_JETSON + backends::gpu::ChangeThreadNum(dev_ctx, &threads, 256); +#endif + DenseTensor roi_batch_id_list; + roi_batch_id_list.Resize({rois_num}); + int* roi_batch_id_data = dev_ctx.template HostAlloc(&roi_batch_id_list); + auto cplace = phi::CPUPlace(); + auto gplace = dev_ctx.GetPlace(); + if (boxes_num) { + int boxes_batch_size = boxes_num->numel(); + PADDLE_ENFORCE_EQ( + boxes_batch_size, + batch_size, + errors::InvalidArgument( + "The boxes_batch_size and imgs " + "batch_size must be the same. But received boxes_batch_size = %d, " + "batch_size = %d", + boxes_batch_size, + batch_size)); + + std::vector boxes_num_list(boxes_batch_size); + paddle::memory::Copy(cplace, + boxes_num_list.data(), + gplace, + boxes_num->data(), + sizeof(int) * boxes_batch_size, + 0); + int start = 0; + for (int n = 0; n < boxes_batch_size; ++n) { + for (int i = start; i < start + boxes_num_list[n]; ++i) { + roi_batch_id_data[i] = n; + } + start += boxes_num_list[n]; + } + } else { + auto lod = boxes.lod(); + PADDLE_ENFORCE_EQ(lod.empty(), + false, + errors::InvalidArgument("Input(ROIs) in ROIAlignOp does " + "not contain LoD information.")); + auto boxes_lod = lod.back(); + int boxes_batch_size = boxes_lod.size() - 1; + PADDLE_ENFORCE_EQ( + boxes_batch_size, + batch_size, + errors::InvalidArgument( + "The batch size of rois and batch size " + "of images must be the same. But received rois batch size = %d, " + "and images batch size = %d", + boxes_batch_size, + batch_size)); + int boxes_num_with_lod = boxes_lod[boxes_batch_size]; + PADDLE_ENFORCE_EQ( + rois_num, + boxes_num_with_lod, + errors::InvalidArgument( + "The actual number of rois and the number of rois " + "provided from Input(RoIsLoD) in RoIAlign must be the same." 
+ " But received actual number of rois is %d, and the number " + "of rois from RoIsLoD is %d", + rois_num, + boxes_num_with_lod)); + for (int n = 0; n < boxes_batch_size; ++n) { + for (size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) { + roi_batch_id_data[i] = n; + } + } + } + int bytes = roi_batch_id_list.numel() * sizeof(int); + auto roi_ptr = paddle::memory::Alloc(dev_ctx, bytes); + int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); + paddle::memory::Copy( + gplace, roi_id_data, cplace, roi_batch_id_data, bytes, dev_ctx.stream()); + GPURoiAlignForward<<>>( + output_size, + x.data(), + boxes.data(), + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + sampling_ratio, + roi_id_data, + dev_ctx.template Alloc(out), + aligned); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + roi_align, GPU, ALL_LAYOUT, phi::RoiAlignKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..d093a71d23f4ea96f9d7e7de11dcfefade3788ee --- /dev/null +++ b/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu @@ -0,0 +1,165 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/roi_pool_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" + +namespace phi { + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +template +__global__ void GPURoiPoolBackward(const int nthreads, + const T* input_rois, + const T* output_grad, + const int64_t* arg_max_data, + const int num_rois, + const float spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + int* box_batch_id_data, + T* input_grad) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % channels; + int n = i / pooled_width / pooled_height / channels; + + int roi_batch_ind = box_batch_id_data[n]; + int input_offset = (roi_batch_ind * channels + c) * height * width; + int output_offset = (n * channels + c) * pooled_height * pooled_width; + const T* offset_output_grad = output_grad + output_offset; + T* offset_input_grad = input_grad + input_offset; + const int64_t* offset_arg_max_data = arg_max_data + output_offset; + + int arg_max = offset_arg_max_data[ph * pooled_width + pw]; + if (arg_max != -1) { + paddle::platform::CudaAtomicAdd( + offset_input_grad + arg_max, + static_cast(offset_output_grad[ph * pooled_width + pw])); + } + } +} + +template +void RoiPoolGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + const DenseTensor& arg_max, + const DenseTensor& out_grad, + int pooled_height, + int pooled_width, + float spatial_scale, + DenseTensor* dx) { + auto x_dims = x.dims(); + int channels = x_dims[1]; + int height = x_dims[2]; + int width = x_dims[3]; + int rois_num = boxes.dims()[0]; + + if (dx) { + DenseTensor box_batch_id_list; + box_batch_id_list.Resize({rois_num}); + int* box_batch_id_data = + dev_ctx.template HostAlloc(&box_batch_id_list); + + auto gplace = dev_ctx.GetPlace(); + if (boxes_num) { + int boxes_batch_size = boxes_num->numel(); + std::vector boxes_num_list(boxes_batch_size); + paddle::memory::Copy(phi::CPUPlace(), + boxes_num_list.data(), + gplace, + boxes_num->data(), + sizeof(int) * boxes_batch_size, + 0); + int start = 0; + for (int n = 0; n < boxes_batch_size; ++n) { + for (int i = start; i < start + boxes_num_list[n]; ++i) { + box_batch_id_data[i] = n; + } + start += boxes_num_list[n]; + } + } else { + auto boxes_lod = boxes.lod().back(); + int boxes_batch_size = boxes_lod.size() - 1; + for (int n = 0; n < boxes_batch_size; ++n) { + for (size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) { + box_batch_id_data[i] = n; + } + } + } + int bytes = box_batch_id_list.numel() * sizeof(int); + auto roi_ptr = paddle::memory::Alloc(dev_ctx, bytes); + int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); + paddle::memory::Copy(gplace, + roi_id_data, + phi::CPUPlace(), + box_batch_id_data, + bytes, + dev_ctx.stream()); + + 
dev_ctx.template Alloc(dx); + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, dx, static_cast(0)); + + int output_grad_size = out_grad.numel(); + int blocks = NumBlocks(output_grad_size); + int threads = kNumCUDAThreads; + + if (output_grad_size > 0) { + GPURoiPoolBackward<<>>( + output_grad_size, + boxes.data(), + out_grad.data(), + arg_max.data(), + rois_num, + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + roi_id_data, + dx->data()); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + roi_pool_grad, GPU, ALL_LAYOUT, phi::RoiPoolGradKernel, float, double) { + kernel->InputAt(3).SetDataType(phi::DataType::INT64); +} diff --git a/paddle/phi/kernels/gpu/roi_pool_kernel.cu b/paddle/phi/kernels/gpu/roi_pool_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..ab33e2cf64751f1cd5be44fc6f759acffd2fb93d --- /dev/null +++ b/paddle/phi/kernels/gpu/roi_pool_kernel.cu @@ -0,0 +1,220 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roi_pool_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/fluid/memory/memory.h" + +namespace phi { + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +template +__global__ void GPURoiPoolForward(const int nthreads, + const T* input_data, + const T* input_rois, + const float spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + int* box_batch_id_data, + T* output_data, + int64_t* arg_max_data) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (size_t i = index; i < nthreads; i += offset) { + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % channels; + int n = i / pooled_width / pooled_height / channels; + + const T* offset_input_rois = input_rois + n * kROISize; + int box_batch_ind = box_batch_id_data[n]; + int box_start_w = round(offset_input_rois[0] * spatial_scale); + int box_start_h = round(offset_input_rois[1] * spatial_scale); + int box_end_w = round(offset_input_rois[2] * spatial_scale); + int box_end_h = round(offset_input_rois[3] * spatial_scale); + + int box_width = max(box_end_w - box_start_w + 1, 1); + int box_height = max(box_end_h - box_start_h + 1, 1); + + int hstart = static_cast(floor(static_cast(ph) * + static_cast(box_height) / + static_cast(pooled_height))); + int wstart = static_cast(floor(static_cast(pw) * + static_cast(box_width) / + static_cast(pooled_width))); + int hend = static_cast(ceil(static_cast(ph + 1) * + 
static_cast(box_height) / + static_cast(pooled_height))); + int wend = static_cast(ceil(static_cast(pw + 1) * + static_cast(box_width) / + static_cast(pooled_width))); + hstart = min(max(hstart + box_start_h, 0), height); + hend = min(max(hend + box_start_h, 0), height); + wstart = min(max(wstart + box_start_w, 0), width); + wend = min(max(wend + box_start_w, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + T maxval = is_empty ? 0 : -std::numeric_limits::max(); + int maxidx = -1; + const T* offset_input_data = + input_data + (box_batch_ind * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int input_data_index = h * width + w; + if (offset_input_data[input_data_index] > maxval) { + maxval = offset_input_data[input_data_index]; + maxidx = input_data_index; + } + } + } + output_data[i] = maxval; + if (arg_max_data) { + arg_max_data[i] = maxidx; + } + } +} + +template +void RoiPoolKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + DenseTensor* out, + DenseTensor* arg_max) { + auto x_dims = x.dims(); + int batch_size = x_dims[0]; + auto in_stride = phi::stride(x_dims); + int channels = x_dims[1]; + int height = x_dims[2]; + int width = x_dims[3]; + + int rois_num = boxes.dims()[0]; + + if (rois_num == 0) return; + + int output_size = out->numel(); + int blocks = NumBlocks(output_size); + int threads = kNumCUDAThreads; + + DenseTensor box_batch_id_list; + box_batch_id_list.Resize({rois_num}); + int* box_batch_id_data = dev_ctx.template HostAlloc(&box_batch_id_list); + auto gplace = dev_ctx.GetPlace(); + + if (boxes_num) { + int boxes_batch_size = boxes_num->numel(); + PADDLE_ENFORCE_EQ( + boxes_batch_size, + batch_size, + phi::errors::InvalidArgument( + "The batch size of input(ROIs) and input(X) must be the same but " + "received batch size of input(ROIs) and input(X) is %d and %d " + "respectively.", + boxes_batch_size, + batch_size)); + std::vector boxes_num_list(boxes_batch_size); + paddle::memory::Copy(phi::CPUPlace(), + boxes_num_list.data(), + gplace, + boxes_num->data(), + sizeof(int) * boxes_batch_size, + 0); + int start = 0; + for (int n = 0; n < boxes_batch_size; ++n) { + for (int i = start; i < start + boxes_num_list[n]; ++i) { + box_batch_id_data[i] = n; + } + start += boxes_num_list[n]; + } + } else { + auto boxes_lod = boxes.lod().back(); + int boxes_batch_size = boxes_lod.size() - 1; + PADDLE_ENFORCE_EQ( + boxes_batch_size, + batch_size, + phi::errors::InvalidArgument( + "The batch size of input(ROIs) and input(X) must be the same but " + "received batch size of input(ROIs) and input(X) is %d and %d " + "respectively.", + boxes_batch_size, + batch_size)); + + int boxes_num_with_lod = boxes_lod[boxes_batch_size]; + PADDLE_ENFORCE_EQ(rois_num, + boxes_num_with_lod, + phi::errors::InvalidArgument( + "The number of rois from input(ROIs) and its LOD " + "must be the same. 
Received rois %d of input(ROIs) " + "but the number of rois %d from its LOD is %d", + rois_num, + boxes_num_with_lod)); + for (int n = 0; n < boxes_batch_size; ++n) { + for (size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) { + box_batch_id_data[i] = n; + } + } + } + + int bytes = box_batch_id_list.numel() * sizeof(int); + auto box_ptr = paddle::memory::Alloc(dev_ctx, bytes); + int* box_id_data = reinterpret_cast(box_ptr->ptr()); + paddle::memory::Copy(gplace, + box_id_data, + phi::CPUPlace(), + box_batch_id_data, + bytes, + dev_ctx.stream()); + + T* output_data = dev_ctx.template Alloc(out); + int64_t* arg_max_data = dev_ctx.template Alloc(arg_max); + + GPURoiPoolForward<<>>( + output_size, + x.data(), + boxes.data(), + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + box_id_data, + output_data, + arg_max_data); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + roi_pool, GPU, ALL_LAYOUT, phi::RoiPoolKernel, float, double) { + kernel->OutputAt(1).SetDataType(phi::DataType::INT64); +} diff --git a/paddle/phi/kernels/gpu/roll_grad_kernel.cu b/paddle/phi/kernels/gpu/roll_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..93e9e81882c9e6eacd5f9ee91fa7541495ef2663 --- /dev/null +++ b/paddle/phi/kernels/gpu/roll_grad_kernel.cu @@ -0,0 +1,88 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roll_grad_kernel.h" + +#include "paddle/phi/common/complex.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/roll_kernel_impl.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +template +void RollGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const ScalarArray& shifts, + const std::vector& axis, + DenseTensor* x_grad) { + auto* in_data = out_grad.data(); + T* out_data = dev_ctx.template Alloc(x_grad); + int64_t numel = out_grad.numel(); + auto stream = dev_ctx.stream(); + + auto shifts_data = shifts.GetData(); + size_t nums = shifts_data.size(); + auto input_dim = out_grad.dims(); + auto stride_dim = phi::stride(input_dim); + + std::vector strides(nums), sizes(nums); + if (axis.size() == 0) { + strides[0] = 1; + sizes[0] = numel; + shifts_data[0] = ((-shifts_data[0]) % numel + numel) % numel; + } else { + for (size_t i = 0; i < nums; i++) { + int dim = axis[i] >= 0 ? 
axis[i] : axis[i] + input_dim.size(); + int64_t size = input_dim[dim]; + if (size != 0) { + shifts_data[i] = ((-shifts_data[i]) % size + size) % size; + strides[i] = stride_dim[dim]; + sizes[i] = size; + } + } + } + + switch (nums) { + CALL_ROLL_CUDA_KERNEL(1); + CALL_ROLL_CUDA_KERNEL(2); + CALL_ROLL_CUDA_KERNEL(3); + CALL_ROLL_CUDA_KERNEL(4); + CALL_ROLL_CUDA_KERNEL(5); + CALL_ROLL_CUDA_KERNEL(6); + CALL_ROLL_CUDA_KERNEL(7); + CALL_ROLL_CUDA_KERNEL(8); + CALL_ROLL_CUDA_KERNEL(9); + default: + PADDLE_THROW(phi::errors::InvalidArgument( + "shifts.size() should be less than 10, But received shifts.size() " + "= %d", + shifts_data.size())); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(roll_grad, + GPU, + ALL_LAYOUT, + phi::RollGradKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/roll_kernel.cu b/paddle/phi/kernels/gpu/roll_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..1543335d3a0c5884d6b82394253bb4e8dda8cef0 --- /dev/null +++ b/paddle/phi/kernels/gpu/roll_kernel.cu @@ -0,0 +1,90 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roll_kernel.h" + +#include "paddle/phi/common/complex.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/array.h" +#include "paddle/phi/kernels/gpu/roll_kernel_impl.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +template +void RollKernel(const Context& dev_ctx, + const DenseTensor& x, + const ScalarArray& shifts, + const std::vector& axis, + DenseTensor* out) { + auto* in_data = x.data(); + T* out_data = dev_ctx.template Alloc(out); + int64_t numel = x.numel(); + auto stream = dev_ctx.stream(); + + auto shifts_data = shifts.GetData(); + + size_t nums = shifts_data.size(); + auto input_dim = x.dims(); + auto stride_dim = phi::stride(input_dim); + + std::vector strides(nums), sizes(nums); + if (axis.size() == 0) { + strides[0] = 1; + sizes[0] = numel; + shifts_data[0] = (shifts_data[0] % numel + numel) % numel; + } else { + for (size_t i = 0; i < nums; i++) { + int dim = axis[i] >= 0 ? 
axis[i] : axis[i] + input_dim.size(); + int64_t size = input_dim[dim]; + + if (size != 0) { + shifts_data[i] = (shifts_data[i] % size + size) % size; + strides[i] = stride_dim[dim]; + sizes[i] = size; + } + } + } + + switch (nums) { + CALL_ROLL_CUDA_KERNEL(1); + CALL_ROLL_CUDA_KERNEL(2); + CALL_ROLL_CUDA_KERNEL(3); + CALL_ROLL_CUDA_KERNEL(4); + CALL_ROLL_CUDA_KERNEL(5); + CALL_ROLL_CUDA_KERNEL(6); + CALL_ROLL_CUDA_KERNEL(7); + CALL_ROLL_CUDA_KERNEL(8); + CALL_ROLL_CUDA_KERNEL(9); + default: + PADDLE_THROW(phi::errors::InvalidArgument( + "shifts.size() should be less than 10, But received shifts.size() " + "= %d", + shifts_data.size())); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(roll, + GPU, + ALL_LAYOUT, + phi::RollKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/roll_kernel_impl.h b/paddle/phi/kernels/gpu/roll_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..abe3ee470b4bc6b3951e1ad2da09544e319cbcac --- /dev/null +++ b/paddle/phi/kernels/gpu/roll_kernel_impl.h @@ -0,0 +1,71 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/core/utils/array.h" +#include "paddle/phi/kernels/primitive/kernel_primitives.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void RollCudaKernel(const T* input, + T* output, + int64_t N, + phi::Array shifts, + phi::Array strides, + phi::Array sizes) { + int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= N) { + return; + } + + int64_t output_idx = idx; + int64_t new_dim_idx = 0; + +#pragma unroll + for (size_t i = 0; i < Rank; i++) { + new_dim_idx = (idx / strides[i]) % sizes[i] + shifts[i]; + if (new_dim_idx >= sizes[i]) { + output_idx += (shifts[i] - sizes[i]) * strides[i]; + } else { + output_idx += shifts[i] * strides[i]; + } + } + output[output_idx] = input[idx]; +} + +#define CALL_ROLL_CUDA_KERNEL(N) \ + case N: { \ + phi::Array _strides; \ + phi::Array _shifts; \ + phi::Array _sizes; \ + for (size_t idx = 0; idx < N; ++idx) { \ + _strides[idx] = strides[idx]; \ + _shifts[idx] = shifts_data[idx]; \ + _sizes[idx] = sizes[idx]; \ + } \ + RollCudaKernel< \ + T, \ + N><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, \ + PADDLE_CUDA_NUM_THREADS, \ + 0, \ + stream>>>(in_data, out_data, numel, _shifts, _strides, _sizes); \ + break; \ + } + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/scale_kernel.cu b/paddle/phi/kernels/gpu/scale_kernel.cu index 930c50a24be8fae40535c2d5e6dbbe85e7ced990..6f96a697b2f2db6c2097640f34c30142939f80e0 100644 --- a/paddle/phi/kernels/gpu/scale_kernel.cu +++ b/paddle/phi/kernels/gpu/scale_kernel.cu @@ -15,10 +15,9 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/scale_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" -// See Note [ Why still include the fluid headers? ] -#include "paddle/phi/common/float16.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/reduce_prod_kernel.cu b/paddle/phi/kernels/gpu/searchsorted_kernel.cu similarity index 55% rename from paddle/phi/kernels/gpu/reduce_prod_kernel.cu rename to paddle/phi/kernels/gpu/searchsorted_kernel.cu index 14084d0f4f3c6fbd4edeb335e15704ce2b4e6e15..4a2ce2241c22dc5c1cab391fe24a502ba845802b 100644 --- a/paddle/phi/kernels/gpu/reduce_prod_kernel.cu +++ b/paddle/phi/kernels/gpu/searchsorted_kernel.cu @@ -12,31 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/core/kernel_registry.h" - -#include "paddle/phi/kernels/gpu/reduce.h" -#include "paddle/phi/kernels/reduce_prod_kernel.h" - -namespace phi { +#include "paddle/phi/kernels/searchsorted_kernel.h" -template -void ReduceProdKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -} // namespace phi +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/searchsorted_kernel_impl.h" -PD_REGISTER_KERNEL(reduce_prod, +PD_REGISTER_KERNEL(searchsorted, GPU, ALL_LAYOUT, - phi::ReduceProdKernel, + phi::SearchsortedKernel, float, double, int, diff --git a/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu index d9618dc159a6d3f5b24bdfcfdb219ec649e051f9..9d1769e18b4b809fbc353513a05553e0ccd97572 100644 --- a/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu @@ -24,4 +24,6 @@ PD_REGISTER_KERNEL(segment_pool_grad, ALL_LAYOUT, phi::SegmentPoolGradKernel, float, - double) {} + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/segment_pool_kernel.cu b/paddle/phi/kernels/gpu/segment_pool_kernel.cu index c38e935adf837ef00c48fa31bc1e37eea2948673..3128e534166acba6ca136331ad8efea66b18621f 100644 --- a/paddle/phi/kernels/gpu/segment_pool_kernel.cu +++ b/paddle/phi/kernels/gpu/segment_pool_kernel.cu @@ -19,5 +19,11 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" -PD_REGISTER_KERNEL( - segment_pool, GPU, ALL_LAYOUT, phi::SegmentPoolKernel, float, double) {} +PD_REGISTER_KERNEL(segment_pool, + GPU, + ALL_LAYOUT, + phi::SegmentPoolKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/set_value_grad_kernel.cu b/paddle/phi/kernels/gpu/set_value_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..7eed96699e720870577c3d5246ce07c12c37335c --- /dev/null +++ b/paddle/phi/kernels/gpu/set_value_grad_kernel.cu @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/set_value_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/set_value_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(set_value_grad, + GPU, + ALL_LAYOUT, + phi::SetValueGradKernel, + float, + double, + int, + int64_t, + bool) {} diff --git a/paddle/phi/kernels/gpu/softmax_kernel.cu b/paddle/phi/kernels/gpu/softmax_kernel.cu index 03c5714b967841ef1bd124bd9191830a79567514..4a02f438c7e7e4f0c0212ee613ce78a7fac20909 100644 --- a/paddle/phi/kernels/gpu/softmax_kernel.cu +++ b/paddle/phi/kernels/gpu/softmax_kernel.cu @@ -23,7 +23,7 @@ limitations under the License. */ PD_REGISTER_KERNEL(softmax, GPU, ALL_LAYOUT, - phi::SoftmaxRawKernel, + phi::SoftmaxKernel, float, double, phi::dtype::float16, diff --git a/paddle/phi/kernels/gpu/split_kernel.cu b/paddle/phi/kernels/gpu/split_kernel.cu index c28fc3794f092a4cee8d7fc351190c13291892b1..83c2ec4b6e99d675bfbcab58abd265cc8595259c 100644 --- a/paddle/phi/kernels/gpu/split_kernel.cu +++ b/paddle/phi/kernels/gpu/split_kernel.cu @@ -37,7 +37,7 @@ void SplitKernel(const Context& dev_ctx, out_metas_ptr.push_back(&out_metas.back()); } - phi::SplitInferMeta(x, num_or_sections, axis_scalar, out_metas_ptr, true); + phi::SplitInferMeta(x, num_or_sections, axis_scalar, out_metas_ptr); for (size_t i = 0; i < out_metas.size(); ++i) { outs[i]->Resize(out_metas[i].dims()); diff --git a/paddle/phi/kernels/gpu/top_k_kernel.cu b/paddle/phi/kernels/gpu/top_k_kernel.cu index 4e9aa88c6cb2da7fabe3f5d841a313e82b9ebed2..7f06af7de43f7ee234831203c485eaa0b8c86cbf 100644 --- a/paddle/phi/kernels/gpu/top_k_kernel.cu +++ b/paddle/phi/kernels/gpu/top_k_kernel.cu @@ -78,15 +78,16 @@ void TopkKernel(const Context& dev_ctx, // The conclusion is drawn from the data through multiple sets of // statistics if (input_width >= 128 && k >= input_width * 0.75) { - if (ops::SortTopk( - paddle::platform::CUDADeviceContext(dev_ctx.GetPlace()), - input, - input_width, - input_height, - k, - out, - indices, - largest)) { + auto* ctx = reinterpret_cast( + &dev_ctx); + if (ops::SortTopk(*ctx, + input, + input_width, + input_height, + k, + out, + indices, + largest)) { // Successed, return. 
return; } else { @@ -181,15 +182,16 @@ void TopkKernel(const Context& dev_ctx, // The conclusion is drawn from the data through multiple sets of // statistics if (input_width >= 128 && k >= input_width * 0.75) { - if (ops::SortTopk( - paddle::platform::CUDADeviceContext(dev_ctx.GetPlace()), - &trans_input, - input_width, - input_height, - k, - &trans_out, - &trans_ind, - largest)) { + auto* ctx = reinterpret_cast( + &dev_ctx); + if (ops::SortTopk(*ctx, + &trans_input, + input_width, + input_height, + k, + &trans_out, + &trans_ind, + largest)) { // last step, tranpose back the indices and output funcs::TransCompute( ndims, dev_ctx, trans_ind, indices, trans); diff --git a/paddle/phi/kernels/gpu/tril_triu_grad_kernel.cu b/paddle/phi/kernels/gpu/tril_triu_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..bc3ef1bc623bb27ac2452d1e908c389543598011 --- /dev/null +++ b/paddle/phi/kernels/gpu/tril_triu_grad_kernel.cu @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(tril_triu_grad, + GPU, + ALL_LAYOUT, + phi::TrilTriuGradKernel, + bool, + float, + double, + int, + int64_t, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/tril_triu_kernel.cu b/paddle/phi/kernels/gpu/tril_triu_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..8c48edf9eff25aa68abcfe0b08dd7ab659aaa0fb --- /dev/null +++ b/paddle/phi/kernels/gpu/tril_triu_kernel.cu @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(tril_triu, + GPU, + ALL_LAYOUT, + phi::TrilTriuKernel, + bool, + float, + double, + int, + int64_t, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/where_index_kernel.cu b/paddle/phi/kernels/gpu/where_index_kernel.cu index 535cb812a20ea90bdb3f07b731af52c2822f0ec2..9538533f70d597e21b393d2650d56bebd823c360 100644 --- a/paddle/phi/kernels/gpu/where_index_kernel.cu +++ b/paddle/phi/kernels/gpu/where_index_kernel.cu @@ -20,150 +20,59 @@ namespace cub = hipcub; #endif -#include "paddle/phi/kernels/where_index_kernel.h" - -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/core/ddim.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/select_impl.cu.h" +#include "paddle/phi/kernels/where_index_kernel.h" namespace phi { - -template -__global__ void GetTrueNum(const T *cond_data, - const int64_t numel, - int64_t *true_num_array) { - const int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; - - for (int64_t idx = tid; idx < numel; idx += gridDim.x * blockDim.x) { - true_num_array[idx] = - static_cast(static_cast(cond_data[idx])); +template +struct IndexFunctor { + T2 stride[phi::DDim::kMaxRank]; + int dims; + explicit IndexFunctor(const phi::DDim &in_dims) { + dims = in_dims.size(); + std::vector strides_in_tmp; + strides_in_tmp.resize(dims, 1); + // get strides according to in_dims + for (T2 i = 1; i < dims; i++) { + strides_in_tmp[i] = strides_in_tmp[i - 1] * in_dims[dims - i]; + } + memcpy(stride, strides_in_tmp.data(), dims * sizeof(T2)); } -} - -template -__global__ void SetTrueIndex(int64_t *out_ptr, - const T *cond_data, - const int64_t numel, - const int64_t *stride_array, - const int64_t rank, - const int64_t *true_num_array) { - const int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; - for (int64_t idx = tid; idx < numel; idx += gridDim.x * blockDim.x) { - // true_num_array is calculated by cub::InclusiveSum, - // cause the first element of true_num_array is 1, - // so we need substract 1 to get true index. 
- const int64_t true_index = true_num_array[idx] - 1; - if (static_cast(cond_data[idx])) { - int64_t rank_index = idx; - for (int j = 0; j < rank; j++) { - const int64_t out_index = rank_index / stride_array[j]; - out_ptr[true_index * rank + j] = out_index; - rank_index -= out_index * stride_array[j]; + HOSTDEVICE inline void operator()(OutT *out, + const T1 *mask, + const T2 *index, + const int num) { + int store_fix = 0; + for (int idx = 0; idx < num; idx++) { + if (mask[idx]) { + T2 data_index = index[idx]; + // get index + for (int rank_id = dims - 1; rank_id >= 0; --rank_id) { + out[store_fix] = static_cast(data_index / stride[rank_id]); + data_index = data_index % stride[rank_id]; + store_fix++; + } } } } -} +}; template void WhereIndexKernel(const Context &dev_ctx, const DenseTensor &condition, DenseTensor *out) { - const T *cond_data = condition.data(); - const int64_t numel = condition.numel(); + DenseTensor in_data; auto dims = condition.dims(); - const int rank = dims.size(); - - auto d_array_mem = - paddle::memory::Alloc(dev_ctx, (numel + rank) * sizeof(int64_t)); - auto h_array_mem = - paddle::memory::Alloc(phi::CPUPlace(), (rank + 1) * sizeof(int64_t)); - - // "stride_array" is an array and len(stride_array)==rank, - // each element is the stride of each dimension -- the length from i to i+1. - int64_t *h_stride_array = reinterpret_cast(h_array_mem->ptr()); - int64_t *d_stride_array = reinterpret_cast(d_array_mem->ptr()); - - // "true_num_array" is an array and len(stride_array)==numel, - // at the beginning, - // "true_num_array" will set 1 if condition[i] == true else 0, - // then it will be calculated by cub::InclusiveSum, - // so that we can get the true number before i as the out index - int64_t *d_true_num_array = d_stride_array + rank; - - // the total_true_num is the total number of condition[i] == true - int64_t *h_total_true_num = h_stride_array + rank; - - // alloce cub memory - size_t cub_size = 0; - cub::DeviceScan::InclusiveSum(nullptr, - cub_size, - d_true_num_array, - d_true_num_array, - numel, - dev_ctx.stream()); - auto cub_mem = paddle::memory::Alloc(dev_ctx, cub_size * sizeof(int64_t)); - void *cub_data = cub_mem->ptr(); - - // set d_true_num_array[i]=1 if cond_data[i]==true else 0 - const int threads = std::min(numel, static_cast(128)); - const int64_t need_grids = (numel + threads - 1) / threads; - const int grids = std::min(need_grids, static_cast(256)); - GetTrueNum<<>>( - cond_data, numel, d_true_num_array); - - // calculate the inclusive prefix sum of "true_num_array" - // to get the index of "out" tensor, - // and the total number of cond_data[i]==true. 
- // Example: - // condition: F T T F F F T T - // before: 0 1 1 0 0 0 1 1 - // after: 0 1 2 2 2 2 3 4 - // out: 1 2 6 7 - cub::DeviceScan::InclusiveSum(cub_data, - cub_size, - d_true_num_array, - d_true_num_array, - numel, - dev_ctx.stream()); - - // calculate each dimension's stride - h_stride_array[rank - 1] = 1; - for (int i = rank - 2; i >= 0; i--) { - h_stride_array[i] = h_stride_array[i + 1] * dims[i + 1]; - } - paddle::memory::Copy(dev_ctx.GetPlace(), - d_stride_array, - phi::CPUPlace(), - h_stride_array, - rank * sizeof(int64_t), - dev_ctx.stream()); - - // get total ture number and set output size - // the last element of cub::InclusiveSum is the total number - paddle::memory::Copy(phi::CPUPlace(), - h_total_true_num, - dev_ctx.GetPlace(), - d_true_num_array + numel - 1, - sizeof(int64_t), - dev_ctx.stream()); - dev_ctx.Wait(); - - int64_t true_num = *h_total_true_num; - out->Resize(phi::make_ddim({static_cast(true_num), rank})); - auto *out_data = dev_ctx.template Alloc(out); - - if (true_num == 0) { - return; - } - - // using true_num_array and stride_array to calculate the output index - SetTrueIndex<<>>( - out_data, cond_data, numel, d_stride_array, rank, d_true_num_array); + using Functor = IndexFunctor; + Functor index_functor = Functor(dims); + phi::funcs::SelectKernel( + dev_ctx, condition, in_data, out, index_functor); } - } // namespace phi PD_REGISTER_KERNEL(where_index, diff --git a/paddle/phi/kernels/gpudnn/pool_gpudnn.h b/paddle/phi/kernels/gpudnn/pool_gpudnn.h new file mode 100644 index 0000000000000000000000000000000000000000..0cf2c991464fc6e091eee0bc75641d7abae8598c --- /dev/null +++ b/paddle/phi/kernels/gpudnn/pool_gpudnn.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" + +namespace phi { + +using GPUDNNDataLayout = paddle::platform::DataLayout; +using PoolingMode = paddle::platform::PoolingMode; +using ScopedPoolingDescriptor = paddle::platform::ScopedPoolingDescriptor; +using ScopedTensorDescriptor = paddle::platform::ScopedTensorDescriptor; + +template +using ScalingParamType = + typename paddle::platform::CudnnDataType::ScalingParamType; + +inline GPUDNNDataLayout GetLayoutFromStr(std::string data_format) { + if (data_format == "NHWC") { + return GPUDNNDataLayout::kNHWC; + } else if (data_format == "NCHW") { + return GPUDNNDataLayout::kNCHW; + } else if (data_format == "NCDHW") { + return GPUDNNDataLayout::kNCDHW; + } else { + return GPUDNNDataLayout::kNCDHW; + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/gpudnn/pool_grad_kernel.cu b/paddle/phi/kernels/gpudnn/pool_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..b731d03347024ccd76eafc02c7096f3633948eb5 --- /dev/null +++ b/paddle/phi/kernels/gpudnn/pool_grad_kernel.cu @@ -0,0 +1,448 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/pool_grad_kernel.h" + +#include "paddle/phi/kernels/gpudnn/pool_gpudnn.h" + +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/pooling.h" +#include "paddle/phi/kernels/pool_kernel.h" + +#ifdef PADDLE_WITH_HIP +#include "paddle/phi/kernels/impl/pool_grad_kernel_impl.h" // PoolGradRawGPUDNNKernel will call PoolGradRawKernel for pooling type "max" in ROCm +#endif + +namespace phi { + +template +void PoolGradRawGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx) { + PADDLE_ENFORCE_EQ( + paddle::platform::is_gpu_place(ctx.GetPlace()), + true, + errors::InvalidArgument("Pool operator CUDA kernel must use CUDAPlace " + "rather than CPUPlace.")); + + const DenseTensor* input = &x; + const DenseTensor* output = &out; + const DenseTensor* output_grad = &dout; + DenseTensor* input_grad = dx; + std::vector paddings_ = paddings; + std::vector kernel_size_ = kernel_size; + + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + +#ifdef PADDLE_WITH_HIP + if (pooling_type == "max") { + PoolGradRawKernel(ctx, + x, + out, + dout, + kernel_size, + strides, + paddings_, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + dx); + return; + } +#endif + + // update paddings + auto in_x_dims = input->dims(); + DDim data_dims; + if (channel_last) { + data_dims = slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); + } else { + data_dims = slice_ddim(in_x_dims, 2, in_x_dims.size()); + } + funcs::UpdatePadding(&paddings_, + global_pooling, + adaptive, + padding_algorithm, + data_dims, + strides, + kernel_size_); + if (data_dims.size() * 2 == static_cast(paddings_.size())) { + for (int i = 0; i < data_dims.size(); ++i) { + paddings_.erase(paddings_.begin() + i + 1); + } + } + + if (global_pooling) { + funcs::UpdateKernelSize(&kernel_size_, data_dims); + } + + // ------- tensor grad -------------- + DenseTensor transformed_input(input->type()); + DenseTensor transformed_output(output->type()); + DenseTensor transformed_output_grad(output_grad->type()); + + ctx.template Alloc(input_grad); + DenseTensor transformed_input_grad(input_grad->type()); + GPUDNNDataLayout layout; + const std::string str_NCHW = "NCHW", str_NHWC = "NHWC"; + const std::string str_NCDHW = "NCDHW", str_NDHWC = "NDHWC"; + if (data_format == str_NDHWC) { + layout = GPUDNNDataLayout::kNCDHW; + std::vector axis{0, 4, 1, 2, 3}; + + // input + transformed_input.Resize(input->dims()); + auto in_dims_vec = 
vectorize(input->dims()); + in_dims_vec[1] = input->dims()[4]; + in_dims_vec[2] = input->dims()[1]; + in_dims_vec[3] = input->dims()[2]; + in_dims_vec[4] = input->dims()[3]; + transformed_input.Resize(make_ddim(in_dims_vec)); + ctx.Alloc(&transformed_input, input->type()); + + funcs::Transpose trans5; + trans5(ctx, *input, &transformed_input, axis); + + // output + transformed_output.Resize(output->dims()); + auto out_dims_vec = vectorize(output->dims()); + out_dims_vec[1] = output->dims()[4]; + out_dims_vec[2] = output->dims()[1]; + out_dims_vec[3] = output->dims()[2]; + out_dims_vec[4] = output->dims()[3]; + transformed_output.Resize(make_ddim(out_dims_vec)); + + ctx.Alloc(&transformed_output, output->type()); + + funcs::Transpose trans5_v2; + trans5_v2(ctx, *output, &transformed_output, axis); + + // output grad + transformed_output_grad.Resize(make_ddim(out_dims_vec)); + ctx.Alloc(&transformed_output_grad, output_grad->type()); + + funcs::Transpose trans5_v3; + trans5_v3(ctx, *output_grad, &transformed_output_grad, axis); + + // input grad + transformed_input_grad.Resize(make_ddim(in_dims_vec)); + +#ifdef PADDLE_WITH_HIP + // MIOPEN not support NHWC data layout + } else if (data_format == str_NHWC) { + layout = GPUDNNDataLayout::kNCHW; + + std::vector axis{0, 3, 1, 2}; + + // input + transformed_input.Resize(input->dims()); + auto in_dims_vec = vectorize(input->dims()); + in_dims_vec[1] = input->dims()[3]; + in_dims_vec[2] = input->dims()[1]; + in_dims_vec[3] = input->dims()[2]; + transformed_input.Resize(make_ddim(in_dims_vec)); + ctx.Alloc(&transformed_input, input->type()); + + funcs::Transpose trans4; + trans4(ctx, *input, &transformed_input, axis); + + // output + transformed_output.Resize(output->dims()); + auto out_dims_vec = vectorize(output->dims()); + out_dims_vec[1] = output->dims()[3]; + out_dims_vec[2] = output->dims()[1]; + out_dims_vec[3] = output->dims()[2]; + transformed_output.Resize(make_ddim(out_dims_vec)); + ctx.Alloc(&transformed_output, output->type()); + + funcs::Transpose trans4_v2; + trans4_v2(ctx, *output, &transformed_output, axis); + + // output grad + transformed_output_grad.Resize(make_ddim(out_dims_vec)); + ctx.Alloc(&transformed_output_grad, output_grad->type()); + + funcs::Transpose trans4_v3; + trans4_v3(ctx, *output_grad, &transformed_output_grad, axis); + + // input grad + transformed_input_grad.Resize(make_ddim(in_dims_vec)); +#endif + } else { + layout = GetLayoutFromStr(data_format); + transformed_input = *input; + transformed_output = *output; + transformed_output_grad = *output_grad; + transformed_input_grad = *input_grad; + } + + const T* input_data = transformed_input.data(); + const T* output_data = transformed_output.data(); + const T* output_grad_data = transformed_output_grad.data(); + + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor input_desc; + ScopedTensorDescriptor output_desc; + ScopedPoolingDescriptor pool_desc; + +#ifdef PADDLE_WITH_HIP + miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, vectorize(transformed_input.dims())); + miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, vectorize(transformed_output.dims())); +#else + cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, vectorize(transformed_input.dims())); + cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, vectorize(transformed_output.dims())); +#endif + PoolingMode pooling_mode; + if (pooling_type == "max") { + if 
(FLAGS_cudnn_deterministic) { + pooling_mode = PoolingMode::kMaximumDeterministic; + } else { + pooling_mode = PoolingMode::kMaximum; + } + } else { + pooling_mode = exclusive ? PoolingMode::kAverageExclusive + : PoolingMode::kAverageInclusive; + } + +#ifdef PADDLE_WITH_HIP + miopenPoolingDescriptor_t cudnn_pool_desc = + pool_desc.descriptor(pooling_mode, kernel_size_, paddings_, strides); +#else + cudnnPoolingDescriptor_t cudnn_pool_desc = + pool_desc.descriptor(pooling_mode, kernel_size_, paddings_, strides); +#endif + + // ------------------- cudnn pool algorithm --------------------- + auto handle = ctx.cudnn_handle(); + ScalingParamType alpha = 1.0f, beta = 0.0f; + if (input_grad) { + T* input_grad_data = ctx.template Alloc(&transformed_input_grad); +// Because beta is zero, it is unnecessary to reset input_grad. +#ifdef PADDLE_WITH_HIP + char* pool_workspace; + size_t pool_worksize = 0; + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenPoolingGetWorkSpaceSizeV2( + cudnn_pool_desc, cudnn_output_desc, &pool_worksize)); + PADDLE_ENFORCE_GPU_SUCCESS(hipMalloc(&pool_workspace, pool_worksize)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenPoolingBackward(handle, + cudnn_pool_desc, + &alpha, + cudnn_output_desc, + output_data, + cudnn_output_desc, + output_grad_data, + cudnn_input_desc, + input_data, + &beta, + cudnn_input_desc, + input_grad_data, + pool_workspace)); + PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnPoolingBackward(handle, + cudnn_pool_desc, + &alpha, + cudnn_output_desc, + output_data, + cudnn_output_desc, + output_grad_data, + cudnn_input_desc, + input_data, + &beta, + cudnn_input_desc, + input_grad_data)); +#endif + + if (data_format == str_NDHWC) { + std::vector axis{0, 2, 3, 4, 1}; + funcs::Transpose trans5_v4; + trans5_v4(ctx, transformed_input_grad, input_grad, axis); + } +#ifdef PADDLE_WITH_HIP + // MIOPEN not support NHWC data layout + if (data_format == str_NHWC) { + std::vector axis{0, 2, 3, 1}; + funcs::Transpose trans4_v4; + trans4_v4(ctx, transformed_input_grad, input_grad, axis); + } +#endif + } +} + +template +void Pool2dGradGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx) { + PoolGradRawGPUDNNKernel(ctx, + x, + out, + dout, + kernel_size, + strides, + paddings, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + dx); +} + +template +void Pool2dDoubleGradGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out) { + if (pooling_type == "max") { + PADDLE_THROW( + errors::InvalidArgument("Pool op grad grad only supports avgpool.")); + } else { + Pool2dGPUDNNKernel(ctx, + x, + kernel_size, + strides, + paddings, + ceil_mode, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + out); + } +} + +template +void Pool3dGradGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const 
DenseTensor& out,
+                            const DenseTensor& dout,
+                            const std::vector<int>& kernel_size,
+                            const std::vector<int>& strides,
+                            const std::vector<int>& paddings,
+                            bool ceil_mode,
+                            bool exclusive,
+                            const std::string& data_format,
+                            const std::string& pooling_type,
+                            bool global_pooling,
+                            bool adaptive,
+                            const std::string& padding_algorithm,
+                            DenseTensor* dx) {
+  PoolGradRawGPUDNNKernel<T, Context>(ctx,
+                                      x,
+                                      out,
+                                      dout,
+                                      kernel_size,
+                                      strides,
+                                      paddings,
+                                      exclusive,
+                                      data_format,
+                                      pooling_type,
+                                      global_pooling,
+                                      adaptive,
+                                      padding_algorithm,
+                                      dx);
+}
+
+}  // namespace phi
+
+using phi::dtype::float16;
+
+#ifdef PADDLE_WITH_HIP
+// MIOPEN does not support double
+PD_REGISTER_KERNEL(pool2d_grad,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Pool2dGradGPUDNNKernel,
+                   float,
+                   float16) {}
+PD_REGISTER_KERNEL(pool2d_double_grad,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Pool2dDoubleGradGPUDNNKernel,
+                   float,
+                   float16) {}
+PD_REGISTER_KERNEL(pool3d_grad,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Pool3dGradGPUDNNKernel,
+                   float,
+                   float16) {}
+#else
+PD_REGISTER_KERNEL(pool2d_grad,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Pool2dGradGPUDNNKernel,
+                   float,
+                   double,
+                   float16) {}
+PD_REGISTER_KERNEL(pool2d_double_grad,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Pool2dDoubleGradGPUDNNKernel,
+                   float,
+                   double,
+                   float16) {}
+PD_REGISTER_KERNEL(pool3d_grad,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Pool3dGradGPUDNNKernel,
+                   float,
+                   double,
+                   float16) {}
+#endif
diff --git a/paddle/phi/kernels/gpudnn/pool_kernel.cu b/paddle/phi/kernels/gpudnn/pool_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d8f965667758b9118635e3c8db4be74f9ff54a6a
--- /dev/null
+++ b/paddle/phi/kernels/gpudnn/pool_kernel.cu
@@ -0,0 +1,312 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "paddle/phi/kernels/pool_kernel.h" + +#include "paddle/phi/kernels/gpudnn/pool_gpudnn.h" + +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/pooling.h" + +namespace phi { + +template +void PoolRawGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out) { + PADDLE_ENFORCE_EQ( + paddle::platform::is_gpu_place(ctx.GetPlace()), + true, + errors::InvalidArgument("Pool operator CUDA kernel must use CUDAPlace " + "rather than CPUPlace.")); + + const DenseTensor* input = &x; + DenseTensor* output = out; + std::vector paddings_ = paddings; + std::vector kernel_size_ = kernel_size; + + ctx.template Alloc(output); + + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + + // update paddings_ + auto x_dims = input->dims(); + DDim data_dims; + if (channel_last) { + data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1); + } else { + data_dims = slice_ddim(x_dims, 2, x_dims.size()); + } + funcs::UpdatePadding(&paddings_, + global_pooling, + adaptive, + padding_algorithm, + data_dims, + strides, + kernel_size_); + if (data_dims.size() * 2 == static_cast(paddings_.size())) { + for (int i = 0; i < data_dims.size(); ++i) { + paddings_.erase(paddings_.begin() + i + 1); + } + } + + if (global_pooling) { + funcs::UpdateKernelSize(&kernel_size_, data_dims); + } + + const std::string str_NCHW = "NCHW", str_NHWC = "NHWC"; + const std::string str_NCDHW = "NCDHW", str_NDHWC = "NDHWC"; + + // -----------------transformed tensor ------------------------ + + DenseTensor transformed_input(input->type()); + DenseTensor transformed_output(output->type()); + GPUDNNDataLayout layout; + + if (data_format == str_NDHWC) { + layout = GPUDNNDataLayout::kNCDHW; + std::vector axis{0, 4, 1, 2, 3}; + + // input + transformed_input.Resize(input->dims()); + + auto in_dims_vec = vectorize(input->dims()); + in_dims_vec[1] = input->dims()[4]; + in_dims_vec[2] = input->dims()[1]; + in_dims_vec[3] = input->dims()[2]; + in_dims_vec[4] = input->dims()[3]; + transformed_input.Resize(make_ddim(in_dims_vec)); + ctx.Alloc(&transformed_input, input->type()); + + funcs::Transpose trans5; + trans5(ctx, *input, &transformed_input, axis); + + // output + transformed_output.Resize(output->dims()); + + auto out_dims_vec = vectorize(output->dims()); + out_dims_vec[1] = output->dims()[4]; + out_dims_vec[2] = output->dims()[1]; + out_dims_vec[3] = output->dims()[2]; + out_dims_vec[4] = output->dims()[3]; + transformed_output.Resize(make_ddim(out_dims_vec)); +#ifdef PADDLE_WITH_HIP + // MIOPEN not support NHWC data layout + } else if (data_format == str_NHWC) { + layout = GPUDNNDataLayout::kNCHW; + + std::vector axis{0, 3, 1, 2}; + + transformed_input.Resize(input->dims()); + auto in_dims_vec = vectorize(input->dims()); + in_dims_vec[1] = input->dims()[3]; + in_dims_vec[2] = input->dims()[1]; + in_dims_vec[3] = input->dims()[2]; + transformed_input.Resize(make_ddim(in_dims_vec)); + ctx.Alloc(&transformed_input, input->type()); + + funcs::Transpose trans; + trans(ctx, *input, &transformed_input, axis); + + transformed_output.Resize(output->dims()); + auto out_dims_vec = vectorize(output->dims()); + 
out_dims_vec[1] = output->dims()[3]; + out_dims_vec[2] = output->dims()[1]; + out_dims_vec[3] = output->dims()[2]; + transformed_output.Resize(make_ddim(out_dims_vec)); +#endif + } else { + layout = GetLayoutFromStr(data_format); + transformed_input = *input; + transformed_output = *output; + } + + const T* tranformed_input_data = transformed_input.data(); + T* tranformed_output_data = ctx.template Alloc(&transformed_output); + + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor input_desc; + ScopedTensorDescriptor output_desc; + ScopedPoolingDescriptor pool_desc; + +#ifdef PADDLE_WITH_HIP + miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, vectorize(transformed_input.dims())); + miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, vectorize(transformed_output.dims())); +#else + cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, vectorize(transformed_input.dims())); + cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, vectorize(transformed_output.dims())); +#endif + PoolingMode pooling_mode; + if (pooling_type == "max") { + pooling_mode = PoolingMode::kMaximum; + } else { + pooling_mode = exclusive ? PoolingMode::kAverageExclusive + : PoolingMode::kAverageInclusive; + } + +#ifdef PADDLE_WITH_HIP + miopenPoolingDescriptor_t cudnn_pool_desc = + pool_desc.descriptor(pooling_mode, kernel_size_, paddings_, strides); +#else + cudnnPoolingDescriptor_t cudnn_pool_desc = + pool_desc.descriptor(pooling_mode, kernel_size_, paddings_, strides); +#endif + + // ------------------- cudnn pool algorithm --------------------- + auto handle = ctx.cudnn_handle(); + ScalingParamType alpha = 1.0f, beta = 0.0f; + +#ifdef PADDLE_WITH_HIP + char* pool_workspace; + size_t pool_workernel_size_ = 0; + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenPoolingGetWorkSpaceSizeV2( + cudnn_pool_desc, cudnn_output_desc, &pool_workernel_size_)); + PADDLE_ENFORCE_GPU_SUCCESS(hipMalloc(&pool_workspace, pool_workernel_size_)); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::miopenPoolingForward(handle, + cudnn_pool_desc, + &alpha, + cudnn_input_desc, + tranformed_input_data, + &beta, + cudnn_output_desc, + tranformed_output_data, + false, + pool_workspace, + pool_workernel_size_)); + PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cudnnPoolingForward(handle, + cudnn_pool_desc, + &alpha, + cudnn_input_desc, + tranformed_input_data, + &beta, + cudnn_output_desc, + tranformed_output_data)); +#endif + // add + if (data_format == str_NDHWC) { + std::vector axis{0, 2, 3, 4, 1}; + funcs::Transpose trans5_v2; + trans5_v2(ctx, transformed_output, output, axis); + } +#ifdef PADDLE_WITH_HIP + // MIOPEN not support NHWC data layout + if (data_format == str_NHWC) { + std::vector axis{0, 2, 3, 1}; + funcs::Transpose trans; + trans(ctx, transformed_output, output, axis); + } +#endif +} + +template +void Pool2dGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out) { + PoolRawGPUDNNKernel(ctx, + x, + kernel_size, + strides, + paddings, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + out); +} + +template +void 
Pool3dGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out) { + PoolRawGPUDNNKernel(ctx, + x, + kernel_size, + strides, + paddings, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + out); +} + +} // namespace phi + +using phi::dtype::float16; + +#ifdef PADDLE_WITH_HIP +// MIOPEN do not support double +PD_REGISTER_KERNEL( + pool2d, GPUDNN, ALL_LAYOUT, phi::Pool2dGPUDNNKernel, float, float16) {} +PD_REGISTER_KERNEL( + pool3d, GPUDNN, ALL_LAYOUT, phi::Pool3dGPUDNNKernel, float, float16) {} +#else +PD_REGISTER_KERNEL(pool2d, + GPUDNN, + ALL_LAYOUT, + phi::Pool2dGPUDNNKernel, + float, + double, + float16) {} +PD_REGISTER_KERNEL(pool3d, + GPUDNN, + ALL_LAYOUT, + phi::Pool3dGPUDNNKernel, + float, + double, + float16) {} +#endif diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h index 0352fdf6fa2f1c1b74515d8e0023ef5a58e4efae..77159bfc876da603f703a13592f525d808adfbbf 100644 --- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h +++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h @@ -121,17 +121,10 @@ struct ReduceMaxFunctor { }; template -struct ExpSubFunctor { - HOSTDEVICE inline ExpSubFunctor() { y = static_cast(0.0f); } - - HOSTDEVICE explicit inline ExpSubFunctor(Tx y) : y((Tx)(y)) {} - +struct ExpFunctor { HOSTDEVICE inline Ty operator()(const Tx& x) const { - return static_cast(std::exp(x - y)); + return static_cast(std::exp(x)); } - - private: - Tx y; }; template @@ -293,10 +286,14 @@ __global__ void WarpSoftmaxForward(T* softmax, } // data src - AccT srcdata[kBatchSize][kLoopsV][kVSize]; - T src_tmp[kBatchSize][kLoopsV][kVSize]; - kps::Init(&srcdata[0][0][0], kLowInf); - kps::Init(&src_tmp[0][0][0], -std::numeric_limits::infinity()); + // src_data: the raw data form global memory + // sub_data: store the data obtained by (src_data - max), used by log_softmax + // exp_data: store the data obtained by (exp(sub_data)), used by softmax + T src_data[kBatchSize][kLoopsV][kVSize]; + AccT sub_data[kBatchSize][kLoopsV][kVSize]; + AccT exp_data[kBatchSize][kLoopsV][kVSize]; + kps::Init(&sub_data[0][0][0], kLowInf); + kps::Init(&src_data[0][0][0], -std::numeric_limits::infinity()); // data dst T out_tmp[kBatchSize][kLoopsV][kVSize]; @@ -313,11 +310,11 @@ __global__ void WarpSoftmaxForward(T* softmax, for (int i = 0; i < kBatchSize; ++i) { const VecT* src_v = reinterpret_cast(&src[(first_batch + i) * stride]); - VecT* reg_v = reinterpret_cast(&src_tmp[i][0][0]); + VecT* reg_v = reinterpret_cast(&src_data[i][0][0]); kps::ReadData( ®_v[0], &src_v[0], idx_max_v[i], 0, kWarpSize, 1); kps::ElementwiseUnary>( - &srcdata[i][0][0], &src_tmp[i][0][0], DataTransFunctor()); + &sub_data[i][0][0], &src_data[i][0][0], DataTransFunctor()); } // compute max @@ -327,14 +324,16 @@ __global__ void WarpSoftmaxForward(T* softmax, 1, ReduceMaxFunctor, kMode::kLocalMode>( - &max[0], &srcdata[0][0][0], ReduceMaxFunctor(), true); + &max[0], &sub_data[0][0][0], ReduceMaxFunctor(), true); WarpReduceMax(max); // compute sum #pragma unroll for (int i = 0; i < kBatchSize; ++i) { - kps::ElementwiseUnary>( - &srcdata[i][0][0], &srcdata[i][0][0], ExpSubFunctor(max[i])); + kps::ElementwiseUnary>( + &sub_data[i][0][0], &sub_data[i][0][0], 
UnarySubFunctor(max[i])); + kps::ElementwiseUnary>( + &exp_data[i][0][0], &sub_data[i][0][0], ExpFunctor()); } kps::Reduce, kMode::kLocalMode>( - &sum[0], &srcdata[0][0][0], kps::AddFunctor(), true); + &sum[0], &exp_data[0][0][0], kps::AddFunctor(), true); WarpReduceSum(sum); // write data to global memory @@ -351,8 +350,15 @@ __global__ void WarpSoftmaxForward(T* softmax, VecT* softmax_v = reinterpret_cast(&softmax[(first_batch + i) * stride]); VecT* reg_v = reinterpret_cast(&out_tmp[i][0][0]); - kps::ElementwiseUnary>( - &out_tmp[i][0][0], &srcdata[i][0][0], UnaryDivFunctor(sum[i])); + if (LogMode) { + kps::ElementwiseUnary>( + &out_tmp[i][0][0], + &sub_data[i][0][0], + UnarySubFunctor(std::log(sum[i]))); + } else { + kps::ElementwiseUnary>( + &out_tmp[i][0][0], &exp_data[i][0][0], UnaryDivFunctor(sum[i])); + } kps::WriteData( &softmax_v[0], ®_v[0], idx_max_v[i], 0, kWarpSize, 1); } @@ -434,15 +440,25 @@ __global__ void WarpSoftmaxBackward(T* dst, AccT sum_tmp[kBatchSize][kLoopsV][kVSize]; AccT* gradptr = reinterpret_cast(&grad_tmp[0][0][0]); AccT* srcptr = reinterpret_cast(&src_tmp[0][0][0]); - kps::ElementwiseBinary>( - &sum_tmp[0][0][0], &gradptr[0], &srcptr[0], kps::MulFunctor()); - kps::Reduce, - kps::details::ReduceMode::kLocalMode>( - &sum[0], &sum_tmp[0][0][0], kps::AddFunctor(), true); + if (LogMode) { + kps::Reduce, + kps::details::ReduceMode::kLocalMode>( + &sum[0], &grad_tmp[0][0][0], kps::AddFunctor(), true); + } else { + kps::ElementwiseBinary>( + &sum_tmp[0][0][0], &gradptr[0], &srcptr[0], kps::MulFunctor()); + kps::Reduce, + kps::details::ReduceMode::kLocalMode>( + &sum[0], &sum_tmp[0][0][0], kps::AddFunctor(), true); + } WarpReduceSum(sum); // write result to global memory @@ -453,10 +469,23 @@ __global__ void WarpSoftmaxBackward(T* dst, if (i >= local_batches) break; AccT* gradptr = reinterpret_cast(&grad_tmp[i][0][0]); AccT* srcptr = reinterpret_cast(&src_tmp[i][0][0]); - kps::ElementwiseUnary>( - &out[i][0][0], &gradptr[0], UnarySubFunctor(sum[i])); - kps::ElementwiseBinary>( - &out_tmp[i][0][0], &srcptr[0], &out[i][0][0], kps::MulFunctor()); + if (LogMode) { + kps::ElementwiseUnary>( + &out[i][0][0], &srcptr[0], ExpMulFunctor(sum[i])); + kps::ElementwiseBinary>( + &out_tmp[i][0][0], + &gradptr[0], + &out[i][0][0], + kps::SubFunctor()); + } else { + kps::ElementwiseUnary>( + &out[i][0][0], &gradptr[0], UnarySubFunctor(sum[i])); + kps::ElementwiseBinary>( + &out_tmp[i][0][0], + &srcptr[0], + &out[i][0][0], + kps::MulFunctor()); + } VecT* dst_v = reinterpret_cast(&dst[(first_batch + i) * stride]); VecT* reg_v = reinterpret_cast(&out_tmp[i][0][0]); kps::WriteData( @@ -639,7 +668,8 @@ __global__ void NormalSoftmaxForward( template class Functor> + template class Functor, + bool LogMode> __global__ void NormalSoftmaxBackward(T* input_grad, const T* output_grad, const T* output, @@ -656,10 +686,17 @@ __global__ void NormalSoftmaxBackward(T* input_grad, // 1. 
reduce sum AccT sum = 0; - for (int mid_id = threadIdx.y; mid_id < mid_dim; mid_id += blockDim.y) { - int data_offset = grad_offset + mid_id * mid_stride; - sum += static_cast(output_grad[data_offset]) * - static_cast(output[data_offset]); + if (LogMode) { + for (int mid_id = threadIdx.y; mid_id < mid_dim; mid_id += blockDim.y) { + int data_offset = grad_offset + mid_id * mid_stride; + sum += static_cast(output_grad[data_offset]); + } + } else { + for (int mid_id = threadIdx.y; mid_id < mid_dim; mid_id += blockDim.y) { + int data_offset = grad_offset + mid_id * mid_stride; + sum += static_cast(output_grad[data_offset]) * + static_cast(output[data_offset]); + } } if (blockDim.y > 1) { kps::Reduce, kMode::kGlobalMode>( @@ -715,10 +752,10 @@ void LaunchNormalSoftmaxBackward(const GPUContext& dev_ctx, dim3 grid, block; GetLaunchConfig(high_dim, mid_dim, low_dim, &grid, &block); if (LogMode) { - NormalSoftmaxBackward< - T, - AccT, - LogSoftmaxBackwardFunctor><<>>( + NormalSoftmaxBackward<<>>( input_grad_data, output_grad_data, output_data, @@ -726,10 +763,10 @@ void LaunchNormalSoftmaxBackward(const GPUContext& dev_ctx, mid_dim, low_dim); } else { - NormalSoftmaxBackward< - T, - AccT, - SoftmaxBackwardFunctor><<>>( + NormalSoftmaxBackward<<>>( input_grad_data, output_grad_data, output_data, @@ -864,6 +901,32 @@ static bool CanUseCudnnSoftmax(const GPUContext& dev_ctx) { return false; } +#if CUDNN_VERSION < 8100 +template <> +inline void SoftmaxForwardCudnnKernel( + const GPUContext& dev_ctx, + const DenseTensor& x, + const int axis, + const bool log_mode, + DenseTensor* out) { + PADDLE_THROW(errors::Unavailable( + "This kernel is not supported when the dtype is bf16 and CUDNN_VERSION < " + "8100.")); +} +template <> +inline void SoftmaxBackwardCudnnKernel( + const GPUContext& dev_ctx, + const DenseTensor& out, + const DenseTensor& dout, + const int axis, + const bool log_mode, + DenseTensor* dx) { + PADDLE_THROW(errors::Unavailable( + "This kernel is not supported when the dtype is bf16 and CUDNN_VERSION < " + "8100.")); +} +#endif + template void SoftmaxForwardCUDAKernelDriver(const GPUContext& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/gpudnn/softmax_kernel.cu b/paddle/phi/kernels/gpudnn/softmax_kernel.cu index 7685c7dbb6894b4e640ea4b63010c4d22fc5e18f..37175c427ffe142c31b41c8356d160d203fd6d73 100644 --- a/paddle/phi/kernels/gpudnn/softmax_kernel.cu +++ b/paddle/phi/kernels/gpudnn/softmax_kernel.cu @@ -21,10 +21,10 @@ limitations under the License. 
*/ namespace phi { template -void SoftmaxRawGPUDNNKernel(const Context& dev_ctx, - const DenseTensor& x, - int axis, - DenseTensor* out) { +void SoftmaxGPUDNNKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + DenseTensor* out) { dev_ctx.template Alloc(out); SoftmaxForwardCUDAKernelDriver(dev_ctx, x, axis, out); } @@ -35,7 +35,7 @@ void SoftmaxRawGPUDNNKernel(const Context& dev_ctx, PD_REGISTER_KERNEL(softmax, GPUDNN, ALL_LAYOUT, - phi::SoftmaxRawGPUDNNKernel, + phi::SoftmaxGPUDNNKernel, float, phi::dtype::float16, phi::dtype::bfloat16) {} @@ -44,7 +44,7 @@ PD_REGISTER_KERNEL(softmax, PD_REGISTER_KERNEL(softmax, GPUDNN, ALL_LAYOUT, - phi::SoftmaxRawGPUDNNKernel, + phi::SoftmaxGPUDNNKernel, float, double, phi::dtype::float16, @@ -53,7 +53,7 @@ PD_REGISTER_KERNEL(softmax, PD_REGISTER_KERNEL(softmax, GPUDNN, ALL_LAYOUT, - phi::SoftmaxRawGPUDNNKernel, + phi::SoftmaxGPUDNNKernel, float, double, phi::dtype::float16) {} diff --git a/paddle/phi/kernels/grid_sample_grad_kernel.h b/paddle/phi/kernels/grid_sample_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..50a8d5be260bd387476467e2cdddaeb59f943b9b --- /dev/null +++ b/paddle/phi/kernels/grid_sample_grad_kernel.h @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void GridSampleGradKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &grid, + const DenseTensor &out_grid, + const std::string &mode, + const std::string &padding_mode, + bool align_corners, + DenseTensor *x_grad, + DenseTensor *grid_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/grid_sample_kernel.h b/paddle/phi/kernels/grid_sample_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..2e1e9b508649b22de086a103537f4984b7f693e5 --- /dev/null +++ b/paddle/phi/kernels/grid_sample_kernel.h @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void GridSampleKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &grid, + const std::string &mode, + const std::string &padding_mode, + bool align_corners, + DenseTensor *out); + +} // namespace phi diff --git a/paddle/phi/kernels/hierarchical_sigmoid_grad_kernel.h b/paddle/phi/kernels/hierarchical_sigmoid_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..f7a327cd3f566d7d3e3da9517ba2f50d67b6ba60 --- /dev/null +++ b/paddle/phi/kernels/hierarchical_sigmoid_grad_kernel.h @@ -0,0 +1,42 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void HierarchicalSigmoidGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& w, + const DenseTensor& label, + const DenseTensor& pre_out, + const DenseTensor& out_grad, + paddle::optional path, + paddle::optional code, + paddle::optional bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + DenseTensor* x_grad, + DenseTensor* w_grad, + DenseTensor* bias_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/hierarchical_sigmoid_kernel.h b/paddle/phi/kernels/hierarchical_sigmoid_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..619b022904b17b3669abe61dc5ce341f6c6ae9bc --- /dev/null +++ b/paddle/phi/kernels/hierarchical_sigmoid_kernel.h @@ -0,0 +1,40 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void HierarchicalSigmoidKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& w, + const DenseTensor& label, + paddle::optional path, + paddle::optional code, + paddle::optional bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + DenseTensor* out, + DenseTensor* pre_out, + DenseTensor* w_out); + +} // namespace phi diff --git a/paddle/phi/kernels/impl/activation_grad_impl.h b/paddle/phi/kernels/impl/activation_grad_impl.h index 80e23d2b8e24b875fcc03bc0c1c149c0c13e3e41..7d6b6dc72ea60214ff4c9974b4ff885feecb5822 100644 --- a/paddle/phi/kernels/impl/activation_grad_impl.h +++ b/paddle/phi/kernels/impl/activation_grad_impl.h @@ -130,4 +130,149 @@ void ReluDoubleGradKernel(const Context& dev_ctx, relu_double_grad_functor); } +template +void LeakyReluDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& ddx, + float alpha, + DenseTensor* ddout) { + funcs::LeakyReluGradGradFunctor leaky_relu_double_grad_functor; + leaky_relu_double_grad_functor.alpha = alpha; + ActivationDoubleGradImpl>( + dev_ctx, + &x, + nullptr, + &ddx, + nullptr, + nullptr, + ddout, + leaky_relu_double_grad_functor); +} + +template +void TanhDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + const DenseTensor& dout, + DenseTensor* dout_new, + DenseTensor* ddout) { + if (dout_new) { + dout_new->Resize(out.dims()); + dev_ctx.template Alloc(dout_new); + } + if (ddout) { + ddout->Resize(out.dims()); + dev_ctx.template Alloc(ddout); + } + funcs::TanhGradGradFunctor functor; + functor(dev_ctx, &out, &ddx, &dout, dout_new, ddout); +} + +template +void TanhTripleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + const DenseTensor& dout, + const DenseTensor& d_ddout, + const DenseTensor& d_dout_new, + DenseTensor* d_out_new, + DenseTensor* d_dout, + DenseTensor* d_ddx) { + if (d_dout) { + d_dout->Resize(out.dims()); + dev_ctx.template Alloc(d_dout); + } + if (d_out_new) { + d_dout->Resize(out.dims()); + dev_ctx.template Alloc(d_out_new); + } + if (d_ddx) { + d_dout->Resize(ddx.dims()); + dev_ctx.template Alloc(d_ddx); + } + funcs::TanhTripleGradFunctor functor; + functor(dev_ctx, + &out, + &ddx, + &dout, + &d_ddout, + &d_dout_new, // input + d_dout, + d_out_new, + d_ddx); // output +} + +template +void EluDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& dout, + const DenseTensor& ddx, + float alpha, + DenseTensor* dx, + DenseTensor* ddout) { + if (dx) { + dx->Resize(x.dims()); + dev_ctx.template Alloc(dx); + } + if (ddout) { + dev_ctx.template Alloc(ddout); + } + funcs::ELUGradGradFunctor functor; + functor.alpha = alpha; + functor(dev_ctx, &x, &ddx, ddout, &dout, dx); +} + +template +void SigmoidDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + const DenseTensor& dout, + DenseTensor* dout_new, + DenseTensor* ddout) { + if (dout_new) { + dout_new->Resize(out.dims()); + dev_ctx.template Alloc(dout_new); + } + if (ddout) { + ddout->Resize(out.dims()); + dev_ctx.template Alloc(ddout); + } + funcs::SigmoidGradGradFunctor functor; + functor(dev_ctx, &out, &ddx, &dout, dout_new, ddout); +} + +template +void SigmoidTripleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + 
const DenseTensor& dout, + const DenseTensor& d_ddout, + const DenseTensor& d_dout_new, + DenseTensor* d_out_new, + DenseTensor* d_dout, + DenseTensor* d_ddx) { + if (d_dout) { + d_dout->Resize(out.dims()); + dev_ctx.template Alloc(d_dout); + } + if (d_out_new) { + d_dout->Resize(out.dims()); + dev_ctx.template Alloc(d_out_new); + } + if (d_ddx) { + d_dout->Resize(ddx.dims()); + dev_ctx.template Alloc(d_ddx); + } + funcs::SigmoidTripleGradFunctor functor; + functor(dev_ctx, + &out, + &ddx, + &dout, + &d_ddout, + &d_dout_new, + d_dout, + d_out_new, + d_ddx); +} + } // namespace phi diff --git a/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h b/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h index 9f557e746378939e32a32955e758cdc5c510f229..e3ea10705d24e90a76246d439c6d9263e072bc39 100644 --- a/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h @@ -19,18 +19,17 @@ #include "paddle/phi/kernels/cholesky_solve_kernel.h" #include "paddle/phi/kernels/complex_kernel.h" #include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/expand_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_reduce.h" -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/funcs/tril_triu_compute.h" #include "paddle/phi/kernels/transpose_kernel.h" -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/operators/tril_triu_op.h" - namespace phi { template @@ -115,7 +114,7 @@ void CholeskySolveGradKernel(const Context& dev_ctx, const auto H = y_bst_dims_vec[y_bst_ndim - 2]; const auto W = y_bst_dims_vec[y_bst_ndim - 1]; phi::funcs::ForRange y_for_range(dev_ctx, dy_bst.numel()); - paddle::operators::TrilTriuCompute tril_triu_functor( + phi::funcs::TrilTriuCompute tril_triu_functor( dy_bst.data(), 0, !upper, H, W, dy_bst_upper.data()); y_for_range(tril_triu_functor); diff --git a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..d8795808a643d2741ca210b13303febd187a193a --- /dev/null +++ b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h @@ -0,0 +1,173 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" + +namespace phi { + +template +HOSTDEVICE T DmcnIm2colBilinear(const T* bottom_data, + const int data_width, + const int height, + const int width, + T h, + T w) { + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + T lh = h - h_low; + T lw = w - w_low; + T hh = 1 - lh; + T hw = 1 - lw; + + T v1 = + (h_low >= 0 && w_low >= 0) ? bottom_data[h_low * data_width + w_low] : 0; + T v2 = (h_low >= 0 && w_high <= width - 1) + ? bottom_data[h_low * data_width + w_high] + : 0; + T v3 = (h_high <= height - 1 && w_low >= 0) + ? bottom_data[h_high * data_width + w_low] + : 0; + T v4 = (h_high <= height - 1 && w_high <= width - 1) + ? bottom_data[h_high * data_width + w_high] + : 0; + + T w1 = hh * hw; + T w2 = hh * lw; + T w3 = lh * hw; + T w4 = lh * lw; + + return w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; +} + +template +void ModulatedDeformableIm2col(const Context& dev_ctx, + const T* data_im, + const T* data_offset, + const T* data_mask, + const std::vector& im_shape, + const std::vector& col_shape, + const std::vector& filter_shape, + const std::vector& paddings, + const std::vector& strides, + const std::vector& dilations, + const int deformable_groups, + T* data_col); + +template +void DeformableConvKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& offset, + const DenseTensor& filter, + const DenseTensor& mask, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + int deformable_groups, + int groups, + int im2col_step, + DenseTensor* out) { + const int batch_size = static_cast(x.dims()[0]); + + std::vector filter_shape_vec(phi::vectorize(filter.dims())); + std::vector output_shape_vec(phi::vectorize(out->dims())); + + // col_shape_vec: {c_i * k_h * k_w, im2col_step, o_h, o_w} + std::vector col_buffer_shape_vec(filter_shape_vec.size()); + col_buffer_shape_vec[0] = x.dims()[1] * filter.dims()[2] * filter.dims()[3]; + col_buffer_shape_vec[1] = im2col_step; + for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) { + col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2]; + } + + std::vector output_buffer_shape_vec(1); + output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] * + output_shape_vec[2] * output_shape_vec[3]; + + DenseTensor col_buffer = Empty(dev_ctx, col_buffer_shape_vec); + DenseTensor output_buffer = Empty(dev_ctx, output_buffer_shape_vec); + + int64_t M = output_shape_vec[1] / groups; + int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3]; + int64_t K = x.dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups; + + DenseTensor weight_3d; + weight_3d.ShareDataWith(filter).Resize(phi::make_ddim({groups, M, K})); + + DenseTensor col_buffer_3d; + col_buffer_3d.ShareDataWith(col_buffer) + .Resize(phi::make_ddim({groups, K, N})); + + DenseTensor output_4d; + output_4d.ShareDataWith(output_buffer) + .Resize(phi::make_ddim({batch_size / im2col_step, groups, M, N})); + + DDim input_shape = phi::slice_ddim(x.dims(), 1, x.dims().size()); + std::vector input_shape_vec = phi::vectorize(input_shape); + + int input_dim = x.numel() / x.dims()[0]; + int input_offset_dim = offset.numel() / offset.dims()[0]; + int input_mask_dim = mask.numel() / mask.dims()[0]; + + auto blas = phi::funcs::GetBlas(dev_ctx); + + const T* input_ptr = x.data(); + const T* offset_ptr = 
offset.data(); + const T* mask_ptr = mask.data(); + T* col_buffer_ptr = col_buffer.data(); + + for (int i = 0; i < batch_size / im2col_step; ++i) { + ModulatedDeformableIm2col(dev_ctx, + input_ptr + i * im2col_step * input_dim, + offset_ptr + i * im2col_step * input_offset_dim, + mask_ptr + i * im2col_step * input_mask_dim, + input_shape_vec, + col_buffer_shape_vec, + filter_shape_vec, + paddings, + strides, + dilations, + deformable_groups, + col_buffer_ptr); + DenseTensor output_3d = output_4d.Slice(i, i + 1).Resize( + phi::slice_ddim(output_4d.dims(), 1, output_4d.dims().size())); + // get the product of pixel and weight + for (int g = 0; g < groups; ++g) { + DenseTensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize( + phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size())); + DenseTensor col_buffer_3d_slice = + col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim( + col_buffer_3d.dims(), 1, col_buffer_3d.dims().size())); + DenseTensor output_3d_slice = output_3d.Slice(g, g + 1).Resize( + phi::slice_ddim(output_3d.dims(), 1, output_3d.dims().size())); + blas.MatMul(weight_3d_slice, + false, + col_buffer_3d_slice, + false, + T(1.0), + &output_3d_slice, + T(0.0)); + } + } + out->ShareDataWith(output_buffer).Resize(phi::make_ddim(output_shape_vec)); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h b/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..e4356e9af39372cd330991502078a13520d05586 --- /dev/null +++ b/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h @@ -0,0 +1,159 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/kernels/determinant_grad_kernel.h" + +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/matrix_inverse.h" +#include "paddle/phi/kernels/funcs/unsqueeze.h" +#include "paddle/phi/kernels/transpose_kernel.h" + +namespace phi { +namespace detail { + +template +struct FoundZeroFunctor { + FoundZeroFunctor(const T* x, int64_t numel, bool* res) + : x_(x), numel_(numel), res_(res) {} + HOSTDEVICE void operator()(size_t idx) const { + if (*res_ || idx >= static_cast(numel_)) { + // founded zero number + return; + } + *res_ = (x_[idx] == static_cast(0)); + } + const T* x_; + int64_t numel_; + bool* res_; +}; + +template +inline bool CheckMatrixInvertible(const Context& dev_ctx, + const DenseTensor* det) { + auto numel = det->numel(); + + DenseTensor dev_tensor = phi::Empty(dev_ctx, {1}); + + // set false + phi::funcs::SetConstant zero; + zero(dev_ctx, &dev_tensor, false); + + // find whether zero + phi::funcs::ForRange for_range(dev_ctx, numel); + FoundZeroFunctor functor(det->data(), numel, dev_tensor.data()); + for_range(functor); + + // copy to host + DenseTensor cpu_tensor; + phi::Copy(dev_ctx, dev_tensor, phi::CPUPlace(), false, &cpu_tensor); + + // if founded zero, the matrix is not invertible + // else the matrix is invertible + auto* res = cpu_tensor.data(); + return !(*res); +} + +} // namespace detail + +template +void DeterminantGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& out_grad, + DenseTensor* x_grad) { + auto input_dims_size = x.dims().size(); + if (input_dims_size > 2) { + PADDLE_ENFORCE_EQ( + out_grad.dims().size() + 2, + input_dims_size, + phi::errors::InvalidArgument( + "The grad tensor of det dims size should be 2 less than" + " input tensor's, but here differ %d", + input_dims_size - out_grad.dims().size())); + } else if (input_dims_size == 2) { + // input dims size 2 and grad dims size 1 is possible + PADDLE_ENFORCE_EQ( + out_grad.dims().size(), + 1, + phi::errors::InvalidArgument( + "The grad tensor of det dims size should be 2 less than" + " input tensor's, but here differ %d", + input_dims_size - out_grad.dims().size())); + } else { + // checked in forward, pass + } + + // Check Whether the matrix is invertible + // (matrix A not invertible) == (det(A)=0) + if (!detail::CheckMatrixInvertible(dev_ctx, &out)) { + // The matrix is not invertible + VLOG(3) << "The input matrix not invertible!"; + x_grad->Resize(x.dims()); + phi::Full( + dev_ctx, phi::vectorize(x.dims()), static_cast(0.0f), x_grad); + return; + } + + // The matrix is invertible + // let |A| = Determinant(A) + // Ref to https://people.maths.ox.ac.uk/gilesm/files/NA-08-01.pdf + // we set d|A| = unsqueeze(dA * |A|, [-1, -2]) * inverse(A).transpose(-2, + // -1) + + // First: inverse(A) + DenseTensor inverse_A; + // A must be square matrices! 
+ inverse_A.Resize(x.dims()); + dev_ctx.template Alloc(&inverse_A); + + phi::funcs::MatrixInverseFunctor mat_inv; + mat_inv(dev_ctx, x, &inverse_A); + + VLOG(3) << "inverse(A) dims: " << inverse_A.dims(); + + // Second: inverse(A).transpose(-2, -1) + DenseTensor transpose_inverse_A = + phi::TransposeLast2Dim(dev_ctx, inverse_A); + + VLOG(3) << "(dA * |A|).transpose(-2, -1) dims: " + << transpose_inverse_A.dims(); + + // Third: dA * |A| + auto mul_dA_detA = phi::Multiply(dev_ctx, out_grad, out); + VLOG(3) << "dA * |A| dims: " << mul_dA_detA.dims(); + + // Fourth: unsqueeze(dA * |A|, [-1, -2]) + auto unsqueeze1 = phi::funcs::Unsqueeze(mul_dA_detA, -1); + auto unsqueeze2 = phi::funcs::Unsqueeze(unsqueeze1, -2); + VLOG(3) << "unsqueezed(dA * |A|) dims: " << unsqueeze2.dims(); + + // Finally: unsqueeze(dA * |A|) * inverse(A) + auto res = phi::Multiply(dev_ctx, unsqueeze2, transpose_inverse_A); + + VLOG(3) << "unsqueeze(dA * |A|) * inverse(A) dims: " << res.dims(); + + x_grad->Resize(x.dims()); + VLOG(3) << "d|A| dims: " << x_grad->dims(); + + phi::Copy(dev_ctx, res, dev_ctx.GetPlace(), false, x_grad); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/determinant_kernel_impl.h b/paddle/phi/kernels/impl/determinant_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..f3a611b89c95c64184a953e5069c8b200317da46 --- /dev/null +++ b/paddle/phi/kernels/impl/determinant_kernel_impl.h @@ -0,0 +1,124 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/determinant_kernel.h" + +#include +#include +#include +#include +#include + +#include "paddle/phi/core/enforce.h" + +#include "paddle/fluid/framework/tensor_util.h" + +namespace phi { +namespace detail { +template +class EigenMatrix {}; + +template <> +class EigenMatrix { + public: + using MatrixType = Eigen::MatrixXf; +}; + +template <> +class EigenMatrix { + public: + using MatrixType = Eigen::MatrixXd; +}; + +inline int64_t GetBatchCount(const DDim dims) { + int64_t batch_count = 1; + auto dim_size = dims.size(); + PADDLE_ENFORCE_GE( + dim_size, + 2, + phi::errors::InvalidArgument( + "the input matrix dimension size should greater than 2.")); + + // Cumulative multiplying each dimension until the last 2 to get the batch + // count, + // for example a tensor with shape [3,3,3,3], the batch count of matrices is + // 9. 
+ for (int64_t i = 0; i < dims.size() - 2; i++) { + batch_count *= dims[i]; + } + + return batch_count; +} +} // namespace detail + +template +struct DeterminantFunctor { + void operator()(const Context& dev_ctx, + const DenseTensor& input, + int64_t rank, + int64_t batch_count, + DenseTensor* output) { + std::vector input_vec; + std::vector output_vec; + paddle::framework::TensorToVector(input, dev_ctx, &input_vec); + for (int64_t i = 0; i < batch_count; ++i) { // maybe can be parallel + auto begin_iter = input_vec.begin() + i * rank * rank; + auto end_iter = input_vec.begin() + (i + 1) * rank * rank; + std::vector sub_vec(begin_iter, + end_iter); // get every square matrix data + typename detail::EigenMatrix::MatrixType matrix(rank, rank); + for (int64_t i = 0; i < rank; ++i) { + for (int64_t j = 0; j < rank; ++j) { + matrix(i, j) = sub_vec[rank * i + j]; + } + } + output_vec.push_back(matrix.determinant()); + } + paddle::framework::TensorFromVector(output_vec, output); + } +}; + +template +void DeterminantKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { + auto input_dim = vectorize(x.dims()); + auto input_dim_size = input_dim.size(); + + auto batch_count = detail::GetBatchCount(x.dims()); + VLOG(10) << "input dim:" << x.dims(); + PADDLE_ENFORCE_GE( + input_dim_size, + 2, + phi::errors::InvalidArgument( + "the input matrix dimension size should greater than 2.")); + PADDLE_ENFORCE_EQ(input_dim[input_dim_size - 1], + input_dim[input_dim_size - 2], + phi::errors::InvalidArgument( + "the input matrix should be square matrix.")); + auto rank = input_dim[input_dim_size - 1]; // square matrix length + DeterminantFunctor()(dev_ctx, x, rank, batch_count, out); + auto output_dims = phi::slice_ddim(x.dims(), 0, input_dim_size - 2); + if (input_dim_size > 2) { + out->Resize(output_dims); + } else { + // when input is a two-dimension matrix, The det value is a number. 
+ out->Resize({1}); + } + VLOG(10) << "output dim:" << out->dims(); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h b/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h index 2f0530b638f5ea3d263f7c2b1a932a65ccaf3da2..5e06435b28e2719c2e9fc18de034073f9674a977 100644 --- a/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h @@ -15,21 +15,15 @@ #pragma once #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/device_context.h" -#include "paddle/phi/kernels/cast_kernel.h" #include "paddle/phi/kernels/complex_kernel.h" -#include "paddle/phi/kernels/full_kernel.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/funcs/diag_functor.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/unsqueeze.h" -#include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/matmul_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" -#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" - namespace phi { template diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index 65427e87506f70549c81acec714ce2f5ebdfc9b8..0b7a5d3bcb26a360eb5f7f664ead7932f428cc64 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -258,6 +258,102 @@ void DivideDoubleGradKernel(const Context& dev_ctx, dout_result.device(place) = static_cast(-1) * dout_result; } } +template +void ElementwiseFMaxGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad, + DenseTensor* y_grad) { + funcs::ElementwiseGradPreProcess(out_grad, x_grad); + + auto out = out_grad; // Fake out, not used + auto x_dim = x.dims(); + auto y_dim = y.dims(); + if (x.dims() == y.dims()) { + funcs::ElemwiseGradComputeNoBroadcast, + funcs::FMaxGradDy>( + dev_ctx, + x_dim, + y_dim, + x, + y, + out, + out_grad, + axis, + x_grad, + y_grad, + funcs::FMaxGradDx(), + funcs::FMaxGradDy()); + } else { + funcs::ElemwiseGradComputeWithBroadcast, + funcs::FMaxGradDy>( + dev_ctx, + x_dim, + y_dim, + x, + y, + out, + out_grad, + axis, + x_grad, + y_grad, + funcs::FMaxGradDx(), + funcs::FMaxGradDy()); + } +} + +template +void ElementwiseFMinGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad, + DenseTensor* y_grad) { + funcs::ElementwiseGradPreProcess(out_grad, x_grad); + auto out = out_grad; // Fake out, not used + auto x_dim = x.dims(); + auto y_dim = y.dims(); + if (x.dims() == y.dims()) { + funcs::ElemwiseGradComputeNoBroadcast, + funcs::FMinGradDy>( + dev_ctx, + x_dim, + y_dim, + x, + y, + out, + out_grad, + axis, + x_grad, + y_grad, + funcs::FMinGradDx(), + funcs::FMinGradDy()); + } else { + funcs::ElemwiseGradComputeWithBroadcast, + funcs::FMinGradDy>( + dev_ctx, + x_dim, + y_dim, + x, + y, + out, + out_grad, + axis, + x_grad, + y_grad, + funcs::FMinGradDx(), + funcs::FMinGradDy()); + } +} template struct MulGradDX { diff --git a/paddle/phi/kernels/impl/elementwise_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_kernel_impl.h new file mode 100644 index 
0000000000000000000000000000000000000000..775a91bf026d298a61315a7e2d7ebfbe92efb0b5 --- /dev/null +++ b/paddle/phi/kernels/impl/elementwise_kernel_impl.h @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#if defined(__NVCC__) || defined(__HIPCC__) +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#endif + +namespace phi { +template +void ElementwiseFMaxKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out) { + dev_ctx.template Alloc(out); + funcs::ElementwiseCompute, T, T>( + dev_ctx, x, y, axis, funcs::FMaxFunctor(), out); +} + +template +void ElementwiseFMinKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out) { + dev_ctx.template Alloc(out); + funcs::ElementwiseCompute, T, T>( + dev_ctx, x, y, axis, funcs::FMinFunctor(), out); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/isclose_kernel_impl.h b/paddle/phi/kernels/impl/isclose_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..25247ceaff6c0a5f52a639176ce04c0589cbbd87 --- /dev/null +++ b/paddle/phi/kernels/impl/isclose_kernel_impl.h @@ -0,0 +1,176 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/dense_tensor.h" + +// TODO(xiongkun): remove the header when decouple the memcpy function in phi. 
+#include "paddle/fluid/memory/memcpy.h" + +namespace phi { +using Tensor = DenseTensor; +template +struct GetTensorValue { + T operator()(const DeviceContext& ctx, const DenseTensor& tensor) const; +}; + +template +struct IscloseFunctor { + void operator()(const DeviceContext& ctx, + const DenseTensor& in, + const DenseTensor& other, + const float rtol, + const float atol, + bool equal_nan, + DenseTensor* output); +}; + +template +struct GetTensorValue { + T operator()(const phi::CPUContext& dev_ctx, + const DenseTensor& tensor) const { + return *(tensor.data()); + } +}; + +template +struct GetTensorValue { + T operator()(const phi::GPUContext& dev_ctx, + const DenseTensor& tensor) const { + const T* data = tensor.data(); + T value; + const auto gpu_place = dev_ctx.GetPlace(); + paddle::memory::Copy( + phi::CPUPlace(), &value, gpu_place, data, sizeof(T), dev_ctx.stream()); + return value; + } +}; + +template +struct IscloseFunctor { + void operator()(const phi::CPUContext& ctx, + const DenseTensor& in, + const DenseTensor& other, + const double rtol, + const double atol, + bool equal_nan, + DenseTensor* output) { + auto* in_a = in.data(); + auto* in_b = other.data(); + auto* out_data = ctx.template Alloc(output); + auto num = in.numel(); + // *out_data = true; + for (int i = 0; i < num; i++) { + out_data[i] = true; + } + for (int i = 0; i < num; i++) { + const T a = in_a[i], b = in_b[i]; + bool val; + if (std::isnan(a) || std::isnan(b)) { + val = equal_nan && std::isnan(a) == std::isnan(b); + } else { + T left = (a > b ? a - b : b - a); + T right = atol + (b > 0 ? rtol * b : (-rtol) * b); + T diff = (left > right ? left - right : right - left); + val = a == b || left <= right || diff <= 1e-15; + } + // *out_data &= val; + out_data[i] = val; + } + } +}; + +#if defined(__NVCC__) || defined(__HIPCC__) +template +__global__ void IscloseCUDAKernel(const T* in_data, + const T* other_data, + const double rtol, + const double atol, + bool equal_nan, + int num, + bool* out_data) { + unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; + bool val; + for (int i = idx; i < num; i += blockDim.x * gridDim.x) { + const T a = in_data[i], b = other_data[i]; + if (isnan(a) || isnan(b)) { + val = equal_nan && isnan(a) == isnan(b); + } else { + T left = (a > b ? a - b : b - a); + T right = atol + (b > 0 ? rtol * b : (-rtol) * b); + T diff = (left > right ? left - right : right - left); + val = a == b || left <= right || diff <= 1e-15; + } + out_data[i] = val; + // if (!val) *out_data = false; + } +} + +template +struct IscloseFunctor { + void operator()(const phi::GPUContext& dev_ctx, + const DenseTensor& in, + const DenseTensor& other, + const double rtol, + const double atol, + bool equal_nan, + DenseTensor* output) { + int num = in.numel(); + const T* in_data = in.data(); + const T* other_data = other.data(); + bool* out_data = dev_ctx.template Alloc(output); + int block = 1024; + int grid = (block - 1 + num) / block; + grid = (grid > block) ? 
block : grid; +#ifdef PADDLE_WITH_HIP + hipMemset(out_data, true, num * sizeof(bool)); +#else + cudaMemset(out_data, true, num * sizeof(bool)); +#endif + IscloseCUDAKernel<<>>( + in_data, other_data, rtol, atol, equal_nan, num, out_data); + } +}; +#endif + +template +void IscloseKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const Scalar& rtol, + const Scalar& atol, + bool equal_nan, + DenseTensor* out) { + PADDLE_ENFORCE_EQ( + atol.dtype(), + DataType::FLOAT64, + phi::errors::InvalidArgument("Input(Atol) type must be double")); + + PADDLE_ENFORCE_EQ( + rtol.dtype(), + DataType::FLOAT64, + phi::errors::InvalidArgument("Input(Rtol) type must be double")); + + IscloseFunctor()( + dev_ctx, x, y, rtol.to(), atol.to(), equal_nan, out); +} +} // namespace phi diff --git a/paddle/phi/kernels/impl/kldiv_loss_grad_kernel_impl.h b/paddle/phi/kernels/impl/kldiv_loss_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..1ae90960ef4455deff5f9b7e08ed9eb7ff62cba3 --- /dev/null +++ b/paddle/phi/kernels/impl/kldiv_loss_grad_kernel_impl.h @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { +using Array1 = Eigen::DSizes; +template +struct KLDivLossBackward { + HOSTDEVICE KLDivLossBackward() {} + + HOSTDEVICE T operator()(const T& target, const T& grad) const { + if (target <= 0) { + return 0; + } else { + return static_cast(-1.) 
* grad; + } + } +}; + +template +void KLDivLossGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& label, + const DenseTensor& d_out, + const std::string& reduction, + DenseTensor* d_x) { + auto& place = *dev_ctx.eigen_device(); + auto* target = &label; + auto* input_grad = d_x; + auto* loss_grad = &d_out; + + const int n = input_grad->dims()[0]; + const int numel = input_grad->numel(); + const int expand = numel / loss_grad->numel(); + + dev_ctx.template Alloc(input_grad); + + auto target_t = phi::EigenVector::Flatten(*target); + + auto input_grad_t = phi::EigenVector::Flatten(*input_grad); + auto loss_grad_t = phi::EigenVector::Flatten(*loss_grad); + + auto loss_grad_expand = loss_grad_t.broadcast(Array1(expand)); + auto grad_t = target_t * loss_grad_expand; + input_grad_t.device(place) = + target_t.binaryExpr(grad_t, KLDivLossBackward()); + + if ("mean" == reduction) { + input_grad_t.device(place) = input_grad_t / static_cast(numel); + } else if ("batchmean" == reduction) { + input_grad_t.device(place) = input_grad_t / static_cast(n); + } +} +} // namespace phi diff --git a/paddle/phi/kernels/impl/kldiv_loss_kernel_impl.h b/paddle/phi/kernels/impl/kldiv_loss_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..ecd23bbfc1c4598c953555a1f0d21dd0f6a989c8 --- /dev/null +++ b/paddle/phi/kernels/impl/kldiv_loss_kernel_impl.h @@ -0,0 +1,69 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
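For the KLDivLossGradKernel just above: the forward loss (defined in the file that follows) is target * (log(target) - input) where target > 0 and zero elsewhere, so its derivative with respect to the input is simply -target. That is exactly what the code computes: grad_t = target * d_out is formed first, and KLDivLossBackward then keeps -grad_t where target > 0 and zero otherwise, i.e. d_x = -target * d_out with d_out broadcast back to the input shape, additionally divided by numel for reduction == "mean" and by the batch size n for reduction == "batchmean".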
+ +#pragma once +#include +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { +using Array1 = Eigen::DSizes; +template +struct KLDivLossForward { + HOSTDEVICE KLDivLossForward() {} + + HOSTDEVICE T operator()(const T& target, const T& input) const { + if (target <= 0) { + return 0; + } else { + return target * (std::log(target) - input); + } + } +}; +template +void KLDivLossKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& label, + const std::string& reduction, + DenseTensor* out) { + auto& place = *(dev_ctx.eigen_device()); + auto* input = &x; + auto* target = &label; + auto* loss = out; + + const int n = input->dims()[0]; + dev_ctx.template Alloc(loss); + + auto input_t = phi::EigenVector::Flatten(*input); + auto target_t = phi::EigenVector::Flatten(*target); + auto loss_t = phi::EigenVector::Flatten(*loss); + auto output = target_t.binaryExpr(input_t, KLDivLossForward()); + if ("none" == reduction) { + loss_t.device(place) = output; + } else if ("batchmean" == reduction) { + auto output_sum = output.sum(); + if (n > 0) { + loss_t.device(place) = output_sum / output_sum.constant(n); + } else { + loss_t.device(place) = output_sum; + } + } else if ("mean" == reduction) { + loss_t.device(place) = output.mean(); + } else if ("sum" == reduction) { + loss_t.device(place) = output.sum(); + } +} +} // namespace phi diff --git a/paddle/phi/kernels/impl/kron_grad_kernel_impl.h b/paddle/phi/kernels/impl/kron_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..30297b53eabb99c4fcccc5c3c7faa04f86d4bb93 --- /dev/null +++ b/paddle/phi/kernels/impl/kron_grad_kernel_impl.h @@ -0,0 +1,295 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
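For reference, the KLDivLossKernel above computes the pointwise term loss_i = y_i * (log y_i - x_i), with x expected to already hold log-probabilities and terms with y_i <= 0 contributing zero; the reduction branch then returns the pointwise tensor ("none"), the mean over all elements ("mean"), the plain sum ("sum"), or the sum divided by the batch size ("batchmean"), which presumably backs paddle.nn.functional.kl_div.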
+ +#pragma once + +#include "paddle/phi/kernels/impl/kron_kernel_impl.h" + +namespace phi { + +template +struct KronGradElemFunctor { + KronGradElemFunctor(const T* dout, + const T* A, + const T* B, + T* dout_a, + T* dout_b, + const int64_t* stride_dout, + const int64_t* stride_a, + const int64_t* stride_b, + const int64_t* shape_b, + const int64_t numel_a, + const int64_t numel_b, + const int ndims) + : dout_(dout), + A_(A), + B_(B), + dout_a_(dout_a), + dout_b_(dout_b), + stride_dout_(stride_dout), + stride_a_(stride_a), + stride_b_(stride_b), + shape_b_(shape_b), + numel_a_(numel_a), + numel_b_(numel_b), + ndims_(ndims) {} + + HOSTDEVICE void operator()(int64_t idx) { + int64_t index = idx; + int64_t index_a = 0; + int64_t index_b = 0; + for (int i = 0; i < ndims_; i++) { + auto pos_i = index / stride_dout_[i]; + index = index % stride_dout_[i]; + auto pos_ai = pos_i / shape_b_[i]; + auto pos_bi = pos_i % shape_b_[i]; + index_a += stride_a_[i] * pos_ai; + index_b += stride_b_[i] * pos_bi; + } + + if (dout_a_) { + size_t index_out_a = index_a * numel_b_ + index_b; + dout_a_[index_out_a] = dout_[idx] * B_[index_b]; + } + if (dout_b_) { + size_t index_out_b = index_b * numel_a_ + index_a; + dout_b_[index_out_b] = dout_[idx] * A_[index_a]; + } + } + + private: + const T* dout_; + const T* A_; + const T* B_; + T* dout_a_; + T* dout_b_; + const int64_t* stride_dout_; + const int64_t* stride_a_; + const int64_t* stride_b_; + const int64_t* shape_b_; + const int64_t numel_a_; + const int64_t numel_b_; + const int ndims_; +}; + +template +struct KronGradElemFunctor> { + KronGradElemFunctor(const dtype::complex* dout, + const dtype::complex* A, + const dtype::complex* B, + dtype::complex* dout_a, + dtype::complex* dout_b, + const int64_t* stride_dout, + const int64_t* stride_a, + const int64_t* stride_b, + const int64_t* shape_b, + const int64_t numel_a, + const int64_t numel_b, + const int ndims) + : dout_(dout), + A_(A), + B_(B), + dout_a_(dout_a), + dout_b_(dout_b), + stride_dout_(stride_dout), + stride_a_(stride_a), + stride_b_(stride_b), + shape_b_(shape_b), + numel_a_(numel_a), + numel_b_(numel_b), + ndims_(ndims) {} + + HOSTDEVICE void operator()(int64_t idx) { + int64_t index = idx; + int64_t index_a = 0; + int64_t index_b = 0; + for (int i = 0; i < ndims_; i++) { + auto pos_i = index / stride_dout_[i]; + index = index % stride_dout_[i]; + auto pos_ai = pos_i / shape_b_[i]; + auto pos_bi = pos_i % shape_b_[i]; + index_a += stride_a_[i] * pos_ai; + index_b += stride_b_[i] * pos_bi; + } + + if (dout_a_) { + size_t index_out_a = index_a * numel_b_ + index_b; + dout_a_[index_out_a] = + dout_[idx] * dtype::complex(B_[index_b].real, -B_[index_b].imag); + } + if (dout_b_) { + size_t index_out_b = index_b * numel_a_ + index_a; + dout_b_[index_out_b] = + dout_[idx] * dtype::complex(A_[index_a].real, -A_[index_a].imag); + } + } + + private: + const dtype::complex* dout_; + const dtype::complex* A_; + const dtype::complex* B_; + dtype::complex* dout_a_; + dtype::complex* dout_b_; + const int64_t* stride_dout_; + const int64_t* stride_a_; + const int64_t* stride_b_; + const int64_t* shape_b_; + const int64_t numel_a_; + const int64_t numel_b_; + const int ndims_; +}; + +template +struct KronGradOpFunctor { + void operator()(const Context& dev_ctx, + const DenseTensor& dout, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* dx, + DenseTensor* dy) { + int ndims = dout.dims().size(); + int64_t numel = dout.numel(); + int64_t numel_x = x.numel(); + int64_t numel_y = y.numel(); + + const 
phi::DDim& dim_x = x.dims(); + const phi::DDim& dim_y = y.dims(); + const phi::DDim& dim_dout = dout.dims(); + + const phi::DDim stride_x = phi::stride(dim_x); + const phi::DDim stride_y = phi::stride(dim_y); + const phi::DDim stride_dout = phi::stride(dim_dout); + + const int64_t* p_stride_x = nullptr; + const int64_t* p_stride_y = nullptr; + const int64_t* p_stride_dout = nullptr; + const int64_t* p_shape_y = nullptr; +#if defined(__NVCC__) || defined(__HIPCC__) + thrust::device_vector d_stride_x(ndims); + thrust::device_vector d_stride_y(ndims); + thrust::device_vector d_stride_dout(ndims); + thrust::device_vector d_shape_y(ndims); + thrust::copy(stride_x.Get(), stride_x.Get() + ndims, d_stride_x.begin()); + thrust::copy(stride_y.Get(), stride_y.Get() + ndims, d_stride_y.begin()); + thrust::copy( + stride_dout.Get(), stride_dout.Get() + ndims, d_stride_dout.begin()); + thrust::copy(dim_y.Get(), dim_y.Get() + ndims, d_shape_y.begin()); + + p_stride_x = thrust::raw_pointer_cast(d_stride_x.data()); + p_stride_y = thrust::raw_pointer_cast(d_stride_y.data()); + p_stride_dout = thrust::raw_pointer_cast(d_stride_dout.data()); + p_shape_y = thrust::raw_pointer_cast(d_shape_y.data()); +#else + p_stride_x = stride_x.Get(); + p_stride_y = stride_y.Get(); + p_stride_dout = stride_dout.Get(); + p_shape_y = dim_y.Get(); +#endif + // dout_x: dout * kron(ones(X), Y) re-aranged in shape (numel_x, numel_y) + // dout_y: dout * kron(X, ones(Y)) re-aranged in shaoe (numel_y, numel_x) + DenseTensor dout_x; + T* p_dout_x = nullptr; + if (dx) { + dout_x.Resize({numel_x, numel_y}); + dev_ctx.template Alloc(&dout_x); + p_dout_x = dout_x.data(); + } + DenseTensor dout_y; + T* p_dout_y = nullptr; + if (dy) { + dout_y.Resize({numel_y, numel_x}); + dev_ctx.template Alloc(&dout_y); + p_dout_y = dout_y.data(); + } + + funcs::ForRange for_range(dev_ctx, numel); + KronGradElemFunctor func(dout.data(), + x.data(), + y.data(), + p_dout_x, + p_dout_y, + p_stride_dout, + p_stride_x, + p_stride_y, + p_shape_y, + numel_x, + numel_y, + ndims); + for_range(func); + +// reduce_sum along aixs 1 +#if defined(__NVCC__) || defined(__HIPCC__) + auto stream = dev_ctx.stream(); // it is a cuda device_context + if (dx) { + funcs::ReduceKernel>( + dev_ctx, dout_x, dx, kps::IdentityFunctor(), {1}); + } + if (dy) { + funcs::ReduceKernel>( + dev_ctx, dout_y, dy, kps::IdentityFunctor(), {1}); + } +#else + auto* place = dev_ctx.eigen_device(); + Eigen::array reduce_dim = {1}; + if (dx) { + auto eigen_dout_x = EigenMatrix::Reshape(dout_x, 1); + auto eigen_vec_dx = EigenVector::Flatten(*dx); + eigen_vec_dx.device(*place) = eigen_dout_x.sum(reduce_dim); + } + if (dy) { + auto eigen_dout_y = EigenMatrix::Reshape(dout_y, 1); + auto eigen_vec_dy = EigenVector::Flatten(*dy); + eigen_vec_dy.device(*place) = eigen_dout_y.sum(reduce_dim); + } +#endif + } +}; + +template +void KronGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + DenseTensor* x_grad, + DenseTensor* y_grad) { + if (x_grad) { + ctx.template Alloc(x_grad); + } + if (y_grad) { + ctx.template Alloc(y_grad); + } + + int ndims = out_grad.dims().size(); + DenseTensor xx = UnsqueezeTo(x, ndims); + DenseTensor yy = UnsqueezeTo(y, ndims); + + DenseTensor* pdxx = nullptr; + DenseTensor* pdyy = nullptr; + DenseTensor dxx; + DenseTensor dyy; + if (x_grad) { + dxx = UnsqueezeTo(*x_grad, ndims); + pdxx = &dxx; + } + + if (y_grad) { + dyy = UnsqueezeTo(*y_grad, ndims); + pdyy = &dyy; + } + + KronGradOpFunctor func; + func(ctx, out_grad, xx, 
yy, pdxx, pdyy); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/kron_kernel_impl.h b/paddle/phi/kernels/impl/kron_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..47c76f59df23bfee68a2660b76a09df747048378 --- /dev/null +++ b/paddle/phi/kernels/impl/kron_kernel_impl.h @@ -0,0 +1,167 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#if defined(__NVCC__) || defined(__HIPCC__) +#include "paddle/phi/kernels/funcs/reduce_function.h" +#include "thrust/device_vector.h" +#endif + +namespace phi { + +inline DenseTensor UnsqueezeTo(const DenseTensor& src, int ndims) { + const phi::DDim& shape = src.dims(); + int rank = shape.size(); + DenseTensor res; + res.ShareDataWith(src); + PADDLE_ENFORCE_LE( + rank, + ndims, + errors::InvalidArgument( + "The input Tensor's rank should be less than or equal to ndims" + "Received input Tensor's rank = %d, ndims = %d", + rank, + ndims)); + if (rank < ndims) { + std::vector new_dim(ndims, 1); + for (int i = ndims - rank; i < ndims; i++) { + new_dim[i] = shape[i - ndims + rank]; + } + res.Resize(phi::make_ddim(new_dim)); + } + return res; +} + +template +struct KronElemFunctor { + KronElemFunctor(const T* a, + const T* b, + T* out, + const int64_t* shape_b, + const int64_t* stride_a, + const int64_t* stride_b, + const int64_t* stride_out, + int ndims) + : a_(a), + b_(b), + out_(out), + shape_b_(shape_b), + stride_a_(stride_a), + stride_b_(stride_b), + stride_out_(stride_out), + ndims_(ndims) {} + + HOSTDEVICE void operator()(int64_t idx) const { + // it computes 1 element in the output + int64_t index = idx; + int64_t index_a = 0; + int64_t index_b = 0; + for (int i = 0; i < ndims_; i++) { + auto pos_i = index / stride_out_[i]; + index = index % stride_out_[i]; + auto pos_ai = pos_i / shape_b_[i]; + auto pos_bi = pos_i % shape_b_[i]; + index_a += stride_a_[i] * pos_ai; + index_b += stride_b_[i] * pos_bi; + } + out_[idx] = a_[index_a] * b_[index_b]; + } + + private: + const T* a_; + const T* b_; + T* out_; + const int64_t* shape_b_; + const int64_t* stride_a_; + const int64_t* stride_b_; + const int64_t* stride_out_; + const int ndims_; +}; + +template +struct KronOpFunctor { + void operator()(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int ndims = out->dims().size(); + int64_t numel = out->numel(); + + const phi::DDim& dim_x = x.dims(); + const phi::DDim& dim_y = y.dims(); + const phi::DDim& dim_out = out->dims(); + const phi::DDim stride_x = phi::stride(dim_x); + const phi::DDim stride_y = phi::stride(dim_y); + const phi::DDim stride_out = phi::stride(dim_out); + + const int64_t *p_stride_x = nullptr, *p_stride_y = nullptr, + *p_stride_out = nullptr, *p_shape_y = nullptr; +#if defined(__NVCC__) 
|| defined(__HIPCC__) + thrust::device_vector d_stride_x(ndims); + thrust::device_vector d_stride_y(ndims); + thrust::device_vector d_stride_out(ndims); + thrust::device_vector d_shape_y(ndims); + thrust::copy(stride_x.Get(), stride_x.Get() + ndims, d_stride_x.begin()); + thrust::copy(stride_y.Get(), stride_y.Get() + ndims, d_stride_y.begin()); + thrust::copy( + stride_out.Get(), stride_out.Get() + ndims, d_stride_out.begin()); + thrust::copy(dim_y.Get(), dim_y.Get() + ndims, d_shape_y.begin()); + + p_stride_x = thrust::raw_pointer_cast(d_stride_x.data()); + p_stride_y = thrust::raw_pointer_cast(d_stride_y.data()); + p_stride_out = thrust::raw_pointer_cast(d_stride_out.data()); + p_shape_y = thrust::raw_pointer_cast(d_shape_y.data()); +#else + p_stride_x = stride_x.Get(); + p_stride_y = stride_y.Get(); + p_stride_out = stride_out.Get(); + p_shape_y = dim_y.Get(); +#endif + + funcs::ForRange for_range(dev_ctx, numel); + KronElemFunctor functor(x.data(), + y.data(), + out->data(), + p_shape_y, + p_stride_x, + p_stride_y, + p_stride_out, + ndims); + for_range(functor); + } +}; + +template +void KronKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + ctx.template Alloc(out); + + int ndims = out->dims().size(); + DenseTensor xx = UnsqueezeTo(x, ndims); + DenseTensor yy = UnsqueezeTo(y, ndims); + + KronOpFunctor func; + func(ctx, xx, yy, out); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h b/paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..8fb1f1c4fa3615cbf33fb7b6e4b0609dbcc2c3a0 --- /dev/null +++ b/paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h @@ -0,0 +1,48 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
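KronElemFunctor and KronGradElemFunctor above share one piece of index arithmetic: each flat output index is decomposed into per-dimension positions via the output strides, and every position is then split by a division and a modulo against the matching dimension of B to recover the source indices in A and B. A minimal standalone illustration of that arithmetic for the 1-D case, with made-up values, is:

// Hypothetical values; shows the division/modulo split used by the functors.
#include <cstdio>

int main() {
  const double a[2] = {2.0, 3.0};        // operand A, shape {2}
  const double b[3] = {5.0, 7.0, 11.0};  // operand B, shape {3}
  const int shape_b = 3;                 // size of B along the (only) dim
  double out[6];                         // kron(A, B) has shape {6}
  for (int idx = 0; idx < 6; ++idx) {
    const int pos_a = idx / shape_b;     // which element of A
    const int pos_b = idx % shape_b;     // which element of B
    out[idx] = a[pos_a] * b[pos_b];
  }
  for (double v : out) std::printf("%g ", v);  // 10 14 22 15 21 33
  std::printf("\n");
  return 0;
}

The gradient functor runs the same decomposition, but scatters dout * B (or dout * conj(B) in the complex specialisation) into a (numel_x, numel_y) buffer that is then reduce-summed along axis 1 to form dx, and symmetrically for dy.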
+ +#pragma once +#include +#include "paddle/phi/kernels/funcs/for_range.h" +namespace phi { +template +struct LgammaGradFunctor { + LgammaGradFunctor(const T* dout, const T* x, T* output, int64_t numel) + : dout_(dout), x_(x), output_(output), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + output_[idx] = dout_[idx] * Eigen::numext::digamma(x_[idx]); + } + + private: + const T* dout_; + const T* x_; + T* output_; + int64_t numel_; +}; +template +void LgammaGradKernel(const Context& dev_ctx, + const DenseTensor& d_out, + const DenseTensor& x, + DenseTensor* d_x) { + auto numel = d_out.numel(); + auto* dout_data = d_out.data(); + auto* x_data = x.data(); + auto* dx_data = + dev_ctx.template Alloc(d_x, static_cast(numel * sizeof(T))); + phi::funcs::ForRange for_range(dev_ctx, numel); + LgammaGradFunctor functor(dout_data, x_data, dx_data, numel); + for_range(functor); +} +} // namespace phi diff --git a/paddle/fluid/operators/matrix_rank_op.h b/paddle/phi/kernels/impl/matrix_rank_kernel_impl.h similarity index 72% rename from paddle/fluid/operators/matrix_rank_op.h rename to paddle/phi/kernels/impl/matrix_rank_kernel_impl.h index 93545fd31037ada823d35af5b5bad809ebf3d773..b0dd76a17eeb363d53f29ba3e6cb3e5bf209edfc 100644 --- a/paddle/fluid/operators/matrix_rank_op.h +++ b/paddle/phi/kernels/impl/matrix_rank_kernel_impl.h @@ -1,4 +1,4 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,14 +13,11 @@ // limitations under the License. #pragma once -#include -#include "paddle/fluid/framework/tensor.h" -#include "paddle/phi/core/ddim.h" -namespace paddle { -namespace operators { -using Tensor = framework::Tensor; -using DDim = framework::DDim; +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/matrix_rank_kernel.h" + +namespace phi { namespace detail { static DDim GetEigenvalueDim(const DDim& dim, int k) { @@ -44,6 +41,18 @@ static DDim RemoveLastDim(const DDim& dim) { vec.erase(vec.end() - 1, vec.end()); return phi::make_ddim(vec); } + +static DDim GetUDDim(const DDim& x_dim, int k) { + auto x_vec = phi::vectorize(x_dim); + x_vec[x_vec.size() - 1] = k; + return phi::make_ddim(x_vec); +} + +static DDim GetVHDDim(const DDim& x_dim, int k) { + auto x_vec = phi::vectorize(x_dim); + x_vec[x_vec.size() - 2] = k; + return phi::make_ddim(x_vec); +} } // namespace detail template @@ -57,5 +66,4 @@ struct GreaterElementFunctor { } }; -} // namespace operators -} // namespace paddle +} // namespace phi diff --git a/paddle/phi/kernels/impl/pool_grad_kernel_impl.h b/paddle/phi/kernels/impl/pool_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..7fe89ce34c8b5a33df12c1931caeddb37de5aea2 --- /dev/null +++ b/paddle/phi/kernels/impl/pool_grad_kernel_impl.h @@ -0,0 +1,332 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
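The LgammaGradKernel above relies on the identity d/dx log Gamma(x) = psi(x), the digamma function, so the input gradient is just d_x = d_out * digamma(x) evaluated elementwise through Eigen::numext::digamma; no saved forward output is needed.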
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/kernels/pool_grad_kernel.h" + +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/pooling.h" +#include "paddle/phi/kernels/pool_kernel.h" + +namespace phi { + +template +void PoolGradRawKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx) { + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + std::vector paddings_ = paddings; + std::vector kernel_size_ = kernel_size; + + // update paddings + auto x_dims = x.dims(); + DDim data_dims; + if (channel_last) { + data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1); + } else { + data_dims = slice_ddim(x_dims, 2, x_dims.size()); + } + funcs::UpdatePadding(&paddings_, + global_pooling, + adaptive, + padding_algorithm, + data_dims, + strides, + kernel_size_); + if (data_dims.size() * 2 == static_cast(paddings_.size())) { + for (int i = 0; i < data_dims.size(); ++i) { + paddings_.erase(paddings_.begin() + i + 1); + } + } + + if (global_pooling) { + funcs::UpdateKernelSize(&kernel_size_, data_dims); + } + + if (dx) { + ctx.template Alloc(dx); + funcs::SetConstant set_constant; + set_constant(ctx, dx, static_cast(0.0)); + + switch (kernel_size_.size()) { + case 2: { + if (pooling_type == "max") { + funcs::MaxPool2dGradFunctor pool2d_backward; + pool2d_backward(ctx, + x, + out, + dout, + kernel_size_, + strides, + paddings_, + data_format, + dx); + } else if (pooling_type == "avg") { + funcs::Pool2dGradFunctor, T> + pool2d_backward; + funcs::AvgPoolGrad pool_process; + pool2d_backward(ctx, + x, + out, + dout, + kernel_size_, + strides, + paddings_, + data_format, + exclusive, + adaptive, + dx, + pool_process); + } + } break; + case 3: { + if (pooling_type == "max") { + funcs::MaxPool3dGradFunctor pool3d_backward; + pool3d_backward(ctx, + x, + out, + dout, + kernel_size_, + strides, + paddings_, + data_format, + dx); + } else if (pooling_type == "avg") { + funcs::Pool3dGradFunctor, T> + pool3d_backward; + funcs::AvgPoolGrad pool_process; + pool3d_backward(ctx, + x, + out, + dout, + kernel_size_, + strides, + paddings_, + data_format, + exclusive, + adaptive, + dx, + pool_process); + } + } break; + default: { + PADDLE_THROW( + errors::InvalidArgument("Pool op only supports 2D and 3D input.")); + } + } + } +} + +template +void MaxPoolWithIndexGradRawKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& mask, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* dx) { + std::vector paddings_ = paddings; + std::vector kernel_size_ = kernel_size; + + if (global_pooling) { + for (size_t i = 0; i < kernel_size_.size(); ++i) { + paddings_[i] = 0; + kernel_size_[i] = static_cast(dx->dims()[i + 2]); + } + } + + if (dx) { + ctx.template Alloc(dx); + funcs::set_constant(ctx, dx, 0); + + switch (kernel_size_.size()) { + case 2: { + funcs::MaxPool2dWithIndexGradFunctor pool2d_backward; + pool2d_backward( + ctx, dout, mask, kernel_size_, strides, paddings_, 
adaptive, dx); + } break; + case 3: { + funcs::MaxPool3dWithIndexGradFunctor pool3d_backward; + pool3d_backward( + ctx, dout, mask, kernel_size_, strides, paddings_, adaptive, dx); + } break; + default: { + PADDLE_THROW( + errors::InvalidArgument("Pool op only supports 2D and 3D input.")); + } + } + } +} + +template +void Pool2dGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx) { + PoolGradRawKernel(ctx, + x, + out, + dout, + kernel_size, + strides, + paddings, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + dx); +} + +template +void Pool2dDoubleGradKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out) { + if (pooling_type == "max") { + PADDLE_THROW( + errors::InvalidArgument("Pool op grad grad only supports avgpool.")); + } else { + Pool2dKernel(ctx, + x, + kernel_size, + strides, + paddings, + ceil_mode, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + out); + } +} + +template +void MaxPool2dWithIndexGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& mask, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* dx) { + MaxPoolWithIndexGradRawKernel(ctx, + x, + mask, + dout, + kernel_size, + strides, + paddings, + global_pooling, + adaptive, + dx); +} + +template +void Pool3dGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx) { + PoolGradRawKernel(ctx, + x, + out, + dout, + kernel_size, + strides, + paddings, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + dx); +} + +template +void MaxPool3dWithIndexGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& mask, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* dx) { + MaxPoolWithIndexGradRawKernel(ctx, + x, + mask, + dout, + kernel_size, + strides, + paddings, + global_pooling, + adaptive, + dx); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/pool_kernel_impl.h b/paddle/phi/kernels/impl/pool_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..665d02fd0173e0b4dec7de7bfbf89cfa13d92f3f --- /dev/null +++ b/paddle/phi/kernels/impl/pool_kernel_impl.h @@ -0,0 +1,321 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
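One detail in PoolGradRawKernel above that is easy to misread (the forward PoolRawKernel that follows repeats the same sequence): after funcs::UpdatePadding the paddings vector may hold a (begin, end) pair per spatial dimension, and the erase loop then keeps only the first entry of each pair. A standalone trace of that collapse, with hypothetical values, is:

// Hypothetical values; traces the padding-collapse loop from the kernel above.
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> paddings = {1, 1, 2, 2};  // (begin, end) per spatial dim
  const int num_data_dims = 2;
  if (static_cast<int>(paddings.size()) == num_data_dims * 2) {
    for (int i = 0; i < num_data_dims; ++i) {
      paddings.erase(paddings.begin() + i + 1);  // drop the trailing entry
    }
  }
  for (int p : paddings) std::printf("%d ", p);  // prints: 1 2
  std::printf("\n");
  return 0;
}

Pool2dGradKernel, Pool3dGradKernel and the WithIndex variants are then thin wrappers that forward their arguments to the two raw kernels.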
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/kernels/pool_kernel.h" + +#include +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/funcs/pooling.h" + +#if defined(__HIPCC__) || defined(__NVCC__) +#include "paddle/phi/kernels/funcs/reduce_function.h" +#include "paddle/phi/kernels/primitive/functor_primitives.h" +#endif + +namespace phi { + +inline int GetReduceNum(const DenseTensor& input, + const DenseTensor* output, + const std::string data_format, + std::vector* reduce_dim) { + // data_format only can be NCHW + bool channel_last = (data_format == "NHWC"); + if (channel_last) { + return 0; + } + int reduce_num = 0; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + if ((output_height == 1) && (output_width == 1)) { + reduce_dim->push_back(2); + reduce_dim->push_back(3); + reduce_num = input.dims()[2] * input.dims()[3]; + } + return reduce_num; +} + +template +void PoolRawKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out) { + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + std::vector paddings_ = paddings; + std::vector kernel_size_ = kernel_size; + + // update paddings + auto x_dims = x.dims(); + DDim data_dims; + if (channel_last) { + data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1); + } else { + data_dims = slice_ddim(x_dims, 2, x_dims.size()); + } + + funcs::UpdatePadding(&paddings_, + global_pooling, + adaptive, + padding_algorithm, + data_dims, + strides, + kernel_size_); + + if (data_dims.size() * 2 == static_cast(paddings_.size())) { + for (int i = 0; i < data_dims.size(); ++i) { + paddings_.erase(paddings_.begin() + i + 1); + } + } + + if (global_pooling) { + funcs::UpdateKernelSize(&kernel_size_, data_dims); + } + + switch (kernel_size_.size()) { + case 2: { + if (pooling_type == "max") { + funcs::Pool2dFunctor, T> pool2d_forward; + funcs::MaxPool pool_process; + pool2d_forward(ctx, + x, + kernel_size_, + strides, + paddings_, + data_format, + true, + false, + out, + pool_process); + + } else if (pooling_type == "avg") { + std::vector reduce_dim; + int reduce_num = GetReduceNum(x, out, data_format, &reduce_dim); + if (reduce_num > 0 && + adaptive) { // for adaptive_avg_pool2d && output_size == 1 +#if defined(__HIPCC__) || defined(__NVCC__) + auto stream = ctx.stream(); + funcs::ReduceKernel>( + ctx, x, out, kps::DivideFunctor(reduce_num), reduce_dim); +#else // for cpu + funcs::Pool2dFunctor, T> pool2d_forward; + funcs::AvgPool pool_process; + pool2d_forward(ctx, + x, + kernel_size_, + strides, + paddings_, + data_format, + exclusive, + adaptive, + out, + pool_process); +#endif + } else { // avgpool_2d or adaptive_avg_pool2d && output_size != 1 + funcs::Pool2dFunctor, T> pool2d_forward; + 
funcs::AvgPool pool_process; + pool2d_forward(ctx, + x, + kernel_size_, + strides, + paddings_, + data_format, + exclusive, + adaptive, + out, + pool_process); + } + } + } break; + case 3: { + if (pooling_type == "max") { + funcs::Pool3dFunctor, T> pool3d_forward; + funcs::MaxPool pool_process; + pool3d_forward(ctx, + x, + kernel_size_, + strides, + paddings_, + data_format, + true, + false, + out, + pool_process); + } else if (pooling_type == "avg") { + funcs::Pool3dFunctor, T> pool3d_forward; + funcs::AvgPool pool_process; + pool3d_forward(ctx, + x, + kernel_size_, + strides, + paddings_, + data_format, + exclusive, + adaptive, + out, + pool_process); + } + } break; + default: { + PADDLE_THROW( + errors::InvalidArgument("Pool op only supports 2D and 3D input.")); + } + } +} + +template +void MaxPoolWithIndexRawKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* out, + DenseTensor* mask) { + std::vector paddings_ = paddings; + std::vector kernel_size_ = kernel_size; + + if (global_pooling) { + for (size_t i = 0; i < kernel_size_.size(); ++i) { + paddings_[i] = 0; + kernel_size_[i] = static_cast(x.dims()[i + 2]); + } + } + + switch (kernel_size_.size()) { + case 2: { + funcs::MaxPool2dWithIndexFunctor pool2d_forward; + pool2d_forward( + ctx, x, kernel_size_, strides, paddings_, adaptive, out, mask); + } break; + case 3: { + funcs::MaxPool3dWithIndexFunctor pool3d_forward; + pool3d_forward( + ctx, x, kernel_size_, strides, paddings_, adaptive, out, mask); + } break; + default: { + PADDLE_THROW( + errors::InvalidArgument("Pool op only supports 2D and 3D input.")); + } + } +} + +template +void Pool2dKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out) { + PoolRawKernel(ctx, + x, + kernel_size, + strides, + paddings, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + out); +} + +template +void MaxPool2dWithIndexKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* out, + DenseTensor* mask) { + MaxPoolWithIndexRawKernel(ctx, + x, + kernel_size, + strides, + paddings, + global_pooling, + adaptive, + out, + mask); +} + +template +void Pool3dKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out) { + PoolRawKernel(ctx, + x, + kernel_size, + strides, + paddings, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + out); +} + +template +void MaxPool3dWithIndexKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* out, + DenseTensor* mask) { + MaxPoolWithIndexRawKernel(ctx, + x, + kernel_size, + 
strides, + paddings, + global_pooling, + adaptive, + out, + mask); +} + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/reduce_grad.h b/paddle/phi/kernels/impl/reduce_grad.h similarity index 100% rename from paddle/phi/kernels/cpu/reduce_grad.h rename to paddle/phi/kernels/impl/reduce_grad.h diff --git a/paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h b/paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..4a74416e3916492e6d3a40e09ca347db485fff7c --- /dev/null +++ b/paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/reduce_grad_kernel.h" + +#include "paddle/phi/kernels/funcs/reduce_functor.h" +#include "paddle/phi/kernels/impl/reduce_grad.h" + +namespace phi { + +template +void ReduceMaxGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const DenseTensor& out, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad) { + ReduceGradKernel(dev_ctx, + x, + out_grad, + out, + dims, + keep_dim, + reduce_all, + in_dtype, + out_dtype, + x_grad); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h b/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..baaa544f137366f1e0343c25bc373cc08350f7fd --- /dev/null +++ b/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
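In reduce_max_grad_kernel_impl.h above (and in the min and prod variants that follow) the call into the shared ReduceGradKernel lost its template argument list in this rendering. Based on the functors that funcs/reduce_functor.h inherited from the old fluid reduce ops, the max kernel presumably instantiates it roughly as sketched below; the angle-bracket contents and the int64_t element type of dims are assumptions, not text from the patch.

// Reconstruction sketch; <Context, T, funcs::MaxOrMinGradFunctor> and
// std::vector<int64_t> are assumed, mirroring the old fluid reduce_grad path.
template <typename T, typename Context>
void ReduceMaxGradKernel(const Context& dev_ctx,
                         const DenseTensor& x,
                         const DenseTensor& out_grad,
                         const DenseTensor& out,
                         const std::vector<int64_t>& dims,
                         bool keep_dim,
                         bool reduce_all,
                         DataType in_dtype,
                         DataType out_dtype,
                         DenseTensor* x_grad) {
  ReduceGradKernel<Context, T, funcs::MaxOrMinGradFunctor>(
      dev_ctx, x, out_grad, out, dims, keep_dim, reduce_all,
      in_dtype, out_dtype, x_grad);
}

The min variant would reuse the same functor and the prod variant its ProdGradFunctor counterpart, again assuming the fluid functor names carried over.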
+ +#pragma once + +#include "paddle/phi/kernels/reduce_grad_kernel.h" + +#include "paddle/phi/kernels/funcs/reduce_functor.h" +#include "paddle/phi/kernels/impl/reduce_grad.h" + +namespace phi { + +template +void ReduceMinGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const DenseTensor& out, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad) { + ReduceGradKernel(dev_ctx, + x, + out_grad, + out, + dims, + keep_dim, + reduce_all, + in_dtype, + out_dtype, + x_grad); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h b/paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..6b93e98cec0168ab55e15e3401a72738f79d3a07 --- /dev/null +++ b/paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/reduce_grad_kernel.h" + +#include "paddle/phi/kernels/funcs/reduce_functor.h" +#include "paddle/phi/kernels/impl/reduce_grad.h" + +namespace phi { + +template +void ReduceProdGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const DenseTensor& out, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad) { + ReduceGradKernel(dev_ctx, + x, + out_grad, + out, + dims, + keep_dim, + reduce_all, + in_dtype, + out_dtype, + x_grad); +} + +} // namespace phi diff --git a/paddle/fluid/operators/searchsorted_op.h b/paddle/phi/kernels/impl/searchsorted_kernel_impl.h similarity index 58% rename from paddle/fluid/operators/searchsorted_op.h rename to paddle/phi/kernels/impl/searchsorted_kernel_impl.h index 6aa38a815813230c2921f3d3816881966df6bf98..82bd9fba2a66d7a4601b5aab360b9bbf80ff04d9 100644 --- a/paddle/fluid/operators/searchsorted_op.h +++ b/paddle/phi/kernels/impl/searchsorted_kernel_impl.h @@ -1,4 +1,4 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -16,16 +16,11 @@ #include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/for_range.h" #include "paddle/phi/core/ddim.h" #include "paddle/phi/kernels/funcs/algorithm.h" +#include "paddle/phi/kernels/funcs/for_range.h" -namespace paddle { -namespace operators { -using Tensor = framework::Tensor; +namespace phi { template class GpuAndCpuSearchSortedCompute { @@ -65,9 +60,11 @@ class GpuAndCpuSearchSortedCompute { static HOSTDEVICE bool IsInf(int64_t x) { return false; } HOSTDEVICE GpuAndCpuSearchSortedCompute(const T1* sequence_data, - const T2* value_data, bool right, + const T2* value_data, + bool right, bool is_1d_boundaries, - int64_t val_size, int64_t seq_size, + int64_t val_size, + int64_t seq_size, OutType* out_data) : sequence_data_(sequence_data), value_data_(value_data), @@ -104,12 +101,13 @@ class GpuAndCpuSearchSortedCompute { OutType* out_data_; }; -template +template class SearchSortedFunctor { public: - SearchSortedFunctor(const framework::ExecutionContext& context, - const framework::Tensor* sorted_sequence, - const framework::Tensor* value, bool right, + SearchSortedFunctor(const Context& context, + const DenseTensor* sorted_sequence, + const DenseTensor* value, + bool right, OutType* out_data) : context_(context), sorted_sequence_(sorted_sequence), @@ -121,74 +119,73 @@ class SearchSortedFunctor { void apply() { const T1* sequence_data = sorted_sequence_->data(); const T2* value_data = value_->data(); - const framework::DDim& seq_dims = sorted_sequence_->dims(); - const framework::DDim& val_dims = value_->dims(); + const phi::DDim& seq_dims = sorted_sequence_->dims(); + const phi::DDim& val_dims = value_->dims(); bool is_1d_boundaries = seq_dims.size() == 1; int64_t val_size = val_dims[val_dims.size() - 1]; int64_t seq_size = seq_dims[seq_dims.size() - 1]; - auto& dev_ctx = context_.template device_context(); - platform::ForRange for_range(dev_ctx, value_->numel()); + funcs::ForRange for_range(context_, value_->numel()); GpuAndCpuSearchSortedCompute - gpu_and_cpu_search_sorted_compute(sequence_data, value_data, right_, - is_1d_boundaries, val_size, seq_size, + gpu_and_cpu_search_sorted_compute(sequence_data, + value_data, + right_, + is_1d_boundaries, + val_size, + seq_size, out_data_); for_range(gpu_and_cpu_search_sorted_compute); } private: - const framework::ExecutionContext& context_; - const framework::Tensor* sorted_sequence_; - const framework::Tensor* value_; + const Context& context_; + const DenseTensor* sorted_sequence_; + const DenseTensor* value_; bool right_; OutType* out_data_; }; template -static void VisitDataType(framework::proto::VarType::Type type, - Visitor visitor) { - if (type == framework::proto::VarType::FP32) { +static void VisitDataType(DataType type, Visitor visitor) { + if (type == DataType::FLOAT32) { visitor.template apply(); - } else if (type == framework::proto::VarType::FP64) { + } else if (type == DataType::FLOAT64) { visitor.template apply(); - } else if (type == framework::proto::VarType::INT32) { + } else if (type == DataType::INT32) { visitor.template apply(); - } else if (type == framework::proto::VarType::INT64) { + } else if (type == DataType::INT64) { visitor.template apply(); } else { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(errors::InvalidArgument( "The recieved values data type %s can not meet input requirements. 
" "Because the given values data type of searchsorted operators must be " "float32, float64, int32 or int64. Please input appropriate " "sorted_sequence again! ", - framework::DataTypeToString(type))); + type)); } } -template -class SearchSortedKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* sorted_sequence = context.Input("SortedSequence"); - auto* value = context.Input("Values"); - bool out_int32 = context.Attr("out_int32"); - bool right = context.Attr("right"); - auto* out = context.Output("Out"); - - if (out_int32) { - int* out_data = out->mutable_data(context.GetPlace()); - SearchSortedFunctor functor( - context, sorted_sequence, value, right, out_data); - VisitDataType(framework::TransToProtoVarType(value->dtype()), functor); - } else { - int64_t* out_data = out->mutable_data(context.GetPlace()); - SearchSortedFunctor functor( - context, sorted_sequence, value, right, out_data); - VisitDataType(framework::TransToProtoVarType(value->dtype()), functor); - } +template +void SearchsortedKernel(const Context& ctx, + const DenseTensor& sorted_sequence, + const DenseTensor& value, + bool out_int32, + bool right, + DenseTensor* out) { + if (out_int32) { + ctx.template Alloc(out); + int* out_data = out->data(); + SearchSortedFunctor functor( + ctx, &sorted_sequence, &value, right, out_data); + VisitDataType(value.dtype(), functor); + } else { + ctx.template Alloc(out); + int64_t* out_data = out->data(); + SearchSortedFunctor functor( + ctx, &sorted_sequence, &value, right, out_data); + VisitDataType(value.dtype(), functor); } -}; +} -} // namespace operators -} // namespace paddle +} // namespace phi diff --git a/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h b/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..4947170088cba9701ad1065098451b97139bfc95 --- /dev/null +++ b/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h @@ -0,0 +1,344 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/dense_tensor.h" + +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +#include "paddle/fluid/operators/strided_slice_op.h" + +namespace phi { + +inline void GetOffsets(const DDim& big_dim, + const DDim& small_dim, + DDim start_offset, + int cur_dim, + std::vector* offsets) { + if (cur_dim == big_dim.size()) { + offsets->push_back(start_offset); + return; + } + if (small_dim[cur_dim] == big_dim[cur_dim]) { + GetOffsets(big_dim, small_dim, start_offset, cur_dim + 1, offsets); + } else { + for (int i = 0; i < big_dim[cur_dim]; i++) { + GetOffsets(big_dim, small_dim, start_offset, cur_dim + 1, offsets); + start_offset[cur_dim] += 1; + } + } +} + +template +void SetValueGradImpl(const Context& dev_ctx, + const DenseTensor& out_grad, + const ScalarArray& starts, + const ScalarArray& ends, + const ScalarArray& steps, + const std::vector& axes, + const std::vector& decrease_axes, + const std::vector& none_axes, + DenseTensor* x_grad, + DenseTensor* value_grad) { + PADDLE_ENFORCE_EQ( + out_grad.IsInitialized(), + true, + errors::PermissionDenied( + "The input of `set_value_grad`(out_grad) has not been initialized")); + + auto in_dims = out_grad.dims(); + + std::vector decrease_axis_int32(decrease_axes.begin(), + decrease_axes.end()); + std::vector axes_int32(axes.begin(), axes.end()); + std::vector infer_flags(axes.size(), 1); + std::vector out_dims_vector(in_dims.size(), -1); + std::vector starts_local = starts.GetData(); + std::vector ends_local = ends.GetData(); + std::vector steps_local = steps.GetData(); + paddle::operators::StridedSliceOutDims(starts_local, + ends_local, + steps_local, + axes_int32, + infer_flags, + in_dims, + decrease_axis_int32, + out_dims_vector.data(), + axes.size(), + false); + + DDim out_dims(phi::make_ddim(out_dims_vector)); + + std::vector reverse_vector(starts_local.size(), 0); + paddle::operators::StridedSliceFunctor(starts_local.data(), + ends_local.data(), + steps_local.data(), + axes_int32.data(), + reverse_vector.data(), + in_dims, + infer_flags, + decrease_axis_int32, + starts_local.size()); + + auto starts_indices = Eigen::DSizes(); + auto ends_indices = Eigen::DSizes(); + auto steps_indices = Eigen::DSizes(); + auto reverse_axis = Eigen::array(); + + for (size_t axis = 0; axis < RANK; axis++) { + starts_indices[axis] = 0; + ends_indices[axis] = out_dims[axis]; + steps_indices[axis] = 1; + reverse_axis[axis] = false; + } + + for (size_t axis = 0; axis < axes.size(); axis++) { + int axis_index = axes[axis]; + starts_indices[axis_index] = starts_local[axis]; + ends_indices[axis_index] = ends_local[axis]; + steps_indices[axis_index] = steps_local[axis]; + reverse_axis[axis_index] = (reverse_vector[axis] == 1) ? 
true : false; + } + + bool need_reverse = false; + for (size_t axis = 0; axis < axes.size(); axis++) { + if (reverse_vector[axis] == 1) { + need_reverse = true; + break; + } + } + + auto& place = *dev_ctx.eigen_device(); + phi::funcs::SetConstant set_zero; + + if (x_grad) { + // Set gradient of `Input` + Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); + + auto x_grad_t = + EigenTensor::From(*x_grad); + + DenseTensor tmp = Full(dev_ctx, out_dims_vector, static_cast(0)); + auto tmp_t = + EigenTensor::From(tmp); + + x_grad_t.stridedSlice(starts_indices, ends_indices, steps_indices) + .device(place) = tmp_t; + } + if (value_grad) { + dev_ctx.template Alloc(value_grad); + set_zero(dev_ctx, value_grad, static_cast(0)); + + auto in_t = EigenTensor::From( + out_grad); + + if (value_grad->dims() == out_dims) { + auto value_grad_t = + EigenTensor::From( + *value_grad); + if (need_reverse) { + DenseTensor tmp = Full(dev_ctx, out_dims_vector, static_cast(0)); + auto tmp_t = + EigenTensor::From(tmp); + + tmp_t.device(place) = + in_t.stridedSlice(starts_indices, ends_indices, steps_indices); + value_grad_t.device(place) = tmp_t.reverse(reverse_axis); + } else { + value_grad_t.device(place) = + in_t.stridedSlice(starts_indices, ends_indices, steps_indices); + } + } else { + int out_dims_size = out_dims.size(); + auto value_grad_dims = value_grad->dims(); + auto fake_value_grad_dims = out_dims; + + // Create an extented shape according to the rules of broadcast. + auto value_grad_dims_size = value_grad_dims.size(); + + int num_decrease = 0; + + int decrease_axis_size = decrease_axes.size(); + for (int i = 0; i < out_dims_size; i++) { + if (decrease_axes.end() != + std::find(decrease_axes.begin(), decrease_axes.end(), i)) { + fake_value_grad_dims[i] = 1; + num_decrease++; + } else if (i < out_dims_size - (value_grad_dims_size + + decrease_axis_size - num_decrease)) { + fake_value_grad_dims[i] = 1; + } else { + auto index_grad = + i - (out_dims_size - + (value_grad_dims_size + decrease_axis_size - num_decrease)); + fake_value_grad_dims[i] = value_grad_dims[index_grad]; + + PADDLE_ENFORCE_EQ((out_dims[i] == value_grad_dims[index_grad]) || + (value_grad_dims[index_grad] == 1), + true, + errors::InvalidArgument( + "An error occurred while calculating %s: " + "[%s] can not be accumulated into [%s].", + paddle::framework::GradVarName("ValueTensor"), + out_dims, + value_grad_dims)); + } + } + + VLOG(3) << "Dimensions of " + << paddle::framework::GradVarName("ValueTensor") << "([" + << value_grad_dims << "])is broadcasted into [" + << fake_value_grad_dims << "]."; + + auto extent = Eigen::DSizes(); + auto offset = out_dims; + for (int i = 0; i < out_dims_size; i++) { + offset[i] = 0; + extent[i] = fake_value_grad_dims[i]; + } + std::vector offsets; + GetOffsets(out_dims, fake_value_grad_dims, offset, 0, &offsets); + + auto value_grad_t = + EigenTensor::From( + *value_grad, fake_value_grad_dims); + + DenseTensor tmp = Full(dev_ctx, out_dims_vector, static_cast(0)); + auto tmp_t = + EigenTensor::From(tmp); + + tmp_t.device(place) = + in_t.stridedSlice(starts_indices, ends_indices, steps_indices); + + // accumulate gradient + for (auto offset : offsets) { + value_grad_t.device(place) = + value_grad_t + tmp_t.slice(EigenDim::From(offset), extent); + } + if (need_reverse) { + DenseTensor tmp_value = + Full(dev_ctx, + {fake_value_grad_dims.Get(), fake_value_grad_dims.size()}, + static_cast(0)); + auto tmp_value_t = + EigenTensor::From( + tmp_value); + tmp_value_t.device(place) = 
value_grad_t.reverse(reverse_axis); + value_grad_t.device(place) = tmp_value_t; + } + } + } +} + +template +void SetValueGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const ScalarArray& starts, + const ScalarArray& ends, + const ScalarArray& steps, + const std::vector& axes, + const std::vector& decrease_axes, + const std::vector& none_axes, + DenseTensor* x_grad, + DenseTensor* value_grad) { + const int rank = out_grad.dims().size(); + + switch (rank) { + case 1: + SetValueGradImpl(dev_ctx, + out_grad, + starts, + ends, + steps, + axes, + decrease_axes, + none_axes, + x_grad, + value_grad); + break; + case 2: + SetValueGradImpl(dev_ctx, + out_grad, + starts, + ends, + steps, + axes, + decrease_axes, + none_axes, + x_grad, + value_grad); + break; + case 3: + SetValueGradImpl(dev_ctx, + out_grad, + starts, + ends, + steps, + axes, + decrease_axes, + none_axes, + x_grad, + value_grad); + break; + case 4: + SetValueGradImpl(dev_ctx, + out_grad, + starts, + ends, + steps, + axes, + decrease_axes, + none_axes, + x_grad, + value_grad); + break; + case 5: + SetValueGradImpl(dev_ctx, + out_grad, + starts, + ends, + steps, + axes, + decrease_axes, + none_axes, + x_grad, + value_grad); + break; + case 6: + SetValueGradImpl(dev_ctx, + out_grad, + starts, + ends, + steps, + axes, + decrease_axes, + none_axes, + x_grad, + value_grad); + break; + default: + PADDLE_THROW(phi::errors::InvalidArgument( + "The rank of set_value_grad's input should be less than 7, but " + "received %d.", + rank)); + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/set_value_kernel_impl.h b/paddle/phi/kernels/impl/set_value_kernel_impl.h index 5aebffe51b5e388bcc7ea72d1b804ffcb8768821..99db559f3b8166258a80814859c3296933634db8 100644 --- a/paddle/phi/kernels/impl/set_value_kernel_impl.h +++ b/paddle/phi/kernels/impl/set_value_kernel_impl.h @@ -25,7 +25,6 @@ #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" -#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/slice_utils.h" namespace phi { diff --git a/paddle/phi/kernels/impl/softmax_kernel_impl.h b/paddle/phi/kernels/impl/softmax_kernel_impl.h index 6552f6ed581f45008f01c02fad3c007bf3664942..7aa43fdb7f27056d5cb4c2947e2764bd8868ff02 100644 --- a/paddle/phi/kernels/impl/softmax_kernel_impl.h +++ b/paddle/phi/kernels/impl/softmax_kernel_impl.h @@ -22,10 +22,10 @@ limitations under the License. 
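GetOffsets in set_value_grad_kernel_impl.h above enumerates every position at which the broadcast-shaped value-gradient tile must be read out of the sliced out_grad: it walks the dimensions recursively, recursing once when the small and big extents agree and big_dim[d] times (advancing the offset each step) when the value tensor was broadcast along dimension d. For example, with big_dim = {2, 3} and fake_value_grad_dims = {1, 3} it yields the offsets {0, 0} and {1, 0}, and the accumulation loop then adds the two corresponding {1, 3} slices of the strided-slice result into value_grad.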
*/ namespace phi { template -void SoftmaxRawKernel(const Context& dev_ctx, - const DenseTensor& x, - int axis, - DenseTensor* out) { +void SoftmaxKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + DenseTensor* out) { const int rank = x.dims().size(); const int calc_axis = phi::funcs::CanonicalAxis(axis, rank); int axis_dim = x.dims()[calc_axis]; diff --git a/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h b/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h index 9b1e4b1d3a65d5c0da831a36152cff85a3353fa3..044adb0230cac4d0dc6bf9e9348968e4d7c60b5d 100644 --- a/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h @@ -21,12 +21,11 @@ #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_reduce.h" +#include "paddle/phi/kernels/funcs/tril_triu_compute.h" #include "paddle/phi/kernels/triangular_solve_kernel.h" -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/operators/tril_triu_op.h" - namespace phi { template @@ -119,7 +118,7 @@ void TriangularSolveGradKernel(const Context& dev_ctx, const auto H = dims[dims.size() - 2]; const auto W = dims[dims.size() - 1]; phi::funcs::ForRange x_for_range(dev_ctx, dx_bst.numel()); - paddle::operators::TrilTriuCompute tril_triu_functor( + phi::funcs::TrilTriuCompute tril_triu_functor( dx_bst.data(), unitriangular, !upper, H, W, dx_bst_upper.data()); x_for_range(tril_triu_functor); diff --git a/paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h b/paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..dcc7224b5075ca77db813089af6048f0809c9f35 --- /dev/null +++ b/paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
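The set_value_grad dispatch shown above maps the runtime tensor rank onto a compile-time template parameter by switching over ranks 1 through 6 and rejecting anything larger. A minimal standalone sketch of that dispatch pattern; the function names are invented for illustration and are not the phi API:

#include <cstdio>
#include <stdexcept>

// Illustrative stand-in for a rank-templated implementation.
template <int Rank>
void SetValueGradImplSketch() {
  std::printf("running rank-%d specialization\n", Rank);
}

// Runtime rank -> compile-time rank, mirroring the switch in the diff.
void DispatchByRank(int rank) {
  switch (rank) {
    case 1: SetValueGradImplSketch<1>(); break;
    case 2: SetValueGradImplSketch<2>(); break;
    case 3: SetValueGradImplSketch<3>(); break;
    case 4: SetValueGradImplSketch<4>(); break;
    case 5: SetValueGradImplSketch<5>(); break;
    case 6: SetValueGradImplSketch<6>(); break;
    default:
      throw std::invalid_argument(
          "The rank of set_value_grad's input should be less than 7.");
  }
}

int main() { DispatchByRank(3); }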
+ +#pragma once + +#include "paddle/phi/kernels/tril_triu_grad_kernel.h" + +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/funcs/tril_triu_compute.h" + +namespace phi { + +template +void TrilTriuGradKernel(const Context& ctx, + const DenseTensor& out_grad, + int diagonal, + bool lower, + DenseTensor* x_grad) { + const auto* dout_data = out_grad.data(); + auto* dx_data = ctx.template Alloc(x_grad); + + const auto& dims = out_grad.dims(); + const auto H = dims[dims.size() - 2]; + const auto W = dims[dims.size() - 1]; + + phi::funcs::ForRange for_range( + ctx, static_cast(out_grad.numel())); + phi::funcs::TrilTriuCompute tril_triu_grad_computer( + dout_data, diagonal, lower, H, W, dx_data); + for_range(tril_triu_grad_computer); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/tril_triu_kernel_impl.h b/paddle/phi/kernels/impl/tril_triu_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..959169d87cefd877a4fb056218dd761a96f23136 --- /dev/null +++ b/paddle/phi/kernels/impl/tril_triu_kernel_impl.h @@ -0,0 +1,43 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/tril_triu_kernel.h" + +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/funcs/tril_triu_compute.h" + +namespace phi { + +template +void TrilTriuKernel(const Context& ctx, + const DenseTensor& x, + int diagonal, + bool lower, + DenseTensor* out) { + const auto* x_data = x.data(); + auto* out_data = ctx.template Alloc(out); + + const auto& dims = x.dims(); + const auto H = dims[dims.size() - 2]; + const auto W = dims[dims.size() - 1]; + phi::funcs::ForRange for_range(ctx, static_cast(x.numel())); + + phi::funcs::TrilTriuCompute tril_triu_computer( + x_data, diagonal, lower, H, W, out_data); + for_range(tril_triu_computer); +} + +} // namespace phi diff --git a/paddle/phi/kernels/index_select_grad_kernel.h b/paddle/phi/kernels/index_select_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..c3dc1595989bf2879c3e20187eaa53b6df75a7f0 --- /dev/null +++ b/paddle/phi/kernels/index_select_grad_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
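The two tril_triu implementation headers above drive a single element-wise functor (phi::funcs::TrilTriuCompute) over a flat index range via ForRange. A self-contained sketch of the masking rule that functor applies, assuming row-major layout of the last two dimensions; this is a CPU-only illustration, not the actual functor:

#include <cstdint>

// Keep elements on/below (lower) or on/above (!lower) the `diagonal`-th
// diagonal of each HxW matrix; zero out the rest.
template <typename T>
void TrilTriuSketch(const T* in, int diagonal, bool lower,
                    int64_t H, int64_t W, int64_t numel, T* out) {
  for (int64_t idx = 0; idx < numel; ++idx) {
    const int64_t row = (idx / W) % H;   // row inside the current matrix
    const int64_t col = idx % W;         // column inside the current matrix
    const bool keep = lower ? (col <= row + diagonal)   // lower triangle
                            : (col >= row + diagonal);  // upper triangle
    out[idx] = keep ? in[idx] : static_cast<T>(0);
  }
}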
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void IndexSelectGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& out_grad, + int dim, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/index_select_kernel.h b/paddle/phi/kernels/index_select_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..124b6897311575223859fba882488a535a6310f4 --- /dev/null +++ b/paddle/phi/kernels/index_select_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void IndexSelectKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& index, + int dim, + DenseTensor* output); + +} // namespace phi diff --git a/paddle/phi/kernels/isclose_kernel.h b/paddle/phi/kernels/isclose_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..8c468da055082005568002952f695ebedda31c3c --- /dev/null +++ b/paddle/phi/kernels/isclose_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void IscloseKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const Scalar& rtol, + const Scalar& atol, + bool equal_nan, + DenseTensor* out); +} // namespace phi diff --git a/paddle/phi/kernels/kldiv_loss_grad_kernel.h b/paddle/phi/kernels/kldiv_loss_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..8f53898fa6816d198d1bc96bcf99f20752a70551 --- /dev/null +++ b/paddle/phi/kernels/kldiv_loss_grad_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
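The index_select declarations above only fix the kernel signatures; the operator itself gathers slices of x along `dim` according to `index`. A small sketch of that semantics for the 2-D, dim == 0 case in plain C++ (not the phi implementation):

#include <cstddef>
#include <cstdint>
#include <vector>

// out[i] = x[index[i]] when selecting along dim 0 of a 2-D tensor
// stored row-major as `rows` x `cols`.
std::vector<float> IndexSelectDim0(const std::vector<float>& x,
                                   int64_t rows, int64_t cols,
                                   const std::vector<int64_t>& index) {
  std::vector<float> out(index.size() * cols, 0.f);
  for (std::size_t i = 0; i < index.size(); ++i) {
    const int64_t src = index[i];
    if (src < 0 || src >= rows) continue;  // skip out-of-range indices in this sketch
    for (int64_t j = 0; j < cols; ++j) {
      out[i * cols + j] = x[src * cols + j];
    }
  }
  return out;
}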
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +// XKTODO (change name) +void KLDivLossGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& label, + const DenseTensor& d_out, + const std::string& reduction, + DenseTensor* d_x); +} // namespace phi diff --git a/paddle/phi/kernels/kldiv_loss_kernel.h b/paddle/phi/kernels/kldiv_loss_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..103780ab747282f9171d2a409db9fd4e5269ba4d --- /dev/null +++ b/paddle/phi/kernels/kldiv_loss_kernel.h @@ -0,0 +1,29 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void KLDivLossKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& label, + const std::string& reduction, + DenseTensor* out); +} // namespace phi diff --git a/paddle/phi/kernels/kron_grad_kernel.h b/paddle/phi/kernels/kron_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..3daa9dcfba9f0d89bd8dec88905f0ddb321f630a --- /dev/null +++ b/paddle/phi/kernels/kron_grad_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void KronGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + DenseTensor* x_grad, + DenseTensor* y_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/kron_kernel.h b/paddle/phi/kernels/kron_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..4451ac757a9534f4a48db97da81acc2047c26be2 --- /dev/null +++ b/paddle/phi/kernels/kron_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
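The kldiv_loss declarations above take a reduction string alongside the input (log-probabilities) and label (probabilities). The per-element loss is target * (log(target) - input), with the reduction applied afterwards; the sketch below covers only the "mean", "sum" and "none" cases and is illustrative rather than the kernel's exact behaviour:

#include <cmath>
#include <cstddef>
#include <string>
#include <vector>

// Element-wise KL-divergence loss followed by an optional reduction.
// "none" is represented by returning the per-element losses via `out`.
float KLDivLossSketch(const std::vector<float>& input,   // log-probabilities
                      const std::vector<float>& target,  // probabilities
                      const std::string& reduction,
                      std::vector<float>* out) {
  out->resize(input.size());
  float sum = 0.f;
  for (std::size_t i = 0; i < input.size(); ++i) {
    const float t = target[i];
    (*out)[i] = t > 0.f ? t * (std::log(t) - input[i]) : 0.f;
    sum += (*out)[i];
  }
  if (reduction == "mean") return sum / static_cast<float>(input.size());
  if (reduction == "sum") return sum;
  return 0.f;  // "none": caller reads *out
}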
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void KronKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/kthvalue_grad_kernel.h b/paddle/phi/kernels/kthvalue_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..488dde8237b0882b6606834f8a510ef360da1b24 --- /dev/null +++ b/paddle/phi/kernels/kthvalue_grad_kernel.h @@ -0,0 +1,30 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { +template +void KthvalueGradKernel(const Context& dev_ctx, + const DenseTensor& d_out, + const DenseTensor& x, + const DenseTensor& indices, + int k, + int axis, + bool keepdim, + DenseTensor* d_x); +} // namespace phi diff --git a/paddle/phi/kernels/kthvalue_kernel.h b/paddle/phi/kernels/kthvalue_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..4809b9af4832f5d9c036d26adfc6ba5c7a808889 --- /dev/null +++ b/paddle/phi/kernels/kthvalue_kernel.h @@ -0,0 +1,30 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void KthvalueKernel(const Context& dev_ctx, + const DenseTensor& x, + int k, + int axis, + bool keepdim, + DenseTensor* out, + DenseTensor* indices); +} // namespace phi diff --git a/paddle/phi/kernels/layer_norm_grad_kernel.h b/paddle/phi/kernels/layer_norm_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..c32be63db4178f92d9564f357c30bb28fb415516 --- /dev/null +++ b/paddle/phi/kernels/layer_norm_grad_kernel.h @@ -0,0 +1,36 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
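kthvalue, declared above, returns both the k-th smallest value along an axis and its index. A sketch of that contract for a single 1-D slice using a partial selection over (value, index) pairs; k is treated as 1-based here, which is an assumption about the `int k` parameter, and this is not the phi algorithm:

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// Returns {value, index} of the k-th smallest element of x (k is 1-based).
std::pair<float, int64_t> KthValue1D(const std::vector<float>& x, int k) {
  std::vector<std::pair<float, int64_t>> pairs;
  pairs.reserve(x.size());
  for (int64_t i = 0; i < static_cast<int64_t>(x.size()); ++i) {
    pairs.emplace_back(x[i], i);
  }
  // Partially order so that the (k-1)-th position holds the k-th smallest pair.
  std::nth_element(pairs.begin(), pairs.begin() + (k - 1), pairs.end());
  return pairs[k - 1];
}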
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void LayerNormGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& mean, + const DenseTensor& variance, + paddle::optional scale, + paddle::optional bias, + const DenseTensor& out_grad, + float epsilon, + int begin_norm_axis, + bool is_test, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/layer_norm_kernel.h b/paddle/phi/kernels/layer_norm_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..c9679420bda5cf6beffb56b7ec319c1b80ac4eda --- /dev/null +++ b/paddle/phi/kernels/layer_norm_kernel.h @@ -0,0 +1,51 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/backends/gpu/gpu_decls.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void LayerNormKernel(const Context& ctx, + const DenseTensor& x, + paddle::optional scale, + paddle::optional bias, + float epsilon, + int begin_norm_axis, + bool is_test, + DenseTensor* out, + DenseTensor* mean, + DenseTensor* variance); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template +class LayerNormDirectCUDAFunctor { + public: + void operator()(gpuStream_t stream, + const T* input, + std::vector input_shape, + const T* bias, + const T* scale, + T* output, + T* mean, + T* variance, + int begin_norm_axis, + float eps); +}; +#endif + +} // namespace phi diff --git a/paddle/phi/kernels/lgamma_grad_kernel.h b/paddle/phi/kernels/lgamma_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..94173cc29c7a7b7c71b6c781fbe06f9c991a4197 --- /dev/null +++ b/paddle/phi/kernels/lgamma_grad_kernel.h @@ -0,0 +1,27 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
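The layer_norm declarations above normalise over all dimensions from begin_norm_axis onward and optionally apply scale and bias. For a single row that has already been flattened to the normalised size, the computation reduces to the following sketch (scale and bias are assumed to have the same length as the row; this is illustrative, not the CUDA functor):

#include <cmath>
#include <cstddef>
#include <vector>

// Normalise one flattened row: y = (x - mean) / sqrt(var + eps) * scale + bias.
void LayerNormRow(const std::vector<float>& x,
                  const std::vector<float>& scale,
                  const std::vector<float>& bias,
                  float eps,
                  std::vector<float>* y) {
  const std::size_t n = x.size();
  float mean = 0.f, var = 0.f;
  for (float v : x) mean += v;
  mean /= static_cast<float>(n);
  for (float v : x) var += (v - mean) * (v - mean);
  var /= static_cast<float>(n);
  const float inv_std = 1.f / std::sqrt(var + eps);
  y->resize(n);
  for (std::size_t i = 0; i < n; ++i) {
    (*y)[i] = (x[i] - mean) * inv_std * scale[i] + bias[i];
  }
}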
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void LgammaGradKernel(const Context& dev_ctx, + const DenseTensor& d_out, + const DenseTensor& x, + DenseTensor* d_x); +} // namespace phi diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/phi/kernels/lgamma_kernel.h similarity index 63% rename from paddle/fluid/operators/reduce_ops/reduce_any_op.cu rename to paddle/phi/kernels/lgamma_kernel.h index 2e93e67debbd9d7f8667e0b2994fdd440401ac13..f61b3a1ce859eeee77d49ecb38ee0e96c3a9f0ee 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu +++ b/paddle/phi/kernels/lgamma_kernel.h @@ -1,4 +1,5 @@ -// Copyright (c) 2018 PaddlePaddle Authors. Any Rights Reserved. + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,9 +13,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/reduce_ops/reduce_any_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { -REGISTER_OP_CUDA_KERNEL( - reduce_any, - ops::ReduceCudaKernel); +template +void LgammaKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out); +} // namespace phi diff --git a/paddle/phi/kernels/log_softmax_grad_kernel.h b/paddle/phi/kernels/log_softmax_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..6336bc14105bb55deacbfdc20a69a56c6ceca81a --- /dev/null +++ b/paddle/phi/kernels/log_softmax_grad_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void LogSoftmaxGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad); +} // namespace phi diff --git a/paddle/phi/kernels/log_softmax_kernel.h b/paddle/phi/kernels/log_softmax_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..2caaa86d46c35888c5aaa944019c070f0dd64e17 --- /dev/null +++ b/paddle/phi/kernels/log_softmax_kernel.h @@ -0,0 +1,26 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
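The log_softmax gradient kernel is declared above and its forward counterpart follows next. The forward pass is usually evaluated in the max-subtracted form log_softmax(x) = (x - m) - log(sum(exp(x - m))) with m = max(x), which avoids overflow in the exponentials. A 1-D sketch, assuming a non-empty input:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Numerically stable log-softmax over a single axis (here: the whole vector).
std::vector<float> LogSoftmax1D(const std::vector<float>& x) {
  const float m = *std::max_element(x.begin(), x.end());
  float sum = 0.f;
  for (float v : x) sum += std::exp(v - m);
  const float log_sum = std::log(sum);
  std::vector<float> out(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) out[i] = (x[i] - m) - log_sum;
  return out;
}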
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void LogSoftmaxKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + DenseTensor* out); +} // namespace phi diff --git a/paddle/phi/kernels/impl/shape_kernel_impl.h b/paddle/phi/kernels/matrix_rank_kernel.h similarity index 65% rename from paddle/phi/kernels/impl/shape_kernel_impl.h rename to paddle/phi/kernels/matrix_rank_kernel.h index 982cfb33f6b14fc14c7c58ff8c4548a4cdbd3b3b..6edea2723e589340f2c6dc3cfb0be6f895bf08bb 100644 --- a/paddle/phi/kernels/impl/shape_kernel_impl.h +++ b/paddle/phi/kernels/matrix_rank_kernel.h @@ -19,18 +19,11 @@ limitations under the License. */ namespace phi { template -void ShapeKernel(const Context& ctx, - const DenseTensor& input, - DenseTensor* out) { - auto in_var = &input; - phi::DDim in_dims; - in_dims = in_var->dims(); - auto out_t = out; - out_t->Resize({in_dims.size()}); - auto out_data = ctx.template HostAlloc(out_t); - for (int i = 0; i < in_dims.size(); ++i) { - out_data[i] = in_dims[i]; - } -} +void MatrixRankKernel(const Context& dev_ctx, + const DenseTensor& x, + float tol, + bool use_default_tol, + bool hermitian, + DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/matrix_rank_tol_kernel.h b/paddle/phi/kernels/matrix_rank_tol_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..351358dfa04aa7ad2091ad0e01bc63e50046eda0 --- /dev/null +++ b/paddle/phi/kernels/matrix_rank_tol_kernel.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void MatrixRankTolKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& atol_tensor, + bool use_default_tol, + bool hermitian, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/mode_grad_kernel.h b/paddle/phi/kernels/mode_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..ccde8c3648fa556401f1937c78039743daf43f4c --- /dev/null +++ b/paddle/phi/kernels/mode_grad_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
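matrix_rank, declared above in both its plain and tol-tensor forms, counts how many singular values exceed a tolerance. When use_default_tol is set, a common convention (NumPy's) is tol = sigma_max * max(m, n) * eps, though the kernel's actual default may differ. A sketch of the counting step, given precomputed singular values:

#include <algorithm>
#include <cstdint>
#include <limits>
#include <vector>

// Count singular values strictly above `tol`. If `use_default_tol` is set,
// derive tol from the largest singular value (NumPy-style convention).
int64_t MatrixRankFromSingularValues(const std::vector<float>& sigma,
                                     int64_t m, int64_t n,
                                     float tol, bool use_default_tol) {
  if (sigma.empty()) return 0;
  const float sigma_max = *std::max_element(sigma.begin(), sigma.end());
  if (use_default_tol) {
    tol = sigma_max * static_cast<float>(std::max(m, n)) *
          std::numeric_limits<float>::epsilon();
  }
  int64_t rank = 0;
  for (float s : sigma) rank += (s > tol) ? 1 : 0;
  return rank;
}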
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ModeGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& indices, + const DenseTensor& out_grad, + int axis, + bool keepdim, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/mode_kernel.h b/paddle/phi/kernels/mode_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..831c4369304e5c5d27cddf01bcba021745bf7083 --- /dev/null +++ b/paddle/phi/kernels/mode_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ModeKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + bool keepdim, + DenseTensor* out, + DenseTensor* indices); + +} // namespace phi diff --git a/paddle/phi/kernels/reduce_sum_grad_kernel.h b/paddle/phi/kernels/multiplex_grad_kernel.h similarity index 66% rename from paddle/phi/kernels/reduce_sum_grad_kernel.h rename to paddle/phi/kernels/multiplex_grad_kernel.h index ab4d63297efffc70710e496efa08f4b9c7e5f7ce..b32c9dbe100584f7076f34d848d3e5112315f83d 100644 --- a/paddle/phi/kernels/reduce_sum_grad_kernel.h +++ b/paddle/phi/kernels/multiplex_grad_kernel.h @@ -14,19 +14,14 @@ #pragma once -#include "paddle/phi/common/data_type.h" #include "paddle/phi/core/dense_tensor.h" + namespace phi { template -void ReduceSumGradKernel(const Context& dev_ctx, - const DenseTensor& x, +void MultiplexGradKernel(const Context& ctx, + const DenseTensor& ids, const DenseTensor& out_grad, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType in_dtype, - DataType out_dtype, - DenseTensor* x_grad); + std::vector ins_grad); } // namespace phi diff --git a/paddle/phi/kernels/multiplex_kernel.h b/paddle/phi/kernels/multiplex_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..341c6d5cabb7ce1d67090c7533bc8c45622f4786 --- /dev/null +++ b/paddle/phi/kernels/multiplex_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
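multiplex's gradient kernel is declared above and its forward kernel follows next. The forward contract is row selection: row i of the output is taken from candidate ins[ids[i]], row i. A sketch of that contract with plain flattened buffers (the int32 index type and flat layout are assumptions of the sketch, not statements about the phi implementation):

#include <cstdint>
#include <vector>

// out.row(i) = ins[ids[i]].row(i); every candidate is a rows x cols matrix
// stored row-major in a flat vector.
void MultiplexSketch(const std::vector<std::vector<float>>& ins,
                     const std::vector<int32_t>& ids,
                     int64_t cols,
                     std::vector<float>* out) {
  const int64_t rows = static_cast<int64_t>(ids.size());
  out->assign(rows * cols, 0.f);
  for (int64_t i = 0; i < rows; ++i) {
    const int32_t which = ids[i];  // assumed to index a valid candidate
    for (int64_t j = 0; j < cols; ++j) {
      (*out)[i * cols + j] = ins[which][i * cols + j];
    }
  }
}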
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void MultiplexKernel(const Context& ctx, + const std::vector& ins, + const DenseTensor& ids, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/reduce_max_kernel.cc b/paddle/phi/kernels/one_hot_kernel.cc similarity index 62% rename from paddle/phi/kernels/reduce_max_kernel.cc rename to paddle/phi/kernels/one_hot_kernel.cc index de172a12d72884fb018acbb42c077efc825508ce..633f48cbb62ace9e3f7f21502bd61f8c305fb542 100644 --- a/paddle/phi/kernels/reduce_max_kernel.cc +++ b/paddle/phi/kernels/one_hot_kernel.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/reduce_max_kernel.h" +#include "paddle/phi/kernels/one_hot_kernel.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/core/kernel_registry.h" @@ -20,20 +20,19 @@ namespace phi { template -void MaxKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out) { - bool reduce_all = false; - MaxRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +void OneHotKernel(const Context& dev_ctx, + const DenseTensor& x, + const Scalar& num_classes_s, + DenseTensor* out) { + int num_classes = num_classes_s.to(); + OneHotRawKernel( + dev_ctx, x, num_classes, phi::DataType::FLOAT32, false, out); } } // namespace phi -PD_REGISTER_KERNEL( - max, CPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {} +PD_REGISTER_KERNEL(one_hot, CPU, ALL_LAYOUT, phi::OneHotKernel, int, int64_t) {} + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_KERNEL( - max, GPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {} +PD_REGISTER_KERNEL(one_hot, GPU, ALL_LAYOUT, phi::OneHotKernel, int, int64_t) {} #endif diff --git a/paddle/phi/kernels/reduce_max_kernel.h b/paddle/phi/kernels/one_hot_kernel.h similarity index 64% rename from paddle/phi/kernels/reduce_max_kernel.h rename to paddle/phi/kernels/one_hot_kernel.h index 7560473d43c718a80cfb8911cd250ef8fc74d82c..9f89609ea63365b0e7831201ca003d6c7320c5d7 100644 --- a/paddle/phi/kernels/reduce_max_kernel.h +++ b/paddle/phi/kernels/one_hot_kernel.h @@ -14,25 +14,23 @@ #pragma once +#include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/infermeta/binary.h" -#include "paddle/phi/infermeta/unary.h" -#include "paddle/phi/kernels/empty_kernel.h" namespace phi { template -void MaxRawKernel(const Context& dev_ctx, +void OneHotKernel(const Context& dev_ctx, const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, + const Scalar& num_classes, DenseTensor* out); template -void MaxKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out); +void OneHotRawKernel(const Context& dev_ctx, + const DenseTensor& x, + int32_t depth, + DataType dtype, + bool allow_out_of_range, + DenseTensor* out); + } // namespace phi diff --git a/paddle/phi/kernels/pool_grad_kernel.h b/paddle/phi/kernels/pool_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..0658dc22c823bf7ae162fb2e392f256cfb051496 --- /dev/null +++ b/paddle/phi/kernels/pool_grad_kernel.h @@ -0,0 +1,145 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
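one_hot_kernel.cc above is a thin wrapper: the public kernel resolves the Scalar class count and forwards to a raw kernel with a float32 output and out-of-range handling disabled. The encoding itself is a row per index containing a single 1.0; a sketch (out-of-range indices are simply skipped here, which only loosely mirrors the real flag):

#include <cstddef>
#include <cstdint>
#include <vector>

// Encode each index as a row of length `depth` with a single 1.0f.
std::vector<float> OneHotSketch(const std::vector<int64_t>& indices,
                                int32_t depth) {
  std::vector<float> out(indices.size() * depth, 0.f);
  for (std::size_t i = 0; i < indices.size(); ++i) {
    const int64_t idx = indices[i];
    if (idx >= 0 && idx < depth) out[i * depth + idx] = 1.f;
  }
  return out;
}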
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void Pool2dGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx); + +template +void Pool2dGradGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx); + +template +void Pool2dDoubleGradKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out); + +template +void Pool2dDoubleGradGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out); + +template +void MaxPool2dWithIndexGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& mask, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* dx); + +template +void Pool3dGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx); + +template +void Pool3dGradGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx); + +template +void MaxPool3dWithIndexGradKernel(const Context& ctx, + 
const DenseTensor& x, + const DenseTensor& mask, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* dx); + +} // namespace phi diff --git a/paddle/phi/kernels/pool_kernel.h b/paddle/phi/kernels/pool_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..348af021815175ca2c6c94b9721fec33fbaf864c --- /dev/null +++ b/paddle/phi/kernels/pool_kernel.h @@ -0,0 +1,105 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void Pool2dKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out); + +template +void Pool2dGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out); + +template +void MaxPool2dWithIndexKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* out, + DenseTensor* mask); + +template +void Pool3dKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out); + +template +void Pool3dGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out); + +template +void MaxPool3dWithIndexKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* out, + DenseTensor* mask); + +} // namespace phi diff --git a/paddle/phi/kernels/prelu_grad_kernel.h b/paddle/phi/kernels/prelu_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..15917e2e1f02e896d12e971e7dfa52685f57a676 --- /dev/null +++ 
b/paddle/phi/kernels/prelu_grad_kernel.h @@ -0,0 +1,31 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void PReluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& alpha, + const DenseTensor& out_grad, + const std::string& mode, + const std::string& data_format, + DenseTensor* x_grad, + DenseTensor* alpha_grad); +} // namespace phi diff --git a/paddle/phi/kernels/prelu_kernel.h b/paddle/phi/kernels/prelu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..251332a8158dcbfa45cbb6c183e06789c21894db --- /dev/null +++ b/paddle/phi/kernels/prelu_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
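prelu's gradient kernel is declared above and the forward declaration follows. Element-wise, the forward pass is x when x > 0 and alpha * x otherwise; the backward pass routes out_grad into x_grad scaled by 1 or alpha, and into alpha_grad scaled by x where x <= 0. A sketch for the single shared alpha ("all" mode) case, illustrative only:

#include <cstddef>
#include <vector>

// Forward: out = x > 0 ? x : alpha * x.
// Backward: dx = x > 0 ? dout : alpha * dout;  dalpha += x > 0 ? 0 : x * dout.
void PReluSketch(const std::vector<float>& x, float alpha,
                 const std::vector<float>& dout,
                 std::vector<float>* out, std::vector<float>* dx,
                 float* dalpha) {
  out->resize(x.size());
  dx->resize(x.size());
  *dalpha = 0.f;
  for (std::size_t i = 0; i < x.size(); ++i) {
    const bool pos = x[i] > 0.f;
    (*out)[i] = pos ? x[i] : alpha * x[i];
    (*dx)[i] = pos ? dout[i] : alpha * dout[i];
    if (!pos) *dalpha += x[i] * dout[i];
  }
}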
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void PReluKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& alpha, + const std::string& mode, + const std::string& data_format, + DenseTensor* out); +} // namespace phi diff --git a/paddle/phi/kernels/primitive/compute_primitives.h b/paddle/phi/kernels/primitive/compute_primitives.h index 632ad00f6d06ed8a02b2d9677ff665c677cf8cb9..e02f4450a8babb9dd90cae6d8d1622938ae2f795 100644 --- a/paddle/phi/kernels/primitive/compute_primitives.h +++ b/paddle/phi/kernels/primitive/compute_primitives.h @@ -22,7 +22,6 @@ #endif #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -// #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" namespace phi { @@ -591,7 +590,7 @@ __device__ __forceinline__ void Cumsum(OutT* out, int index = (tidx + 1) * 2 * stride - 1; if (index < (blockDim.x * 2)) { temp[index + index / 32] = - compute(temp[index + index / 2], + compute(temp[index + index / 32], temp[index - stride + (index - stride) / 32]); } } diff --git a/paddle/phi/kernels/primitive/datamover_primitives.h b/paddle/phi/kernels/primitive/datamover_primitives.h index 2f1e2f589c5122987d9776700f3aa7bd95daa7a5..1d4181f3b9a89509ada2a8fe27d584a9b5aa039c 100644 --- a/paddle/phi/kernels/primitive/datamover_primitives.h +++ b/paddle/phi/kernels/primitive/datamover_primitives.h @@ -115,6 +115,14 @@ struct BroadcastConfig { } }; +template +__device__ __forceinline__ void WriteData(T* dst, + T* __restrict__ src, + int num) { + for (int i = 0; i < num; i++) { + dst[i] = src[i]; + } +} #undef INT_BITS } // namespace details diff --git a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h index 53a8b7d0c9ef9489056ab293d97e5767b23531fe..d2cfdbdec3064c8e9cf20d101afc2adf0ed011a8 100644 --- a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h +++ b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h @@ -76,6 +76,16 @@ struct BroadcastConfig { }; #pragma pack() +template +__device__ __forceinline__ void WriteData(T* _global_ptr_ dst, + T* src, + int num) { + if (num > 0) { + LM2GM(src, dst, num * sizeof(T)); + } +} +#undef INT_BITS + } // namespace details /** diff --git a/paddle/phi/kernels/primitive/kernel_primitives.h b/paddle/phi/kernels/primitive/kernel_primitives.h index 830bc1972c49fe8c447e9a13f874841d36a12f2d..b5a1e88acc32b1b101a6f81b750be1c669236a1a 100644 --- a/paddle/phi/kernels/primitive/kernel_primitives.h +++ b/paddle/phi/kernels/primitive/kernel_primitives.h @@ -13,7 +13,10 @@ // limitations under the License. 
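The compute_primitives.h hunk above fixes the Cumsum down-sweep so that both shared-memory reads use the same padded index, i + i / 32 (one operand previously used i / 2). Assuming the divisor reflects 32 memory banks, the padding spreads indices that would otherwise hit the same bank; a tiny host-side illustration of that mapping:

#include <cstdio>
#include <initializer_list>

// Map a logical shared-memory index to a padded physical slot so that
// strided accesses do not all fall into the same bank (32 banks assumed).
inline int PaddedIndex(int i) { return i + i / 32; }

int main() {
  // Unpadded, indices 31/63/95 all land in bank 31; padded, they spread out.
  for (int i : {31, 63, 95}) {
    std::printf("logical %d -> physical %d (bank %d)\n",
                i, PaddedIndex(i), PaddedIndex(i) % 32);
  }
}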
#pragma once + #include "paddle/phi/kernels/primitive/helper_primitives.h" + +// macro #ifdef PADDLE_WITH_XPU_KP #define KPStream XPUStream @@ -22,11 +25,6 @@ #define __forceinline__ __inline__ #define __restrict__ -#include "paddle/phi/backends/xpu/xpu_context.h" -#include "paddle/phi/kernels/primitive/compute_primitives_xpu2.h" -#include "paddle/phi/kernels/primitive/datamover_primitives_xpu2.h" -#include "paddle/phi/kernels/primitive/functor_primitives_xpu2.h" - #define THREAD_ID_X core_id() #define THREAD_ID_Y 0 #define THREAD_ID_Z 0 @@ -42,11 +40,8 @@ #define GRID_NUM_X cluster_num() #define GRID_NUM_Y 0 #define GRID_NUM_Z 0 + #else -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/kernels/primitive/compute_primitives.h" -#include "paddle/phi/kernels/primitive/datamover_primitives.h" -#include "paddle/phi/kernels/primitive/functor_primitives.h" #define KPStream gpuStream_t #define KPDevice phi::GPUContext @@ -67,4 +62,22 @@ #define GRID_NUM_X gridDim.x #define GRID_NUM_Y gridDim.y #define GRID_NUM_Z gridDim.z + +#endif + +// include file +#ifdef PADDLE_WITH_XPU_KP + +#include "paddle/phi/backends/xpu/xpu_context.h" +#include "paddle/phi/kernels/primitive/compute_primitives_xpu2.h" +#include "paddle/phi/kernels/primitive/datamover_primitives_xpu2.h" +#include "paddle/phi/kernels/primitive/functor_primitives_xpu2.h" + +#else + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/kernels/primitive/compute_primitives.h" +#include "paddle/phi/kernels/primitive/datamover_primitives.h" +#include "paddle/phi/kernels/primitive/functor_primitives.h" + #endif diff --git a/paddle/phi/kernels/psroi_pool_grad_kernel.h b/paddle/phi/kernels/psroi_pool_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..87163eb8e079ffd580d6f937179e24a8506376e9 --- /dev/null +++ b/paddle/phi/kernels/psroi_pool_grad_kernel.h @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void PsroiPoolGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& rois, + paddle::optional rois_num, + const DenseTensor& dout, + int pooled_height, + int pooled_width, + int output_channels, + float spatial_scale, + DenseTensor* dx); + +} // namespace phi diff --git a/paddle/phi/kernels/psroi_pool_kernel.h b/paddle/phi/kernels/psroi_pool_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..341037af2caeca28e211da8862e3c8d6089b9bac --- /dev/null +++ b/paddle/phi/kernels/psroi_pool_kernel.h @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void PsroiPoolKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& rois, + paddle::optional rois_num, + int pooled_height, + int pooled_width, + int output_channels, + float spatial_scale, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/qr_kernel.h b/paddle/phi/kernels/qr_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..9c3dfb16601267ec8a1d2535f6854c2a31dba5a8 --- /dev/null +++ b/paddle/phi/kernels/qr_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void QrKernel(const Context& ctx, + const DenseTensor& x, + const std::string& mode, + DenseTensor* q, + DenseTensor* r); + +} // namespace phi diff --git a/paddle/phi/kernels/reduce_grad_kernel.h b/paddle/phi/kernels/reduce_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..ee6f3d19a094d29546e82e7138933eceb96459d0 --- /dev/null +++ b/paddle/phi/kernels/reduce_grad_kernel.h @@ -0,0 +1,79 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/dense_tensor.h" +namespace phi { + +template +void ReduceSumGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad); + +template +void ReduceMeanGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad); + +template +void ReduceProdGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const DenseTensor& out, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad); + +template +void ReduceMaxGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const DenseTensor& out, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad); + +template +void ReduceMinGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const DenseTensor& out, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/reduce_kernel.cc b/paddle/phi/kernels/reduce_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..7638c782d547d6b69d0c740827abf96e3ffda0c5 --- /dev/null +++ b/paddle/phi/kernels/reduce_kernel.cc @@ -0,0 +1,165 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
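The reduce gradient declarations above cover sum, mean, prod, max and min. For the sum and mean cases the backward pass is just out_grad broadcast back to the input shape, with mean additionally dividing by the number of reduced elements. A sketch for a vector reduced to a scalar, covering those two cases only:

#include <cstddef>
#include <vector>

// Forward: reduce a vector to its sum / mean.
// Backward: d(sum)/dx_i = dout;  d(mean)/dx_i = dout / n,
// i.e. out_grad broadcast back over the reduced elements.
void ReduceSumMeanWithGrad(const std::vector<float>& x, float dout,
                           float* sum, float* mean,
                           std::vector<float>* sum_grad,
                           std::vector<float>* mean_grad) {
  const std::size_t n = x.size();
  *sum = 0.f;
  for (float v : x) *sum += v;
  *mean = *sum / static_cast<float>(n);
  sum_grad->assign(n, dout);
  mean_grad->assign(n, dout / static_cast<float>(n));
}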
+ +#include "paddle/phi/kernels/reduce_kernel.h" + +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void SumKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + DataType out_dtype, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + SumRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out_dtype, out); +} + +template +void MeanKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + MeanRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +template +void ProdKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + ProdRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +template +void MaxKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + MaxRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +template +void MinKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + MinRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +template +void AllKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + AllRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +template +void AnyKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + AnyRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +} // namespace phi + +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +PD_REGISTER_KERNEL( + mean, CPU, ALL_LAYOUT, phi::MeanKernel, float, double, bool) {} + +PD_REGISTER_KERNEL(sum, + CPU, + ALL_LAYOUT, + phi::SumKernel, + bool, + float, + double, + phi::dtype::float16, + int16_t, + int, + int64_t, + complex64, + complex128) { + kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); +} + +PD_REGISTER_KERNEL( + prod, CPU, ALL_LAYOUT, phi::ProdKernel, float, double, int, int64_t) {} + +PD_REGISTER_KERNEL( + max, CPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {} +PD_REGISTER_KERNEL( + min, CPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {} +PD_REGISTER_KERNEL(all, CPU, ALL_LAYOUT, phi::AllKernel, bool) {} +PD_REGISTER_KERNEL(any, CPU, ALL_LAYOUT, phi::AnyKernel, bool) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +PD_REGISTER_KERNEL(mean, + GPU, + ALL_LAYOUT, + phi::MeanKernel, + float, + double, + bool, + int, + int64_t, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(sum, + GPU, + ALL_LAYOUT, + phi::SumKernel, + bool, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16, + int16_t, + int, + int64_t, + complex64, + complex128) { + kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); +} + +PD_REGISTER_KERNEL( + prod, GPU, ALL_LAYOUT, phi::ProdKernel, float, double, int, int64_t) {} + +PD_REGISTER_KERNEL( + max, GPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {} +PD_REGISTER_KERNEL( + min, GPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {} +PD_REGISTER_KERNEL(all, GPU, ALL_LAYOUT, phi::AllKernel, bool) {} 
+PD_REGISTER_KERNEL(any, GPU, ALL_LAYOUT, phi::AnyKernel, bool) {} +#endif diff --git a/paddle/phi/kernels/reduce_kernel.h b/paddle/phi/kernels/reduce_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..69bcb47bc98eadd46eeff5c1f92ccf9cf0c9a9d3 --- /dev/null +++ b/paddle/phi/kernels/reduce_kernel.h @@ -0,0 +1,153 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/infermeta/unary.h" + +namespace phi { +template +void SumRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType out_dtype, + DenseTensor* out); + +template +void MeanRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +template +void ProdRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +template +void MaxRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +template +void MinRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +template +void AnyRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +template +void AllRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +template +void SumKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + DataType out_dtype, + bool keep_dim, + DenseTensor* out); + +template +void MeanKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out); + +template +void ProdKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out); + +template +void MaxKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out); + +template +void MinKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out); + +template +void AnyKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out); + +template +void AllKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out); + +template +DenseTensor Mean(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axis, + bool keep_dim) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + SumRawInferMeta(x, axis, keep_dim, false, x.dtype(), &meta_out); + MeanKernel(dev_ctx, x, axis, 
keep_dim, &dense_out); + return dense_out; +} + +template +DenseTensor Sum(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axis, + DataType dtype, + bool keep_dim) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + SumInferMeta(x, axis, dtype, keep_dim, &meta_out); + SumKernel(dev_ctx, x, axis, dtype, keep_dim, &dense_out); + return dense_out; +} + +} // namespace phi diff --git a/paddle/phi/kernels/roi_align_grad_kernel.h b/paddle/phi/kernels/roi_align_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..eea1fa03886a4a02dbc614052e1f280c2610f1ad --- /dev/null +++ b/paddle/phi/kernels/roi_align_grad_kernel.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void RoiAlignGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + const DenseTensor& out_grad, + int pooled_height, + int pooled_width, + float spatial_scale, + int sampling_ratio, + bool aligned, + DenseTensor* dx); + +} // namespace phi diff --git a/paddle/phi/kernels/roi_align_kernel.h b/paddle/phi/kernels/roi_align_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..9734da53b7f453d492cc60ee8930f54e7ca74edc --- /dev/null +++ b/paddle/phi/kernels/roi_align_kernel.h @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void RoiAlignKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + int sampling_ratio, + bool aligned, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/roi_pool_grad_kernel.h b/paddle/phi/kernels/roi_pool_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..d7f1c378f75c398a714f6aa4e4d857e314f47eeb --- /dev/null +++ b/paddle/phi/kernels/roi_pool_grad_kernel.h @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void RoiPooGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + const DenseTensor& arg_max, + const DenseTensor& out_grad, + int pooled_height, + int pooled_width, + float spatial_scale, + DenseTensor* dx); + +} // namespace phi diff --git a/paddle/phi/kernels/roi_pool_kernel.h b/paddle/phi/kernels/roi_pool_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..c6ff6f223612a46c00abff103c3b3a193264b122 --- /dev/null +++ b/paddle/phi/kernels/roi_pool_kernel.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { + +static constexpr int kROISize = 4; + +template +void RoiPoolKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + DenseTensor* out, + DenseTensor* arg_max); + +} // namespace phi diff --git a/paddle/phi/kernels/roll_grad_kernel.h b/paddle/phi/kernels/roll_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..331f3626e56574615a2d6b1680335638b060846d --- /dev/null +++ b/paddle/phi/kernels/roll_grad_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
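The roi_pool pair above follows the usual max-pooling contract: the forward kernel records, for every output element, the index of the input element that produced the maximum (arg_max), and the grad kernel routes out_grad back through those indices. A minimal stand-alone sketch of that backward rule, assuming a flattened int64 arg_max with -1 marking empty bins; the tensor layout and the exact Paddle implementation are not reproduced here.

```cpp
#include <cstdint>

// Reference-only: scatter gradients back through the indices recorded by a
// max-pooling forward pass. Assumes arg_max[i] is the flattened input index
// chosen for output element i, or -1 if the pooling bin was empty.
void MaxPoolBackwardRef(const float* out_grad,
                        const int64_t* arg_max,
                        int64_t out_numel,
                        float* x_grad /* pre-zeroed, sized to the input */) {
  for (int64_t i = 0; i < out_numel; ++i) {
    if (arg_max[i] >= 0) {
      x_grad[arg_max[i]] += out_grad[i];
    }
  }
}
```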
+ +#pragma once + +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void RollGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const ScalarArray& shifts, + const std::vector& axis, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/roll_kernel.h b/paddle/phi/kernels/roll_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..56f32174a4c0005968acf147b2daf25914ff01b1 --- /dev/null +++ b/paddle/phi/kernels/roll_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void RollKernel(const Context& dev_ctx, + const DenseTensor& x, + const ScalarArray& shifts, + const std::vector& axis, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/searchsorted_kernel.h b/paddle/phi/kernels/searchsorted_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..e425c7fd7955544cc429fb0a071e8f8038b47063 --- /dev/null +++ b/paddle/phi/kernels/searchsorted_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void SearchsortedKernel(const Context& ctx, + const DenseTensor& sorted_sequence, + const DenseTensor& value, + bool out_int32, + bool right, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/selected_rows/assign_kernel.cc b/paddle/phi/kernels/selected_rows/assign_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..fae876facfc8fae9b2db783576444ac8bfde09a1 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/assign_kernel.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/selected_rows/assign_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/assign_kernel.h" + +namespace phi { +namespace sr { + +// Note: use `const paddle::optional x` +// as input if needed +template +void AssignKernel(const Context& dev_ctx, + const SelectedRows& x, + SelectedRows* out) { + out->set_rows(x.rows()); + out->set_height(x.height()); + phi::AssignKernel(dev_ctx, x.value(), out->mutable_value()); +} + +} // namespace sr +} // namespace phi + +PD_REGISTER_GENERAL_KERNEL(assign_sr, + CPU, + ALL_LAYOUT, + phi::sr::AssignKernel, + ALL_DTYPE) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_GENERAL_KERNEL(assign_sr, + GPU, + ALL_LAYOUT, + phi::sr::AssignKernel, + ALL_DTYPE) {} +#endif diff --git a/paddle/phi/kernels/selected_rows/assign_kernel.h b/paddle/phi/kernels/selected_rows/assign_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..2ba465615a73a3036d4b029c8ecb54002b86cb97 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/assign_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/selected_rows.h" + +namespace phi { +namespace sr { + +template +void AssignKernel(const Context& dev_ctx, + const SelectedRows& x, + SelectedRows* out); + +} // namespace sr +} // namespace phi diff --git a/paddle/phi/kernels/selected_rows/copy_kernel.cc b/paddle/phi/kernels/selected_rows/copy_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..cf71ab0583f6120e7bf10f26f00024b27a56ca79 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/copy_kernel.cc @@ -0,0 +1,49 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/selected_rows/copy_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +namespace phi { +namespace sr { + +template +void Copy(const Context& dev_ctx, + const SelectedRows& src, + Place dst_place, + bool blocking, + SelectedRows* dst) { + if (src.value().Holder() != dst->value().Holder() || + src.value().data() != dst->value().data()) { + dst->set_rows(src.rows()); + dst->set_height(src.height()); + } + phi::Copy( + dev_ctx, src.value(), dst_place, blocking, dst->mutable_value()); +} + +} // namespace sr +} // namespace phi + +PD_REGISTER_GENERAL_KERNEL( + copy_sr, CPU, ALL_LAYOUT, phi::sr::Copy, ALL_DTYPE) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_GENERAL_KERNEL( + copy_sr, GPU, ALL_LAYOUT, phi::sr::Copy, ALL_DTYPE) {} +#endif diff --git a/paddle/phi/kernels/selected_rows/copy_kernel.h b/paddle/phi/kernels/selected_rows/copy_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..4aa848bea2a717ffcda4dff562ec56a702b7dbc5 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/copy_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/selected_rows.h" +#include "paddle/phi/core/sparse_csr_tensor.h" + +namespace phi { +namespace sr { + +template +void Copy(const Context& dev_ctx, + const SelectedRows& src, + Place dst_place, + bool blocking, + SelectedRows* dst); + +} // namespace sr +} // namespace phi diff --git a/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.cc b/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..80b2a1f6678a27594f0fd3319ccb938dac67bf13 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.h" + +#include "paddle/fluid/framework/mixed_vector.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h" + +namespace phi { +namespace sr { + +static std::vector PathToRows(const DenseTensor& path) { + std::set rows; + const int64_t* paths = path.data(); + for (int64_t i = 0; i < path.numel(); ++i) { + int64_t row = paths[i]; + if (row < 0) { + continue; + } + rows.emplace(row); + } + return std::vector(rows.begin(), rows.end()); +} + +template +void HierarchicalSigmoidGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& w, + const DenseTensor& label, + const DenseTensor& pre_out, + const DenseTensor& out_grad, + paddle::optional path, + paddle::optional code, + paddle::optional bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + DenseTensor* x_grad, + SelectedRows* w_grad, + DenseTensor* bias_grad) { + PADDLE_ENFORCE_NOT_NULL( + path.get_ptr(), + errors::NotFound("Custom tree must be set for sparse mode!")); + paddle::framework::Vector real_rows = PathToRows(*path); + w_grad->set_rows(real_rows); + // Build a map of id -> row_index to speed up finding the index of one id + w_grad->set_height(w.dims()[0]); + auto* w_grad_value = w_grad->mutable_value(); + phi::DDim temp_dim(w.dims()); + temp_dim[0] = real_rows.size(); + w_grad_value->Resize(temp_dim); + phi::HierarchicalSigmoidGradKernelImpl(ctx, + x, + w, + label, + pre_out, + out_grad, + path, + code, + bias, + num_classes, + remote_prefetch, + trainer_id, + height_sections, + epmap, + table_names, + is_sparse, + x_grad, + w_grad_value, + bias_grad, + w_grad); +} + +} // namespace sr +} // namespace phi + +PD_REGISTER_KERNEL(hierarchical_sigmoid_grad_sr, + CPU, + ALL_LAYOUT, + phi::sr::HierarchicalSigmoidGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.h b/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..557c8b1bc5eed2c64f3a5c16d52cead124815ffc --- /dev/null +++ b/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.h @@ -0,0 +1,45 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/selected_rows.h" + +namespace phi { +namespace sr { + +template +void HierarchicalSigmoidGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& w, + const DenseTensor& label, + const DenseTensor& pre_out, + const DenseTensor& out_grad, + paddle::optional path, + paddle::optional code, + paddle::optional bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + DenseTensor* x_grad, + SelectedRows* w_grad, + DenseTensor* bias_grad); + +} // namespace sr +} // namespace phi diff --git a/paddle/phi/kernels/selected_rows/shape_kernel.cc b/paddle/phi/kernels/selected_rows/shape_kernel.cc index 9bcd5d8544e2d73961d72115023446d427e8895e..67126d82042b28de8c560a55046e50029153290d 100644 --- a/paddle/phi/kernels/selected_rows/shape_kernel.cc +++ b/paddle/phi/kernels/selected_rows/shape_kernel.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/shape_kernel.h" namespace phi { namespace sr { @@ -25,15 +26,7 @@ template void ShapeKernel(const Context& ctx, const SelectedRows& input, DenseTensor* out) { - auto in_var = input; - phi::DDim in_dims; - in_dims = in_var.value().dims(); - auto out_t = out; - out_t->Resize({in_dims.size()}); - auto out_data = ctx.template HostAlloc(out_t); - for (int i = 0; i < in_dims.size(); ++i) { - out_data[i] = in_dims[i]; - } + phi::ShapeKernel(ctx, input.value(), out); } } // namespace sr diff --git a/paddle/phi/kernels/set_value_grad_kernel.h b/paddle/phi/kernels/set_value_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..6a028b0c8dc50fb068de1ded367990c409bd45cb --- /dev/null +++ b/paddle/phi/kernels/set_value_grad_kernel.h @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
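The shape_kernel.cc change above swaps a hand-rolled loop for a call to the dense phi::ShapeKernel on input.value(); either way the result is a 1-D host tensor with one entry per dimension of the value tensor (int32 is assumed here, matching the conventional output dtype of the shape op). A tiny reference computation under those assumptions:

```cpp
#include <cstdint>
#include <vector>

// Reference-only: what sr::ShapeKernel produces for a SelectedRows whose
// value() tensor has dimensions value_dims -- one int32 entry per dimension.
std::vector<int32_t> ShapeOfValue(const std::vector<int64_t>& value_dims) {
  std::vector<int32_t> shape(value_dims.size());
  for (size_t i = 0; i < value_dims.size(); ++i) {
    shape[i] = static_cast<int32_t>(value_dims[i]);
  }
  return shape;  // e.g. a {128, 64} value tensor yields {128, 64}
}
```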
+ +#pragma once + +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void SetValueGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const ScalarArray& starts, + const ScalarArray& ends, + const ScalarArray& steps, + const std::vector& axes, + const std::vector& decrease_axes, + const std::vector& none_axes, + DenseTensor* x_grad, + DenseTensor* value_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/shape_kernel.cu b/paddle/phi/kernels/shape_kernel.cc similarity index 53% rename from paddle/phi/kernels/gpu/shape_kernel.cu rename to paddle/phi/kernels/shape_kernel.cc index 39b6eaeaef2a8e80d204941dc1f3ac92907aa786..dd26a7edc9cdd8e1917bb5d88e957b3e7d545f93 100644 --- a/paddle/phi/kernels/gpu/shape_kernel.cu +++ b/paddle/phi/kernels/shape_kernel.cc @@ -13,12 +13,43 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/kernels/shape_kernel.h" - -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/float16.h" +#include "paddle/phi/backends/all_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/shape_kernel_impl.h" +namespace phi { + +template +void ShapeKernel(const Context& ctx, + const DenseTensor& input, + DenseTensor* out) { + auto in_var = &input; + phi::DDim in_dims; + in_dims = in_var->dims(); + auto out_t = out; + out_t->Resize({in_dims.size()}); + auto out_data = ctx.template HostAlloc(out_t); + for (int i = 0; i < in_dims.size(); ++i) { + out_data[i] = in_dims[i]; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(shape, + CPU, + ALL_LAYOUT, + phi::ShapeKernel, + bool, + int, + int8_t, + uint8_t, + int64_t, + float, + double, + phi::dtype::complex, + phi::dtype::complex) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(shape, GPU, ALL_LAYOUT, @@ -33,3 +64,4 @@ PD_REGISTER_KERNEL(shape, phi::dtype::complex, phi::dtype::complex, phi::dtype::float16) {} +#endif diff --git a/paddle/phi/kernels/softmax_kernel.h b/paddle/phi/kernels/softmax_kernel.h index ca69d652770aacd01191f5c3ca685276f0f2336f..4edd562ca885301b02b8ecc737c8590831e3cac4 100644 --- a/paddle/phi/kernels/softmax_kernel.h +++ b/paddle/phi/kernels/softmax_kernel.h @@ -19,20 +19,10 @@ limitations under the License. 
*/ namespace phi { -template -void SoftmaxRawKernel(const Context& dev_ctx, - const DenseTensor& x, - int axis, - DenseTensor* out); - template void SoftmaxKernel(const Context& dev_ctx, const DenseTensor& x, int axis, - DataType dtype, - DenseTensor* out) { - auto cast_x = phi::Cast(dev_ctx, x, dtype); - phi::SoftmaxRawKernel(dev_ctx, axis, out); -} + DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/sparse/convolution_grad_kernel.h b/paddle/phi/kernels/sparse/convolution_grad_kernel.h index f4265d303d730708f7e6db684accff538f604174..23e059c72e77615e2c24aed961d22b3154c30449 100644 --- a/paddle/phi/kernels/sparse/convolution_grad_kernel.h +++ b/paddle/phi/kernels/sparse/convolution_grad_kernel.h @@ -27,11 +27,12 @@ void Conv3dGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& rulebook, const DenseTensor& kernel, - const SparseCooTensor& out_grad, + const DenseTensor& out_grad, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, const int groups, + const bool subm, DenseTensor* x_grad, DenseTensor* kernel_grad); @@ -40,11 +41,12 @@ std::vector Conv3dGrad(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& rulebook, const DenseTensor& kernel, - const SparseCooTensor& out_grad, + const DenseTensor& out_grad, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, - const int groups) { + const int groups, + const bool subm) { DenseTensor x_grad = phi::Empty(dev_ctx, DenseTensorMeta(x.dtype(), {1}, x.layout())); DenseTensor kernel_grad = phi::Empty( @@ -59,6 +61,7 @@ std::vector Conv3dGrad(const Context& dev_ctx, dilations, strides, groups, + subm, &x_grad, &kernel_grad); std::vector out(2); diff --git a/paddle/phi/kernels/sparse/convolution_kernel.h b/paddle/phi/kernels/sparse/convolution_kernel.h index cfb451afdcbcb007bbce468b3e582db5057796d5..ff2cf94edb5a378b5d43d569a869fcd705ef12bd 100644 --- a/paddle/phi/kernels/sparse/convolution_kernel.h +++ b/paddle/phi/kernels/sparse/convolution_kernel.h @@ -18,105 +18,11 @@ limitations under the License. 
*/ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/sparse/convolution.h" namespace phi { namespace sparse { -struct Dims4D { - int dims[4]; - Dims4D(const int batch, const int x, const int y, const int z) { - dims[0] = batch; - dims[1] = z; - dims[2] = y; - dims[3] = x; - } - HOSTDEVICE const int& operator[](int i) const { return dims[i]; } -}; - -// Judge whether the current position x is in (lower, upper) -inline HOSTDEVICE bool Check(const int& x, - const int& kx, - const int& pad, - const int& stride, - const int dilation, - const int kdim, - const int xdim) { - const int lower = x - dilation * kx + pad; - const int uper = x + (kdim - kx - 1) * dilation - pad; - return (lower >= 0 && lower % stride == 0 && uper < xdim); -} - -// Check whether the current position(x, y, z) is legal: -// Judge the minimum and maximum values at each latitude -inline HOSTDEVICE bool Check(const Dims4D& dims, - const Dims4D& kernel_dims, - const Dims4D& paddings, - const Dims4D& dilations, - const Dims4D& strides, - const int x, - const int y, - const int z, - const int kx, - const int ky, - const int kz) { - bool x_valid = Check( - x, kx, paddings[3], strides[3], dilations[3], kernel_dims[3], dims[3]); - bool y_valid = Check( - y, ky, paddings[2], strides[2], dilations[2], kernel_dims[2], dims[2]); - bool z_valid = Check( - z, kz, paddings[1], strides[1], dilations[1], kernel_dims[1], dims[1]); - return (x_valid && y_valid && z_valid); -} - -template -inline HOSTDEVICE int PointToIndex(const int& batch, - const int& x, - const int& y, - const int& z, - const Dim& dims) { - return batch * dims[1] * dims[2] * dims[3] + z * dims[2] * dims[3] + - y * dims[3] + x; -} - -template -inline HOSTDEVICE void IndexToPoint( - const int index, const Dim& dims, int* batch, int* x, int* y, int* z) { - int n = index; - *x = n % dims[3]; - n /= dims[3]; - *y = n % dims[2]; - n /= dims[2]; - *z = n % dims[1]; - n /= dims[1]; - *batch = n; -} - -inline void GetOutShape(const DDim& x_dims, - const DDim& kernel_dims, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - DDim* out_dims) { - PADDLE_ENFORCE_EQ( - x_dims.size(), - 5, - phi::errors::InvalidArgument("the shape of x should be (N, D, H, W, C)")); - PADDLE_ENFORCE_EQ(kernel_dims.size(), - 5, - phi::errors::InvalidArgument( - "the shape of kernel should be (D, H, W, C, OC)")); - - // infer out shape - (*out_dims)[0] = x_dims[0]; - (*out_dims)[4] = kernel_dims[4]; - for (int i = 1; i < 4; i++) { - (*out_dims)[i] = (x_dims[i] + 2 * paddings[i - 1] - - dilations[i - 1] * (kernel_dims[i - 1] - 1) - 1) / - strides[i - 1] + - 1; - } -} - template void Conv3dKernel(const Context& dev_ctx, const SparseCooTensor& x, @@ -125,6 +31,7 @@ void Conv3dKernel(const Context& dev_ctx, const std::vector& dilations, const std::vector& strides, const int groups, + const bool subm, SparseCooTensor* out, DenseTensor* rulebook); @@ -136,14 +43,23 @@ SparseCooTensor Conv3d(const Context& dev_ctx, const std::vector& dilations, const std::vector& strides, const int groups, + const bool subm, DenseTensor* rulebook) { DenseTensor indices = phi::Empty( dev_ctx, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW)); DenseTensor values = phi::Empty(dev_ctx, DenseTensorMeta(x.dtype(), {1}, x.layout())); SparseCooTensor coo(indices, values, x.dims()); - Conv3dKernel( - dev_ctx, x, kernel, paddings, dilations, strides, groups, &coo, 
rulebook); + Conv3dKernel(dev_ctx, + x, + kernel, + paddings, + dilations, + strides, + groups, + subm, + &coo, + rulebook); return coo; } diff --git a/paddle/phi/kernels/sparse/cpu/convolution.h b/paddle/phi/kernels/sparse/cpu/convolution.h index bcb6db407883ff5a0192699d72c360d1b41200ed..93a335e2f1c35700d2bf5ef54400c52ed54f6be2 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution.h +++ b/paddle/phi/kernels/sparse/cpu/convolution.h @@ -16,8 +16,6 @@ limitations under the License. */ #include -#include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/sparse_coo_tensor.h" @@ -28,42 +26,59 @@ limitations under the License. */ namespace phi { namespace sparse { +using Dims4D = phi::funcs::sparse::Dims4D; + // such as: kernel(3, 3, 3), kernel_size = 27 // counter_per_weight: (kernel_size) // TODO(zhangkaihuo): optimize performance with multithreading template void ProductRuleBook(const Context& dev_ctx, const SparseCooTensor& x, - const DenseTensor& kernel, + const std::vector& kernel_sizes, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, const DDim& out_dims, + const bool subm, DenseTensor* rulebook, DenseTensor* counter_per_kernel) { - const auto& kernel_dims = kernel.dims(); const int64_t non_zero_num = x.nnz(); const auto& non_zero_indices = x.non_zero_indices(); const int* indices_ptr = non_zero_indices.data(); int* counter_ptr = counter_per_kernel->data(); - int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; + int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; memset(counter_ptr, 0, kernel_size * sizeof(int)); int rulebook_len = 0; // calc the rulebook_len const auto& x_dims = x.dims(); const Dims4D c_x_dims(x_dims[0], x_dims[3], x_dims[2], x_dims[1]); - const Dims4D c_kernel_dims(1, kernel_dims[2], kernel_dims[1], kernel_dims[0]); + const Dims4D c_kernel_dims( + 1, kernel_sizes[2], kernel_sizes[1], kernel_sizes[0]); const Dims4D c_out_dims(out_dims[0], out_dims[3], out_dims[2], out_dims[1]); const Dims4D c_paddings(1, paddings[2], paddings[1], paddings[0]); const Dims4D c_strides(1, strides[2], strides[1], strides[0]); const Dims4D c_dilations(1, dilations[2], dilations[1], dilations[0]); + std::set hash_in; + if (subm) { + for (int i = 0; i < non_zero_num; i++) { + int batch = indices_ptr[i]; + int in_z = indices_ptr[i + non_zero_num]; + int in_y = indices_ptr[i + 2 * non_zero_num]; + int in_x = indices_ptr[i + 3 * non_zero_num]; + int index = phi::funcs::sparse::PointToIndex( + batch, in_x, in_y, in_z, x_dims); + hash_in.insert(index); + } + } + auto f_calc_rulebook = [&](int* rulebook_ptr) { int kernel_index = 0, rulebook_index = 0; - for (int kz = 0; kz < kernel_dims[0]; kz++) { - for (int ky = 0; ky < kernel_dims[1]; ky++) { - for (int kx = 0; kx < kernel_dims[2]; kx++) { + for (int kz = 0; kz < kernel_sizes[0]; kz++) { + for (int ky = 0; ky < kernel_sizes[1]; ky++) { + for (int kx = 0; kx < kernel_sizes[2]; kx++) { + ++kernel_index; for (int64_t i = 0; i < non_zero_num; i++) { int batch = indices_ptr[i]; int in_z = indices_ptr[i + non_zero_num]; @@ -72,31 +87,38 @@ void ProductRuleBook(const Context& dev_ctx, int out_z = (in_z + paddings[0] - kz * dilations[0]) / strides[0]; int out_y = (in_y + paddings[1] - ky * dilations[1]) / strides[1]; int out_x = (in_x + paddings[2] - kx * dilations[2]) / strides[2]; - if (Check(c_x_dims, - c_kernel_dims, - 
c_paddings, - c_dilations, - c_strides, - in_x, - in_y, - in_z, - kx, - ky, - kz)) { + if (phi::funcs::sparse::Check(c_x_dims, + c_kernel_dims, + c_paddings, + c_dilations, + c_strides, + in_x, + in_y, + in_z, + kx, + ky, + kz)) { + if (subm) { + int out_index = phi::funcs::sparse::PointToIndex( + batch, out_x, out_y, out_z, out_dims); + if (hash_in.find(out_index) == hash_in.end()) { + continue; + } + } + if (rulebook_ptr == nullptr) { - counter_ptr[kernel_index] += 1; + counter_ptr[kernel_index - 1] += 1; ++rulebook_len; } else { - rulebook_ptr[rulebook_index] = kernel_index; + rulebook_ptr[rulebook_index] = kernel_index - 1; rulebook_ptr[rulebook_index + rulebook_len] = i; // in_i rulebook_ptr[rulebook_index + rulebook_len * 2] = - PointToIndex( + phi::funcs::sparse::PointToIndex( batch, out_x, out_y, out_z, out_dims); // out_index ++rulebook_index; } } } - ++kernel_index; } } } @@ -140,7 +162,7 @@ void UpdateRulebookAndOutIndex(const Context& dev_ctx, for (auto it = out_indexs.begin(); it != out_indexs.end(); it++, i++) { const int index = *it; int batch, x, y, z; - IndexToPoint(index, out_dims, &batch, &x, &y, &z); + phi::funcs::sparse::IndexToPoint(index, out_dims, &batch, &x, &y, &z); out_indices_ptr[i] = batch; out_indices_ptr[i + out_non_zero_num] = z; out_indices_ptr[i + out_non_zero_num * 2] = y; diff --git a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc index 6ee265a329673ad456e0dd491a9544143016aff5..3348d81cf6b4bbffe7f6db24dbe12fef24cadf40 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc @@ -33,11 +33,12 @@ void Conv3dGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& rulebook, const DenseTensor& kernel, - const SparseCooTensor& out_grad, + const DenseTensor& out_grad, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, const int groups, + const bool subm, DenseTensor* x_grad, DenseTensor* kernel_grad) { const auto& kernel_dims = kernel.dims(); @@ -70,32 +71,57 @@ void Conv3dGradKernel(const Context& dev_ctx, T* d_kernel_ptr = kernel_grad->data(); memset(d_kernel_ptr, 0, sizeof(T) * kernel_grad->numel()); - Gather(x.non_zero_elements().data(), - rulebook_ptr + rulebook_len, - rulebook_len, - in_channels, - in_features_ptr); - Gather(out_grad.non_zero_elements().data(), - rulebook_ptr + rulebook_len * 2, - rulebook_len, - out_channels, - out_grad_features_ptr); - + int half_kernel_size = kernel_size / 2; auto blas = phi::funcs::GetBlas(dev_ctx); + x_grad->Resize(x.non_zero_elements().dims()); + dev_ctx.Alloc(x_grad, x_grad->dtype(), sizeof(T) * x_grad->numel()); + T* x_grad_values_ptr = x_grad->data(); + memset(x_grad_values_ptr, 0, sizeof(T) * x_grad->numel()); + memset(d_x_features_ptr, 0, sizeof(T) * d_x_features.numel()); + std::vector offsets(kernel_size + 1), counter(kernel_size, 0); for (int i = 0; i < rulebook_len; i++) { counter[rulebook_ptr[i]] += 1; } - int offset = 0; + int offset = 0, max_count = 0; for (int i = 0; i < kernel_size; i++) { offsets[i] = offset; offset += counter[i]; + if (i < half_kernel_size) { + max_count = std::max(max_count, counter[i]); + } } offsets[kernel_size] = offset; + if (subm) { + phi::funcs::sparse::SubmPreProcess(dev_ctx, + x, + kernel, + out_grad, + in_channels, + out_channels, + half_kernel_size, + kernel_grad, + x_grad); + if (max_count == 0) { + return; + } + } + + Gather(x.non_zero_elements().data(), + rulebook_ptr + 
rulebook_len, + rulebook_len, + in_channels, + in_features_ptr); + Gather(out_grad.data(), + rulebook_ptr + rulebook_len * 2, + rulebook_len, + out_channels, + out_grad_features_ptr); + const T* kernel_ptr = kernel.data(); for (int i = 0; i < kernel_size; i++) { - if (counter[i] <= 0) { + if (counter[i] <= 0 || (subm && i == half_kernel_size)) { continue; } @@ -136,10 +162,6 @@ void Conv3dGradKernel(const Context& dev_ctx, } // 4. scatter - x_grad->Resize(x.non_zero_elements().dims()); - dev_ctx.Alloc(x_grad, x_grad->dtype(), sizeof(T) * x_grad->numel()); - T* x_grad_values_ptr = x_grad->data(); - memset(x_grad_values_ptr, 0, sizeof(T) * x_grad->numel()); Scatter(d_x_features_ptr, rulebook.data() + rulebook_len, rulebook_len, diff --git a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc index 64ef068e03ab53e4338a0f5ba3d5f160a4e66dd5..f022e4ef4bb63617018d6e6ecdf2560b72dead3a 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc @@ -13,8 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/kernels/sparse/cpu/convolution.h" -#include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/kernels/funcs/blas/blas.h" @@ -35,6 +33,7 @@ void Conv3dKernel(const Context& dev_ctx, const std::vector& dilations, const std::vector& strides, const int groups, + const bool subm, SparseCooTensor* out, DenseTensor* rulebook) { // update padding and dilation @@ -45,10 +44,21 @@ void Conv3dKernel(const Context& dev_ctx, const auto& kernel_dims = kernel.dims(); int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; DDim out_dims = {1, 1, 1, 1, 1}; - GetOutShape(x_dims, kernel_dims, paddings, dilations, strides, &out_dims); + std::vector kernel_sizes(kernel_dims.size()); + for (int i = 0; i < kernel_dims.size(); i++) { + kernel_sizes[i] = kernel_dims[i]; + } + + phi::funcs::sparse::GetOutShape( + x_dims, kernel_sizes, paddings, dilations, strides, &out_dims); const int in_channels = kernel_dims[3]; const int out_channels = kernel_dims[4]; + std::vector subm_paddings(paddings), subm_strides(strides); + if (subm) { + phi::funcs::sparse::ResetSubmKernelSizeAndStrides( + kernel.dims(), &subm_paddings, &subm_strides); + } // Second algorithm: // https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf // 1. product rulebook @@ -58,11 +68,12 @@ void Conv3dKernel(const Context& dev_ctx, ProductRuleBook(dev_ctx, x, - kernel, - paddings, + kernel_sizes, + subm_paddings, dilations, - strides, + subm_strides, out_dims, + subm, rulebook, &counter_per_kernel); diff --git a/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..3010d480b55c9583ff5af9271b2e063667a69da7 --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc @@ -0,0 +1,73 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/pooling.h" +#include "paddle/phi/kernels/funcs/sparse/convolution.h" + +namespace phi { +namespace sparse { + +template +void MaxPoolGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const SparseCooTensor& out, + const DenseTensor& out_grad, + const std::vector& kernel_sizes, + DenseTensor* x_grad) { + int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; + const int channels = x.dims()[4]; + int rulebook_len = rulebook.dims()[1]; + const int* rulebook_ptr = rulebook.data(); + std::vector offsets(kernel_size + 1), counter(kernel_size, 0); + for (int i = 0; i < rulebook_len; i++) { + counter[rulebook_ptr[i]] += 1; + } + phi::funcs::sparse::PrefixSum(&counter[0], &offsets[0], kernel_size); + + const T* in_features_ptr = x.non_zero_elements().data(); + const T* out_features_ptr = out.non_zero_elements().data(); + const T* out_grad_ptr = out_grad.data(); + T* x_grad_ptr = x_grad->data(); + memset(x_grad_ptr, 0, sizeof(T) * x_grad->numel()); + + phi::funcs::MaxPoolGrad grad_functor; + for (int i = 0; i < kernel_size; i++) { + for (int j = 0; j < counter[i]; j++) { + int in_i = rulebook_ptr[rulebook_len + offsets[i] + j]; + int out_i = rulebook_ptr[rulebook_len * 2 + offsets[i] + j]; + for (int c = 0; c < channels; c++) { + grad_functor.compute(in_features_ptr[in_i * channels + c], + out_features_ptr[out_i * channels + c], + out_grad_ptr[out_i * channels + c], + 1, + &x_grad_ptr[in_i * channels + c]); + } + } + } +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sparse_maxpool_grad, + CPU, + ALL_LAYOUT, + phi::sparse::MaxPoolGradKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..86971242df5aeed5b0acd74f23db185e02544846 --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc @@ -0,0 +1,108 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/sparse/sparse_pool_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/kernels/funcs/pooling.h" +#include "paddle/phi/kernels/funcs/sparse/convolution.h" +#include "paddle/phi/kernels/sparse/cpu/convolution.h" + +namespace phi { +namespace sparse { + +/** + * x: (N, D, H, W, C) + * kernel: (D, H, W, C, OC) + * out: (N, D, H, W, OC) +**/ +template +void MaxPoolKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + SparseCooTensor* out, + DenseTensor* rulebook) { + const auto& x_dims = x.dims(); + int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; + const std::vector& real_kernel_sizes = + phi::funcs::sparse::PoolResetKernel(kernel_sizes, x_dims[4], x_dims[4]); + DDim out_dims = {1, 1, 1, 1, 1}; + phi::funcs::sparse::GetOutShape( + x_dims, real_kernel_sizes, paddings, dilations, strides, &out_dims); + const int in_channels = real_kernel_sizes[3]; + + DenseTensorMeta counter_meta( + DataType::INT32, {kernel_size}, DataLayout::NCHW); + DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); + + const T* in_features_ptr = x.non_zero_elements().data(); + // 1. product rule book + ProductRuleBook(dev_ctx, + x, + real_kernel_sizes, + paddings, + dilations, + strides, + out_dims, + false, + rulebook, + &counter_per_kernel); + + UpdateRulebookAndOutIndex( + dev_ctx, x, kernel_size, in_channels, out_dims, rulebook, out); + + int rulebook_len = rulebook->dims()[1]; + const int* rulebook_ptr = rulebook->data(); + const int* counter_ptr = counter_per_kernel.data(); + + std::vector offsets(kernel_size + 1); + phi::funcs::sparse::PrefixSum(counter_ptr, &offsets[0], kernel_size); + std::vector out_flags(out->nnz(), false); + + // 2. max pool + T* out_features_ptr = out->mutable_non_zero_elements()->data(); + phi::funcs::MaxPool max_pool_functor; + for (int i = 0; i < kernel_size; i++) { + for (int j = 0; j < counter_ptr[i]; j++) { + int in_i = rulebook_ptr[rulebook_len + offsets[i] + j]; + int out_i = rulebook_ptr[rulebook_len * 2 + offsets[i] + j]; + if (!out_flags[out_i]) { + out_flags[out_i] = true; + memcpy(&out_features_ptr[out_i * in_channels], + &in_features_ptr[in_i * in_channels], + in_channels * sizeof(T)); + } else { + for (int c = 0; c < in_channels; c++) { + max_pool_functor.compute(in_features_ptr[in_i * in_channels + c], + &out_features_ptr[out_i * in_channels + c]); + } + } + } + } +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sparse_maxpool, + CPU, + ALL_LAYOUT, + phi::sparse::MaxPoolKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc index ba89135641e0e67daa84cd526d8b389953ef1862..50e95ee0b8a4876a65b8ba7d09fd2d112eac2b30 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc @@ -14,9 +14,9 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/sparse/sparse_utils_kernel.h" #include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/kernels/funcs/sparse/common_shape.h" namespace phi { namespace sparse { @@ -71,7 +71,8 @@ void DenseToSparseCooKernel(const Context& dev_ctx, int64_t non_zero_num = GetNonZeroNum(x, sparse_dim); const auto place = dev_ctx.GetPlace(); - const auto values_dims = InferDenseDims(x_dims, sparse_dim, non_zero_num); + const auto values_dims = + phi::funcs::sparse::InferDenseDims(x_dims, sparse_dim, non_zero_num); DenseTensorMeta indices_meta(DataType::INT64, {sparse_dim, static_cast(non_zero_num)}, DataLayout::NCHW); diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 03a6aaa68943d7ea8d0ab7c02561a407166e43d5..5b928817f64d748ec824a2c28e569181034d1072 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -23,11 +23,15 @@ limitations under the License. */ #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/kernels/funcs/index_impl.cu.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/primitive/compute_primitives.h" #include "paddle/phi/kernels/sparse/convolution_kernel.h" namespace phi { namespace sparse { +using Dims4D = phi::funcs::sparse::Dims4D; + // TODO(zhangkaihuo): After the GatherCUDAKernel is migrated to phi, replace // this kernel with phi::GatherCUDAKernel; // Vectorization can be used to improve read and write bandwidth @@ -71,7 +75,8 @@ __global__ void ScatterKernel(const T* input, const int non_zero_num, const int rulebook_len, const int channels, - T* out) { + T* out, + const bool subm = false) { int tid = threadIdx.x + blockIdx.x * blockDim.x; for (int i = tid; i < non_zero_num * channels; i += gridDim.x * blockDim.x) { int indices_i = i / channels; @@ -82,6 +87,9 @@ __global__ void ScatterKernel(const T* input, : unique_value[indices_i + 1]; // max(end-start) = kernel_size T sum = static_cast(0); + if (subm) { + sum = out[indices_i * channels + channels_i]; + } for (int j = start; j < end; j++) { const int out_feature_i = out_index[j]; sum += input[out_feature_i * channels + channels_i]; @@ -135,5 +143,494 @@ inline int* SortedAndUniqueIndex(const Context& dev_ctx, return new_end.first; } +template +__global__ void SetFlagAndUpdateCounterKernel(const int* indexs, + const int n, + const int rulebook_len, + const int kernel_size, + T* rulebook_ptr, + int* counter_ptr) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + extern __shared__ int cache_count[]; // kernel_size + for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { + cache_count[i] = 0; + } + __syncthreads(); + + for (int i = tid; i < n; i += gridDim.x * blockDim.x) { + int index = indexs[i]; + int kernel_index = rulebook_ptr[index]; + rulebook_ptr[index + rulebook_len] = -1; + rulebook_ptr[index + 2 * rulebook_len] = -1; + rulebook_ptr[index] = -1; + atomicAdd(&cache_count[kernel_index], 1); + } + __syncthreads(); + + for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { + atomicSub(&counter_ptr[i], cache_count[i]); + } +} + +/** + * @brief: update the out index and indices + * unique_keys: save the index of the output feature list + * unique_values: indiates the index of key before deduplication + * out_indexs: 
indicates the position of the output index in the rulebook + * rulebook_len: indicates the length of rulebook + * out_dims: indicates the output dims + * out_indices: the indices of output, out_indices = IndexToPoint(unique_keys) + * rulebook_out_indexs: the output index in rulebook +**/ +template +__global__ void UpdateIndexKernel(const int* unique_keys, + const int* unique_values, + const int* out_indexs, + const int non_zero_num, + const int rulebook_len, + const Dims4D out_dims, + T* out_indices, + T* rulebook_out_indexs) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) { + const int index = unique_keys[i]; + int batch, x, y, z; + phi::funcs::sparse::IndexToPoint( + index, out_dims, &batch, &x, &y, &z); + // get out indices + out_indices[i] = batch; + out_indices[i + non_zero_num] = z; + out_indices[i + non_zero_num * 2] = y; + out_indices[i + non_zero_num * 3] = x; + + // update rulebook + int start = unique_values[i]; + int end = i == non_zero_num - 1 ? rulebook_len : unique_values[i + 1]; + // max(end-start) = kernel_size + for (int j = start; j < end; j++) { + rulebook_out_indexs[out_indexs[j]] = i; + } + } +} + +// brief: calculation the distance between start and end +template +__global__ void DistanceKernel(const T* start, const T* end, int* distance) { + if (threadIdx.x == 0) { + *distance = end - start; + } +} + +/** + * @brief product rulebook + * for input_i in x_indices: + * if input_i participate in the convolution calculation: + * infer the output_i by input_i and kernel_i + * save output_i + * + * x_indices: the indices of input features + * x_dims: the input dims + * kernel_dims: the kernel dims + * out_dims: the output dims + * non_zero_num: the number of input features + * rulebook: the rulebook to save the kernel index, input index and output index + * counter: save the number of times each location in the kernel participates in + *the caculation +**/ +template +__global__ void ProductRuleBookKernel(const T* x_indices, + const Dims4D x_dims, + const Dims4D kernel_dims, + const Dims4D out_dims, + const int64_t non_zero_num, + const Dims4D paddings, + const Dims4D dilations, + const Dims4D strides, + const bool subm, + T* rulebook, + int* counter, + int* in_indexs) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + extern __shared__ int counter_buf[]; // kernel_size + const int kernel_size = kernel_dims[3] * kernel_dims[2] * kernel_dims[1]; + const int offset = kernel_size * non_zero_num; + for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { + counter_buf[i] = 0; + } + __syncthreads(); + + for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) { + int kernel_index = 0; + int batch = x_indices[i]; + int in_z = x_indices[i + non_zero_num]; + int in_y = x_indices[i + 2 * non_zero_num]; + int in_x = x_indices[i + 3 * non_zero_num]; + if (subm) { + in_indexs[i] = PointToIndex(batch, in_x, in_y, in_z, x_dims); + } + for (int kz = 0; kz < kernel_dims[1]; kz++) { + for (int ky = 0; ky < kernel_dims[2]; ky++) { + for (int kx = 0; kx < kernel_dims[3]; kx++) { + int in_i = -1, out_index = -1, kernel_i = -1; + if (phi::funcs::sparse::Check(x_dims, + kernel_dims, + paddings, + dilations, + strides, + in_x, + in_y, + in_z, + kx, + ky, + kz)) { + int out_z = (in_z + paddings[1] - kz * dilations[1]) / strides[1]; + int out_y = (in_y + paddings[2] - ky * dilations[2]) / strides[2]; + int out_x = (in_x + paddings[3] - kx * dilations[3]) / strides[3]; + in_i = i; + out_index = 
phi::funcs::sparse::PointToIndex( + batch, out_x, out_y, out_z, out_dims); + atomicAdd(&counter_buf[kernel_index], 1); + kernel_i = kernel_index; + } + rulebook[kernel_index * non_zero_num + i] = kernel_i; + rulebook[kernel_index * non_zero_num + offset + i] = in_i; + rulebook[kernel_index * non_zero_num + offset * 2 + i] = out_index; + ++kernel_index; + } + } + } + } + __syncthreads(); + for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { + atomicAdd(&counter[i], counter_buf[i]); + } +} + +// the basic algorithm can refer to convolution_kernel.cc or +// the second paper +// example: +// 1. the rulebook: +// the kernel_index: 0, 0, 0, 1, 1, 1, 2, 2, .... +// the out_index(key): 20, 30, 33, 30, 33, 20, 25 +// 2. mark the index of out_index(value): 0, 1, 2, 3, 4, 5, 6, .... +// 3. sorted the (key, value) +// 4. unique the (key, value): +// unique_key: 20, 25, 30, 33 +// unique_values: 0, 2, 3, 5 +// the index of unique_values is: 0, 1, 2, 3 +// 5. update the out_index by unique_key, uniqe_value and the index of +// unique_value: +// the new out_index: 0, 2, 3, 2, 3, 0, 1 +template +int ProductRuleBook(const Context& dev_ctx, + const SparseCooTensor& x, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const DDim& out_dims, + const bool subm, + DenseTensor* rulebook, + DenseTensor* counter_per_kernel, + DenseTensor* offsets_per_kernel, + DenseTensor* out_index, + DenseTensor* unique_key, + DenseTensor* unique_value, + SparseCooTensor* out, + std::vector* h_counter, + std::vector* h_offsets) { + const int64_t non_zero_num = x.nnz(); + const auto& non_zero_indices = x.non_zero_indices(); + const int* indices_ptr = non_zero_indices.data(); + DenseTensor in_indexs = phi::Empty( + dev_ctx, DenseTensorMeta(DataType::INT32, {x.nnz()}, DataLayout::NCHW)); + int* counter_ptr = counter_per_kernel->data(); + int* offsets_ptr = offsets_per_kernel->data(); + int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; + const int rulebook_rows = 3; + const int rulebook_cols = kernel_size * non_zero_num; + rulebook->ResizeAndAllocate({rulebook_rows, rulebook_cols}); + int* rulebook_ptr = rulebook->data(); + + const auto x_dims = x.dims(); + Dims4D d_x_dims(x_dims[0], x_dims[3], x_dims[2], x_dims[1]); + Dims4D d_kernel_dims(1, kernel_sizes[2], kernel_sizes[1], kernel_sizes[0]); + Dims4D d_out_dims(out_dims[0], out_dims[3], out_dims[2], out_dims[1]); + Dims4D d_paddings(1, paddings[2], paddings[1], paddings[0]); + Dims4D d_strides(1, strides[2], strides[1], strides[0]); + Dims4D d_dilations(1, dilations[2], dilations[1], dilations[0]); + + // 1. product rule book + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, counter_per_kernel, 0); + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); + + ProductRuleBookKernel<<>>(indices_ptr, + d_x_dims, + d_kernel_dims, + d_out_dims, + non_zero_num, + d_paddings, + d_dilations, + d_strides, + subm, + rulebook_ptr, + counter_ptr, + in_indexs.data()); + +// 2. 
remove -1 +#ifdef PADDLE_WITH_HIP + int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()), +#else + int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()), +#endif + rulebook_ptr, + rulebook_ptr + rulebook_rows * rulebook_cols, + -1); + + DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( + rulebook_ptr, last, rulebook_ptr + 3 * kernel_size * non_zero_num - 1); + int rulebook_len = 0; + phi::backends::gpu::GpuMemcpyAsync( + &rulebook_len, + rulebook_ptr + 3 * kernel_size * non_zero_num - 1, + sizeof(int), +#ifdef PADDLE_WITH_HIP + hipMemcpyDeviceToHost, +#else + cudaMemcpyDeviceToHost, +#endif + dev_ctx.stream()); + rulebook_len /= 3; + dev_ctx.Wait(); + + if (subm) { + // At present, hashtable is not used to map the input and output indexes. + // At present, the intermediate output index is generated by normal + // convolution, + // and then the intermediate output index is subtracted from the input index + // to obain the rulebook. + // get difference + int32_t* A_key_ptr = rulebook_ptr + 2 * rulebook_len; + int32_t* B_key_ptr = in_indexs.data(); + DenseTensor A_val = phi::Empty( + dev_ctx, + DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW)); + DenseTensor B_val = phi::Empty( + dev_ctx, DenseTensorMeta(DataType::INT32, {x.nnz()}, DataLayout::NCHW)); + phi::IndexKernel>( + dev_ctx, &A_val, kps::IdentityFunctor()); + phi::IndexKernel>( + dev_ctx, &B_val, kps::IdentityFunctor()); + DenseTensor key_result = phi::Empty( + dev_ctx, + DenseTensorMeta(DataType::INT32, {rulebook_len + 1}, DataLayout::NCHW)); + DenseTensor val_result = phi::Empty( + dev_ctx, + DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW)); + +#ifdef PADDLE_WITH_HIP + thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), +#endif + counter_ptr, + counter_ptr + kernel_size, + offsets_ptr); + std::vector offsets(kernel_size, 0); + // TODO(zhangkaihuo): used unified memcpy interface + phi::backends::gpu::GpuMemcpyAsync(offsets.data(), + offsets_ptr, + kernel_size * sizeof(int), +#ifdef PADDLE_WITH_HIP + hipMemcpyDeviceToHost, +#else + cudaMemcpyDeviceToHost, +#endif + dev_ctx.stream()); + dev_ctx.Wait(); + + thrust::pair end; + // Because set_diff does not support duplicate data, set_diff is performed + // separately for each segment of data. + // TODO(zhangkaihuo): Using hashtable here may get better performance, + // further tests ared needed. + for (int i = 0; i < kernel_size; i++) { + int start = offsets[i]; + int stop = i == kernel_size - 1 ? rulebook_len : offsets[i + 1]; + int* key_result_start = (i == 0 ? key_result.data() : end.first); + int* val_result_start = i == 0 ? 
val_result.data() : end.second; + end = +#ifdef PADDLE_WITH_HIP + thrust::set_difference_by_key(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::set_difference_by_key(thrust::cuda::par.on(dev_ctx.stream()), +#endif + A_key_ptr + start, + A_key_ptr + stop, + B_key_ptr, + B_key_ptr + x.nnz(), + A_val.data() + start, + B_val.data(), + key_result_start, + val_result_start); + } + + DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( + key_result.data(), + end.first, + key_result.data() + rulebook_len); + int len = 0; + phi::backends::gpu::GpuMemcpyAsync(&len, + key_result.data() + rulebook_len, + sizeof(int), +#ifdef PADDLE_WITH_HIP + hipMemcpyDeviceToHost, +#else + cudaMemcpyDeviceToHost, +#endif + dev_ctx.stream()); + dev_ctx.Wait(); + // set the diff value = -1, and update counter + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, len, 1); + SetFlagAndUpdateCounterKernel<<>>( + val_result.data(), + len, + rulebook_len, + kernel_size, + rulebook_ptr, + counter_ptr); +// remove -1 +#ifdef PADDLE_WITH_HIP + int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()), +#else + int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()), +#endif + rulebook_ptr, + rulebook_ptr + 3 * rulebook_len, + -1); + DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( + rulebook_ptr, last, key_result.data() + rulebook_len); + phi::backends::gpu::GpuMemcpyAsync(&rulebook_len, + key_result.data() + rulebook_len, + sizeof(int), +#ifdef PADDLE_WITH_HIP + hipMemcpyDeviceToHost, +#else + cudaMemcpyDeviceToHost, +#endif + dev_ctx.stream()); + dev_ctx.Wait(); + rulebook_len /= 3; + } + +#ifdef PADDLE_WITH_HIP + thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), +#endif + counter_ptr, + counter_ptr + kernel_size, + offsets_ptr); + +#ifdef PADDLE_WITH_HIP + phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], + counter_ptr, + kernel_size * sizeof(int), + hipMemcpyDeviceToHost, + dev_ctx.stream()); + phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], + offsets_ptr, + kernel_size * sizeof(int), + hipMemcpyDeviceToHost, + dev_ctx.stream()); +#else + phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], + counter_ptr, + kernel_size * sizeof(int), + cudaMemcpyDeviceToHost, + dev_ctx.stream()); + phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], + offsets_ptr, + kernel_size * sizeof(int), + cudaMemcpyDeviceToHost, + dev_ctx.stream()); +#endif + rulebook->Resize({rulebook_rows, rulebook_len}); + + // 3. 
sorted or merge the out index + out_index->ResizeAndAllocate({rulebook_len}); + unique_value->ResizeAndAllocate({rulebook_len}); + unique_key->ResizeAndAllocate({rulebook_len}); + int* out_index_ptr = out_index->data(); + int* unique_value_ptr = unique_value->data(); + int* unique_key_ptr = unique_key->data(); + + int* new_end = SortedAndUniqueIndex(dev_ctx, + rulebook_ptr + 2 * rulebook_len, + rulebook_len, + out_index, + unique_key, + unique_value); + // thrust::distance doesn't support stream parameters + // const int out_non_zero_num = thrust::distance(unique_key_ptr, + // new_end.first); + DistanceKernel<<<1, 1>>>( + unique_key_ptr, + new_end, + rulebook_ptr + rulebook_rows * rulebook_cols - 1); + int out_non_zero_num = 0; +#ifdef PADDLE_WITH_HIP + phi::backends::gpu::GpuMemcpyAsync( + &out_non_zero_num, + rulebook_ptr + rulebook_rows * rulebook_cols - 1, + sizeof(int), + hipMemcpyDeviceToHost, + dev_ctx.stream()); +#else + phi::backends::gpu::GpuMemcpyAsync( + &out_non_zero_num, + rulebook_ptr + rulebook_rows * rulebook_cols - 1, + sizeof(int), + cudaMemcpyDeviceToHost, + dev_ctx.stream()); +#endif + dev_ctx.Wait(); + + // 5. update out_indices and rulebook by unique_value_ptr + const int64_t sparse_dim = 4; + DenseTensorMeta indices_meta( + DataType::INT32, {sparse_dim, out_non_zero_num}, DataLayout::NCHW); + DenseTensorMeta values_meta( + x.dtype(), {out_non_zero_num, kernel_sizes[4]}, x.layout()); + phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta)); + phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta)); + + int* out_indices_ptr = out_indices.data(); + + config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_non_zero_num, 1); + UpdateIndexKernel<<>>(unique_key_ptr, + unique_value_ptr, + out_index_ptr, + out_non_zero_num, + rulebook_len, + d_out_dims, + out_indices_ptr, + rulebook_ptr + 2 * rulebook_len); + out->SetMember(out_indices, out_values, out_dims, true); + return rulebook_len; +} + } // namespace sparse } // namespace phi diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index 861f18f36e632c88147b38e8a9203384050293bb..4db0a0b0011b5a664b66d54f6d42f2e1954ccd12 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -38,11 +38,12 @@ void Conv3dGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& rulebook, const DenseTensor& kernel, - const SparseCooTensor& out_grad, + const DenseTensor& out_grad, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, const int groups, + const bool subm, DenseTensor* x_grad, DenseTensor* kernel_grad) { const auto& kernel_dims = kernel.dims(); @@ -69,37 +70,18 @@ void Conv3dGradKernel(const Context& dev_ctx, T* in_features_ptr = in_features.data(); T* d_x_features_ptr = d_x_features.data(); T* out_grad_features_ptr = out_grad_features.data(); - kernel_grad->Resize(kernel_dims); - dev_ctx.Alloc( - kernel_grad, kernel_grad->dtype(), kernel_grad->numel() * sizeof(T)); + kernel_grad->ResizeAndAllocate(kernel_dims); T* d_kernel_ptr = kernel_grad->data(); phi::funcs::SetConstant set_zero; set_zero(dev_ctx, kernel_grad, static_cast(0.0f)); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, rulebook_len * in_channels, 1); - GatherKernel<<>>(x.non_zero_elements().data(), - rulebook_ptr + rulebook_len, - in_features_ptr, - rulebook_len, - in_channels); - - config = 
phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, rulebook_len * out_channels, 1); - GatherKernel<<>>( - out_grad.non_zero_elements().data(), - rulebook_ptr + rulebook_len * 2, - out_grad_features_ptr, - rulebook_len, - out_channels); - + int half_kernel_size = kernel_size / 2; auto blas = phi::funcs::GetBlas(dev_ctx); + x_grad->ResizeAndAllocate(x.non_zero_elements().dims()); + T* x_grad_values_ptr = x_grad->data(); + set_zero(dev_ctx, x_grad, static_cast(0.0f)); + set_zero(dev_ctx, &d_x_features, static_cast(0.0f)); + std::vector offsets(kernel_size + 1), counter(kernel_size, 0), h_counter(rulebook_len, 0); phi::backends::gpu::GpuMemcpyAsync(&h_counter[0], @@ -117,16 +99,56 @@ void Conv3dGradKernel(const Context& dev_ctx, for (int i = 0; i < rulebook_len; i++) { counter[h_counter[i]] += 1; } - int offset = 0; + int offset = 0, max_count = 0; for (int i = 0; i < kernel_size; i++) { offsets[i] = offset; offset += counter[i]; + if (i < half_kernel_size) { + max_count = std::max(max_count, counter[i]); + } } offsets[kernel_size] = offset; + if (subm) { + phi::funcs::sparse::SubmPreProcess(dev_ctx, + x, + kernel, + out_grad, + in_channels, + out_channels, + half_kernel_size, + kernel_grad, + x_grad); + if (max_count == 0) { + return; + } + } + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, rulebook_len * in_channels, 1); + GatherKernel<<>>(x.non_zero_elements().data(), + rulebook_ptr + rulebook_len, + in_features_ptr, + rulebook_len, + in_channels); + + config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, rulebook_len * out_channels, 1); + GatherKernel<<>>(out_grad.data(), + rulebook_ptr + rulebook_len * 2, + out_grad_features_ptr, + rulebook_len, + out_channels); + const T* kernel_ptr = kernel.data(); for (int i = 0; i < kernel_size; i++) { - if (counter[i] <= 0) { + if (counter[i] <= 0 || (subm && i == half_kernel_size)) { continue; } @@ -167,19 +189,11 @@ void Conv3dGradKernel(const Context& dev_ctx, } // 4. scatter - x_grad->Resize(x.non_zero_elements().dims()); - dev_ctx.Alloc(x_grad, x_grad->dtype(), sizeof(T) * x_grad->numel()); - T* x_grad_values_ptr = x_grad->data(); - - DenseTensor out_index = phi::Empty( - dev_ctx, - DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW)); - DenseTensor unique_key = phi::Empty( - dev_ctx, - DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW)); - DenseTensor unique_value = phi::Empty( - dev_ctx, - DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW)); + x_grad->ResizeAndAllocate(x.non_zero_elements().dims()); + DenseTensorMeta index_meta(DataType::INT32, {rulebook_len}, DataLayout::NCHW); + DenseTensor out_index = phi::Empty(dev_ctx, std::move(index_meta)); + DenseTensor unique_key = phi::Empty(dev_ctx, std::move(index_meta)); + DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta)); SortedAndUniqueIndex(dev_ctx, rulebook_ptr + rulebook_len, @@ -200,7 +214,8 @@ void Conv3dGradKernel(const Context& dev_ctx, x.nnz(), rulebook_len, in_channels, - x_grad_values_ptr); + x_grad_values_ptr, + subm); } } // namespace sparse diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index 4a533d9d1d5e8f6090976ab63a86b01c1d518c8d..214e689e9370a313e66be0281db177407d7b87f0 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -12,345 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -#include -#include -#include -#include - -#include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/backends/gpu/gpu_info.h" -#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/primitive/compute_primitives.h" #include "paddle/phi/kernels/sparse/convolution_kernel.h" #include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" namespace phi { namespace sparse { -/** - * @brief: update the out index and indices - * unique_keys: save the index of the output feature list - * unique_values: indiates the index of key before deduplication - * out_indexs: indicates the position of the output index in the rulebook - * rulebook_len: indicates the length of rulebook - * out_dims: indicates the output dims - * out_indices: the indices of output, out_indices = IndexToPoint(unique_keys) - * rulebook_out_indexs: the output index in rulebook -**/ -__global__ void UpdateIndexKernel(const int* unique_keys, - const int* unique_values, - const int* out_indexs, - const int non_zero_num, - const int rulebook_len, - const Dims4D out_dims, - int* out_indices, - int* rulebook_out_indexs) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) { - const int index = unique_keys[i]; - int batch, x, y, z; - IndexToPoint(index, out_dims, &batch, &x, &y, &z); - // get out indices - out_indices[i] = batch; - out_indices[i + non_zero_num] = z; - out_indices[i + non_zero_num * 2] = y; - out_indices[i + non_zero_num * 3] = x; - - // update rulebook - int start = unique_values[i]; - int end = i == non_zero_num - 1 ? 
rulebook_len : unique_values[i + 1]; - // max(end-start) = kernel_size - for (int j = start; j < end; j++) { - rulebook_out_indexs[out_indexs[j]] = i; - } - } -} - -/** - * @brief product rulebook - * for input_i in x_indices: - * if input_i participate in the convolution calculation: - * infer the output_i by input_i and kernel_i - * save output_i - * - * x_indices: the indices of input features - * x_dims: the input dims - * kernel_dims: the kernel dims - * out_dims: the output dims - * non_zero_num: the number of input features - * rulebook: the rulebook to save the kernel index, input index and output index - * counter: save the number of times each location in the kernel participates in - *the caculation -**/ -__global__ void ProductRuleBookKernel(const int* x_indices, - const Dims4D x_dims, - const Dims4D kernel_dims, - const Dims4D out_dims, - const int64_t non_zero_num, - const Dims4D paddings, - const Dims4D dilations, - const Dims4D strides, - int* rulebook, - int* counter) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - extern __shared__ int counter_buf[]; // kernel_size - const int kernel_size = kernel_dims[3] * kernel_dims[2] * kernel_dims[1]; - const int offset = kernel_size * non_zero_num; - for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { - counter_buf[i] = 0; - } - __syncthreads(); - - for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) { - int kernel_index = 0; - for (int kz = 0; kz < kernel_dims[1]; kz++) { - for (int ky = 0; ky < kernel_dims[2]; ky++) { - for (int kx = 0; kx < kernel_dims[3]; kx++) { - int batch = x_indices[i]; - int in_z = x_indices[i + non_zero_num]; - int in_y = x_indices[i + 2 * non_zero_num]; - int in_x = x_indices[i + 3 * non_zero_num]; - int in_i = -1, out_index = -1, kernel_i = -1; - if (Check(x_dims, - kernel_dims, - paddings, - dilations, - strides, - in_x, - in_y, - in_z, - kx, - ky, - kz)) { - int out_z = (in_z + paddings[1] - kz * dilations[1]) / strides[1]; - int out_y = (in_y + paddings[2] - ky * dilations[2]) / strides[2]; - int out_x = (in_x + paddings[3] - kx * dilations[3]) / strides[3]; - in_i = i; - out_index = - PointToIndex(batch, out_x, out_y, out_z, out_dims); - atomicAdd(&counter_buf[kernel_index], 1); - kernel_i = kernel_index; - } - rulebook[kernel_index * non_zero_num + i] = kernel_i; - rulebook[kernel_index * non_zero_num + offset + i] = in_i; - rulebook[kernel_index * non_zero_num + offset * 2 + i] = out_index; - ++kernel_index; - } - } - } - } - __syncthreads(); - for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { - atomicAdd(&counter[i], counter_buf[i]); - } -} - -// brief: calculation the distance between start and end -__global__ void DistanceKernel(const int* start, - const int* end, - int* distance) { - if (threadIdx.x == 0) { - *distance = end - start; - } -} - -// the basic algorithm can refer to convolution_kernel.cc or -// the second paper -// example: -// 1. the rulebook: -// the kernel_index: 0, 0, 0, 1, 1, 1, 2, 2, .... -// the out_index(key): 20, 30, 33, 30, 33, 20, 25 -// 2. mark the index of out_index(value): 0, 1, 2, 3, 4, 5, 6, .... -// 3. sorted the (key, value) -// 4. unique the (key, value): -// unique_key: 20, 25, 30, 33 -// unique_values: 0, 2, 3, 5 -// the index of unique_values is: 0, 1, 2, 3 -// 5. 
update the out_index by unique_key, uniqe_value and the index of -// unique_value: -// the new out_index: 0, 2, 3, 2, 3, 0, 1 -template -int ProductRuleBook(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const DDim& out_dims, - DenseTensor* rulebook, - DenseTensor* counter_per_kernel, - DenseTensor* offsets_per_kernel, - DenseTensor* out_index, - DenseTensor* unique_key, - DenseTensor* unique_value, - SparseCooTensor* out, - std::vector* h_counter, - std::vector* h_offsets) { - const auto& kernel_dims = kernel.dims(); - const int64_t non_zero_num = x.nnz(); - const auto& non_zero_indices = x.non_zero_indices(); - const int* indices_ptr = non_zero_indices.data(); - int* counter_ptr = counter_per_kernel->data(); - int* offsets_ptr = offsets_per_kernel->data(); - int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; - const int rulebook_rows = 3; - const int rulebook_cols = kernel_size * non_zero_num; - rulebook->ResizeAndAllocate({rulebook_rows, rulebook_cols}); - dev_ctx.Alloc(rulebook, rulebook->dtype(), sizeof(int) * rulebook->numel()); - int* rulebook_ptr = rulebook->data(); - - const auto x_dims = x.dims(); - Dims4D d_x_dims(x_dims[0], x_dims[3], x_dims[2], x_dims[1]); - Dims4D d_kernel_dims(1, kernel_dims[2], kernel_dims[1], kernel_dims[0]); - Dims4D d_out_dims(out_dims[0], out_dims[3], out_dims[2], out_dims[1]); - Dims4D d_paddings(1, paddings[2], paddings[1], paddings[0]); - Dims4D d_strides(1, strides[2], strides[1], strides[0]); - Dims4D d_dilations(1, dilations[2], dilations[1], dilations[0]); - - // 1. product rule book - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, counter_per_kernel, 0); - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); - - ProductRuleBookKernel<<>>(indices_ptr, - d_x_dims, - d_kernel_dims, - d_out_dims, - non_zero_num, - d_paddings, - d_dilations, - d_strides, - rulebook_ptr, - counter_ptr); - -// 2. remove -1 -#ifdef PADDLE_WITH_HIP - int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()), -#else - int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()), -#endif - rulebook_ptr, - rulebook_ptr + rulebook_rows * rulebook_cols, - -1); - -#ifdef PADDLE_WITH_HIP - thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), -#else - thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), -#endif - counter_ptr, - counter_ptr + kernel_size, - offsets_ptr); - -#ifdef PADDLE_WITH_HIP - phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], - counter_ptr, - kernel_size * sizeof(int), - hipMemcpyDeviceToHost, - dev_ctx.stream()); - phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], - offsets_ptr, - kernel_size * sizeof(int), - hipMemcpyDeviceToHost, - dev_ctx.stream()); -#else - phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], - counter_ptr, - kernel_size * sizeof(int), - cudaMemcpyDeviceToHost, - dev_ctx.stream()); - phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], - offsets_ptr, - kernel_size * sizeof(int), - cudaMemcpyDeviceToHost, - dev_ctx.stream()); -#endif - dev_ctx.Wait(); - int rulebook_len = - (*h_counter)[kernel_size - 1] + (*h_offsets)[kernel_size - 1]; - rulebook->Resize({rulebook_rows, rulebook_len}); - - // 3. 
sorted or merge the out index - out_index->ResizeAndAllocate({rulebook_len}); - unique_value->ResizeAndAllocate({rulebook_len}); - unique_key->ResizeAndAllocate({rulebook_len}); - dev_ctx.Alloc( - out_index, out_index->dtype(), sizeof(int) * out_index->numel()); - int* out_index_ptr = out_index->data(); - dev_ctx.Alloc( - unique_value, unique_value->dtype(), sizeof(int) * unique_value->numel()); - int* unique_value_ptr = unique_value->data(); - dev_ctx.Alloc( - unique_key, unique_key->dtype(), sizeof(int) * unique_key->numel()); - int* unique_key_ptr = unique_key->data(); - - int* new_end = SortedAndUniqueIndex(dev_ctx, - rulebook_ptr + 2 * rulebook_len, - rulebook_len, - out_index, - unique_key, - unique_value); - // thrust::distance doesn't support stream parameters - // const int out_non_zero_num = thrust::distance(unique_key_ptr, - // new_end.first); - DistanceKernel<<<1, 1>>>(unique_key_ptr, - new_end, - rulebook_ptr + rulebook_rows * rulebook_cols - 1); - int out_non_zero_num = 0; -#ifdef PADDLE_WITH_HIP - phi::backends::gpu::GpuMemcpyAsync( - &out_non_zero_num, - rulebook_ptr + rulebook_rows * rulebook_cols - 1, - sizeof(int), - hipMemcpyDeviceToHost, - dev_ctx.stream()); -#else - phi::backends::gpu::GpuMemcpyAsync( - &out_non_zero_num, - rulebook_ptr + rulebook_rows * rulebook_cols - 1, - sizeof(int), - cudaMemcpyDeviceToHost, - dev_ctx.stream()); -#endif - dev_ctx.Wait(); - - // 5. update out_indices and rulebook by unique_value_ptr - const int64_t sparse_dim = 4; - DenseTensorMeta indices_meta( - DataType::INT32, {sparse_dim, out_non_zero_num}, DataLayout::NCHW); - DenseTensorMeta values_meta( - x.dtype(), {out_non_zero_num, kernel_dims[4]}, x.layout()); - phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta)); - phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta)); - - int* out_indices_ptr = out_indices.data(); - - config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_non_zero_num, 1); - UpdateIndexKernel<<>>(unique_key_ptr, - unique_value_ptr, - out_index_ptr, - out_non_zero_num, - rulebook_len, - d_out_dims, - out_indices_ptr, - rulebook_ptr + 2 * rulebook_len); - out->SetMember(out_indices, out_values, out_dims, true); - return rulebook_len; -} - /** * x: (N, D, H, W, C) * kernel: (D, H, W, C, OC) @@ -364,6 +35,7 @@ void Conv3dKernel(const Context& dev_ctx, const std::vector& dilations, const std::vector& strides, const int groups, + const bool subm, SparseCooTensor* out, DenseTensor* rulebook) { // update padding and dilation @@ -374,8 +46,12 @@ void Conv3dKernel(const Context& dev_ctx, const auto& kernel_dims = kernel.dims(); int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; DDim out_dims = {1, 1, 1, 1, 1}; - GetOutShape(x_dims, kernel_dims, paddings, dilations, strides, &out_dims); - out->set_dims(out_dims); + std::vector kernel_sizes(kernel_dims.size()); + for (int i = 0; i < kernel_dims.size(); i++) { + kernel_sizes[i] = kernel_dims[i]; + } + phi::funcs::sparse::GetOutShape( + x_dims, kernel_sizes, paddings, dilations, strides, &out_dims); const int in_channels = kernel_dims[3]; const int out_channels = kernel_dims[4]; std::vector offsets(kernel_size + 1), h_counter(kernel_size); @@ -389,20 +65,25 @@ void Conv3dKernel(const Context& dev_ctx, DataType::INT32, {kernel_size}, DataLayout::NCHW); DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, std::move(offsets_meta)); - DenseTensor out_index = phi::Empty( - dev_ctx, 
DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW)); - DenseTensor unique_key = phi::Empty( - dev_ctx, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW)); - DenseTensor unique_value = phi::Empty( - dev_ctx, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW)); + DenseTensorMeta index_meta(DataType::INT32, {1}, DataLayout::NCHW); + DenseTensor out_index = phi::Empty(dev_ctx, std::move(index_meta)); + DenseTensor unique_key = phi::Empty(dev_ctx, std::move(index_meta)); + DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta)); + + std::vector subm_paddings(paddings), subm_strides(strides); + if (subm) { + phi::funcs::sparse::ResetSubmKernelSizeAndStrides( + kernel.dims(), &subm_paddings, &subm_strides); + } int n = ProductRuleBook(dev_ctx, x, - kernel, - paddings, + kernel_sizes, + subm_paddings, dilations, - strides, + subm_strides, out_dims, + subm, rulebook, &counter_per_kernel, &offsets_per_kernel, @@ -428,6 +109,8 @@ void Conv3dKernel(const Context& dev_ctx, phi::Empty(dev_ctx, std::move(out_features_meta)); T* in_features_ptr = in_features.data(); T* out_features_ptr = out_features.data(); + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, &out_features, static_cast(0.0f)); auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels, 1); diff --git a/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..1048dd1be0c01c1fa40a8fb2bcab4dca01837d3c --- /dev/null +++ b/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu @@ -0,0 +1,120 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/pooling.h" +#include "paddle/phi/kernels/funcs/sparse/convolution.h" + +#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h" + +namespace phi { +namespace sparse { + +template +__global__ void MaxPoolGradCudaKernel(const T* in_features_ptr, + const T* out_features_ptr, + const T* out_grad_ptr, + const int* rulebook_ptr, + const int n, + const int rulebook_len, + const int channels, + T* x_grad_ptr) { + phi::funcs::MaxPoolGrad grad_functor; + CUDA_KERNEL_LOOP_TYPE(i, n * channels, int64_t) { + int real_i = i / channels; + int c = i - real_i * channels; + int in_i = rulebook_ptr[real_i]; + int out_i = rulebook_ptr[real_i + rulebook_len]; + grad_functor.compute(in_features_ptr[in_i * channels + c], + out_features_ptr[out_i * channels + c], + out_grad_ptr[out_i * channels + c], + 1, + &x_grad_ptr[in_i * channels + c]); + } +} + +template +void MaxPoolGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const SparseCooTensor& out, + const DenseTensor& out_grad, + const std::vector& kernel_sizes, + DenseTensor* x_grad) { + int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; + const int in_channels = x.dims()[4]; + int rulebook_len = rulebook.dims()[1]; + const int* rulebook_ptr = rulebook.data(); + std::vector offsets(kernel_size + 1), counter(kernel_size, 0), + h_counter(kernel_size); + phi::backends::gpu::GpuMemcpyAsync(&h_counter[0], + rulebook_ptr, + rulebook_len * sizeof(int), +#ifdef PADDLE_WITH_HIP + hipMemcpyDeviceToHost, +#else + cudaMemcpyDeviceToHost, +#endif + + dev_ctx.stream()); + dev_ctx.Wait(); + for (int i = 0; i < rulebook_len; i++) { + counter[h_counter[i]] += 1; + } + phi::funcs::sparse::PrefixSum(&counter[0], &offsets[0], kernel_size); + + const T* in_features_ptr = x.non_zero_elements().data(); + const T* out_features_ptr = out.non_zero_elements().data(); + const T* out_grad_ptr = out_grad.data(); + T* x_grad_ptr = x_grad->data(); + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, x_grad, static_cast(0.0f)); + + for (int i = 0; i < kernel_size; i++) { + if (counter[i] <= 0) { + continue; + } + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, counter[i] * in_channels, 1); + MaxPoolGradCudaKernel<<>>( + in_features_ptr, + out_features_ptr, + out_grad_ptr, + rulebook_ptr + offsets[i] + rulebook_len, + counter[i], + rulebook_len, + in_channels, + x_grad_ptr); + } +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sparse_maxpool_grad, + GPU, + ALL_LAYOUT, + phi::sparse::MaxPoolGradKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..0f6a0d13b1ddbd375a90808789a61e0cb045a7c9 --- /dev/null +++ b/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu @@ -0,0 +1,140 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/kernels/funcs/pooling.h" +#include "paddle/phi/kernels/funcs/sparse/convolution.h" +#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" +#include "paddle/phi/kernels/sparse/sparse_pool_kernel.h" + +namespace phi { +namespace sparse { + +template +__global__ void MaxPoolCudaKernel(const T* in_features_ptr, + const int* rulebook_ptr, + const int n, + const int rulebook_len, + const int channels, + T* out_features_ptr) { + phi::funcs::MaxPool max_pool_functor; + CUDA_KERNEL_LOOP_TYPE(i, n * channels, int64_t) { + int real_i = i / channels; + int channel_i = i - real_i * channels; + int in_i = rulebook_ptr[real_i]; + int out_i = rulebook_ptr[real_i + rulebook_len]; + max_pool_functor.compute(in_features_ptr[in_i * channels + channel_i], + &out_features_ptr[out_i * channels + channel_i]); + } +} + +/** + * x: (N, D, H, W, C) + * kernel: (D, H, W, C, OC) + * out: (N, D, H, W, OC) +**/ +template +void MaxPoolKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + SparseCooTensor* out, + DenseTensor* rulebook) { + const auto& x_dims = x.dims(); + int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; + const std::vector& real_kernel_sizes = + phi::funcs::sparse::PoolResetKernel(kernel_sizes, x_dims[4], x_dims[4]); + DDim out_dims = {1, 1, 1, 1, 1}; + phi::funcs::sparse::GetOutShape( + x_dims, real_kernel_sizes, paddings, dilations, strides, &out_dims); + const int in_channels = real_kernel_sizes[3]; + + std::vector offsets(kernel_size + 1), counter(kernel_size); + DenseTensorMeta counter_meta( + DataType::INT32, {kernel_size}, DataLayout::NCHW); + DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); + DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); + DenseTensorMeta index_meta(DataType::INT32, {1}, DataLayout::NCHW); + DenseTensor out_index = phi::Empty(dev_ctx, std::move(index_meta)); + DenseTensor unique_key = phi::Empty(dev_ctx, std::move(index_meta)); + DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta)); + + // 1. product rulebook + int rulebook_len = ProductRuleBook(dev_ctx, + x, + real_kernel_sizes, + paddings, + dilations, + strides, + out_dims, + false, + rulebook, + &counter_per_kernel, + &offsets_per_kernel, + &out_index, + &unique_key, + &unique_value, + out, + &counter, + &offsets); + + const int* rulebook_ptr = rulebook->data(); + + T* out_features_ptr = out->mutable_non_zero_elements()->data(); + const T* in_features_ptr = x.non_zero_elements().data(); +// 2. 
max pool +#ifdef PADDLE_WITH_HIP + thrust::fill(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::fill(thrust::cuda::par.on(dev_ctx.stream()), +#endif + out_features_ptr, + out_features_ptr + out->non_zero_elements().numel(), + static_cast(-FLT_MAX)); + // TODO(zhangkaihuo) Replacing multiple calls with one kernel may be faster + for (int i = 0; i < kernel_size; i++) { + if (counter[i] <= 0) { + continue; + } + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, counter[i] * in_channels, 1); + MaxPoolCudaKernel<<>>( + in_features_ptr, + rulebook_ptr + offsets[i] + rulebook_len, + counter[i], + rulebook_len, + in_channels, + out_features_ptr); + } +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sparse_maxpool, + GPU, + ALL_LAYOUT, + phi::sparse::MaxPoolKernel, + float, + double, + phi::dtype::float16) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu index 2e741111fb1489aef5bdc51de637b77eec9d28a7..8048180e425ead98e6db15514caf38c406a2aebf 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu @@ -16,8 +16,10 @@ limitations under the License. */ #include #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/kernels/funcs/sparse/common_shape.h" #include "paddle/phi/kernels/sparse/sparse_utils_kernel.h" namespace phi { @@ -115,14 +117,16 @@ void DenseToSparseCooKernel(const Context& dev_ctx, PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(nums_ptr, 0, sizeof(int), dev_ctx.stream())); #endif - int grid_size = 1, block_size = 1; - GetGpuLaunchConfig1D(dev_ctx, rows, &grid_size, &block_size); + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1); auto temp_indexs_meta = phi::DenseTensorMeta(DataType::INT32, {rows}, phi::DataLayout::NCHW); DenseTensor temp_indexs = phi::Empty(dev_ctx, std::move(temp_indexs_meta)); int* temp_indexs_ptr = temp_indexs.mutable_data(place); - GetNonZeroNums<<>>( + GetNonZeroNums<<>>( x_data, rows, cols, nums_ptr, temp_indexs_ptr); #ifdef PADDLE_WITH_HIP thrust::remove(thrust::hip::par.on(dev_ctx.stream()), @@ -167,7 +171,8 @@ void DenseToSparseCooKernel(const Context& dev_ctx, dev_ctx.Wait(); // wait the copy - const auto values_dims = InferDenseDims(x_dims, sparse_dim, non_zero_num); + const auto values_dims = + phi::funcs::sparse::InferDenseDims(x_dims, sparse_dim, non_zero_num); DenseTensorMeta indices_meta(DataType::INT64, {sparse_dim, static_cast(non_zero_num)}, DataLayout::NCHW); @@ -184,16 +189,18 @@ void DenseToSparseCooKernel(const Context& dev_ctx, T* sparse_data = values.mutable_data(place); // 3. 
calc indices by indexs and get values by indexs - GetGpuLaunchConfig1D(dev_ctx, non_zero_num, &grid_size, &block_size); - GetNonZeroElementsAndIndices<<>>( - x_data, - sparse_dim, - cols, - d_x_dims.data(), - non_zero_num, - temp_indexs_ptr, - indices_data, - sparse_data); + config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); + GetNonZeroElementsAndIndices<<>>(x_data, + sparse_dim, + cols, + d_x_dims.data(), + non_zero_num, + temp_indexs_ptr, + indices_data, + sparse_data); out->SetMember(indices, values, x_dims, true); } @@ -263,10 +270,9 @@ void SparseCsrToCooKernel(const Context& dev_ctx, int* offsets_ptr = batchs == 1 ? nullptr : offsets.mutable_data(place); T* coo_values_data = values.mutable_data(place); - int grid_size = 1, block_size = 1; if (batchs > 1) { - GetGpuLaunchConfig1D(dev_ctx, batchs, &grid_size, &block_size); - GetBatchSizes<<>>( + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, batchs, 1); + GetBatchSizes<<>>( csr_crows_data, rows, batchs, offsets_ptr); #ifdef PADDLE_WITH_HIP @@ -279,9 +285,10 @@ void SparseCsrToCooKernel(const Context& dev_ctx, offsets_ptr); } - GetGpuLaunchConfig1D(dev_ctx, rows, &grid_size, &block_size); - dim3 grids(grid_size, batchs, 1); - ConvertCsrCrowsToCooRows<<>>( + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1); + config.block_per_grid.y = batchs; + ConvertCsrCrowsToCooRows<<>>( csr_crows_data, offsets_ptr, coo_rows_data, batch_ptr, rows); #ifdef PADDLE_WITH_HIP @@ -404,21 +411,29 @@ void SparseCooToCsrKernel(const Context& dev_ctx, // TODO(zhangkahuo): call coalesced() to distinct and sort the indices } - int grid_size = 1, block_size = 1; - GetGpuLaunchConfig1D(dev_ctx, batchs, &grid_size, &block_size); + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, batchs, 1); if (batchs > 1) { DenseTensorMeta batchs_meta(DataType::INT64, {batchs}, DataLayout::NCHW); phi::DenseTensor batchs_offset( phi::make_intrusive(place), std::move(batchs_meta)); int64_t* batchs_offset_ptr = batchs_offset.mutable_data(place); - GetBatchsOffset<<>>( + GetBatchsOffset<<>>( batchs_ptr, non_zero_num, batchs_offset_ptr); - dim3 grids(grid_size, batchs, 1); - ConvertCooRowsToCsrCrows<<>>( + config.block_per_grid.y = batchs; + ConvertCooRowsToCsrCrows<<>>( batchs_offset_ptr, coo_rows_data, csr_crows_data, rows, non_zero_num); } else { - ConvertCooRowsToCsrCrows<<>>( + ConvertCooRowsToCsrCrows<<>>( nullptr, coo_rows_data, csr_crows_data, rows, non_zero_num); } @@ -522,12 +537,13 @@ void SparseCooToDenseKernel(const Context& dev_ctx, PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(out_data, 0, sizeof(T) * out->numel(), dev_ctx.stream())); #endif - int grid_size = 1, block_size = 1; - GetGpuLaunchConfig1D(dev_ctx, non_zero_num, &grid_size, &block_size); + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); - KernelSparseCooToDense< - T, - int64_t><<>>( + KernelSparseCooToDense<<>>( indices.data(), d_sparse_offsets.data(), x_data, diff --git a/paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h b/paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..572ade76281bc0e6af6be48ed8cc1a96751412ed --- /dev/null +++ b/paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/kernels/empty_kernel.h" + +namespace phi { +namespace sparse { + +template +void MaxPoolGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const SparseCooTensor& out, + const DenseTensor& out_grad, + const std::vector& kernel_sizes, + DenseTensor* x_grad); + +template +DenseTensor MaxPoolGrad(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const SparseCooTensor& out, + const DenseTensor& out_grad, + const std::vector& kernel_sizes) { + DenseTensor x_grad = phi::Empty( + dev_ctx, + DenseTensorMeta(x.dtype(), x.non_zero_elements().dims(), x.layout())); + MaxPoolGradKernel( + dev_ctx, x, rulebook, out, out_grad, kernel_sizes, &x_grad); + return x_grad; +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/sparse_pool_kernel.h b/paddle/phi/kernels/sparse/sparse_pool_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..bfadbf72e300fd633e8475475442658a7db20ad9 --- /dev/null +++ b/paddle/phi/kernels/sparse/sparse_pool_kernel.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/kernels/empty_kernel.h" + +namespace phi { +namespace sparse { + +template +void MaxPoolKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + SparseCooTensor* out, + DenseTensor* rulebook); + +template +SparseCooTensor MaxPool(const Context& dev_ctx, + const SparseCooTensor& x, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + DenseTensor* rulebook) { + DenseTensor indices = phi::Empty( + dev_ctx, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW)); + DenseTensor values = + phi::Empty(dev_ctx, DenseTensorMeta(x.dtype(), {1}, x.layout())); + SparseCooTensor coo(indices, values, x.dims()); + MaxPoolKernel( + dev_ctx, x, kernel_sizes, paddings, dilations, strides, &coo, rulebook); + return coo; +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/sparse_utils_kernel.h b/paddle/phi/kernels/sparse/sparse_utils_kernel.h index c83b2130ed4550540a98148aec26e42332c8060d..da05eb3d3cf7682e376efe59aaea8d09d1b6c757 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_utils_kernel.h @@ -23,37 +23,6 @@ limitations under the License. */ namespace phi { namespace sparse { -inline const DDim InferDenseDims(const DDim& x_dims, - const int64_t sparse_dim, - const int64_t non_zero_num) { - auto dense_dim = x_dims.size() - sparse_dim; - DDim values_dims; - if (dense_dim) { - std::vector dense_dim_vec(dense_dim + 1); - dense_dim_vec[0] = non_zero_num; - memcpy(&dense_dim_vec[1], - x_dims.Get() + sparse_dim, - dense_dim * sizeof(x_dims[0])); - values_dims = phi::make_ddim(dense_dim_vec); - } else { - values_dims = phi::make_ddim({non_zero_num}); - } - return values_dims; -} - -template -inline void GetGpuLaunchConfig1D(const Context& dev_ctx, - const int64_t n, - int* grid_size, - int* block_size) { - const int MAX_BLOCK_DIM = dev_ctx.GetMaxThreadsPerBlock(); - const int MAX_GRID_DIM = dev_ctx.GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM; - *block_size = (n >= MAX_BLOCK_DIM) ? MAX_BLOCK_DIM - : (1 << static_cast(std::log2(n))); - *grid_size = n / *block_size; - *grid_size = (*grid_size >= MAX_GRID_DIM) ? MAX_GRID_DIM : *grid_size; -} - template void DenseToSparseCooKernel(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/tril_triu_grad_kernel.h b/paddle/phi/kernels/tril_triu_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..10faf5c48d5bffab9f5199ebeefe7d5a2267ecea --- /dev/null +++ b/paddle/phi/kernels/tril_triu_grad_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void TrilTriuGradKernel(const Context& ctx, + const DenseTensor& out_grad, + int diagonal, + bool lower, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/tril_triu_kernel.h b/paddle/phi/kernels/tril_triu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..4daa84e25c373d6bd5a26f2682385921dc2ce880 --- /dev/null +++ b/paddle/phi/kernels/tril_triu_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void TrilTriuKernel(const Context& ctx, + const DenseTensor& x, + int diagonal, + bool lower, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/activation_sig.cc b/paddle/phi/ops/compat/activation_sig.cc index 396830ca20765bc24d9ddc0e9d09ef045d376dfc..7ae0dc45c5e1be09a31821c171b84fbb47fe1c9e 100644 --- a/paddle/phi/ops/compat/activation_sig.cc +++ b/paddle/phi/ops/compat/activation_sig.cc @@ -16,40 +16,124 @@ limitations under the License. */ namespace phi { -#define DefineActGradDepXOpArgMap(func_name, op_name) \ - KernelSignature func_name##GradOpArgumentMapping( \ - const ArgumentMappingContext& ctx) { \ - return KernelSignature( \ - op_name "_grad", {"X", GradVarName("Out")}, {}, {GradVarName("X")}); \ +#define DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(func_name, op_name, attrs) \ + KernelSignature func_name##GradOpArgumentMapping( \ + const ArgumentMappingContext& ctx) { \ + return KernelSignature(op_name "_grad", \ + {"X", GradVarName("Out")}, \ + {attrs}, \ + {GradVarName("X")}); \ } -#define DefineActGradDepOutOpArgMap(func_name, op_name) \ - KernelSignature func_name##GradOpArgumentMapping( \ - const ArgumentMappingContext& ctx) { \ - return KernelSignature( \ - op_name "_grad", {"Out", GradVarName("Out")}, {}, {GradVarName("X")}); \ +#define DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(func_name, op_name, attrs) \ + KernelSignature func_name##GradOpArgumentMapping( \ + const ArgumentMappingContext& ctx) { \ + return KernelSignature(op_name "_grad", \ + {"Out", GradVarName("Out")}, \ + {attrs}, \ + {GradVarName("X")}); \ } +#define comma , + +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Cos, "cos", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Tan, "tan", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Acos, "acos", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Sin, "sin", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Asin, "asin", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Atan, "atan", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Sinh, "sinh", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Cosh, "cosh", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Asinh, "asinh", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Acosh, "acosh", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Atanh, "atanh", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(BRelu, "brelu", "t_min" comma "t_max"); 
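+// Note: the `comma` macro defined above exists so that a multi-attribute list
+// can be passed to the ARGMAP macros as a single preprocessor argument. As a
+// sketch, the BRelu line above expands roughly to:
+//
+//   KernelSignature BReluGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+//     return KernelSignature("brelu_grad",
+//                            {"X", GradVarName("Out")},
+//                            {"t_min", "t_max"},
+//                            {GradVarName("X")});
+//   }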
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(LeakyRelu, "leaky_relu", "alpha"); +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(ThresholdedRelu, + "thresholded_relu", + "threshold"); +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(SoftShrink, "soft_shrink", "lambda"); +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(HardShrink, "hard_shrink", "threshold"); +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(TanhShrink, "tanh_shrink", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Silu, "silu", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(LogSigmoid, "logsigmoid", ); // NOLINT + +DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu, "relu", ); // NOLINT +DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Tanh, "tanh", ); // NOLINT +DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Sigmoid, "sigmoid", ); // NOLINT +DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(HardSigmoid, + "hard_sigmoid", + "slope" comma "offset"); // NOLINT + KernelSignature ReluDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("relu_double_grad", {"Out", "DDX"}, {}, {"DDOut"}); } -DefineActGradDepXOpArgMap(Cos, "cos"); -DefineActGradDepXOpArgMap(Tan, "tan"); -DefineActGradDepXOpArgMap(Acos, "acos"); -DefineActGradDepXOpArgMap(Sin, "sin"); -DefineActGradDepXOpArgMap(Asin, "asin"); -DefineActGradDepXOpArgMap(Atan, "atan"); -DefineActGradDepXOpArgMap(Sinh, "sinh"); -DefineActGradDepXOpArgMap(Cosh, "cosh"); -DefineActGradDepXOpArgMap(Asinh, "asinh"); -DefineActGradDepXOpArgMap(Acosh, "acosh"); -DefineActGradDepXOpArgMap(Atanh, "atanh"); -DefineActGradDepOutOpArgMap(Relu, "relu"); +KernelSignature TanhDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "tanh_double_grad", {"Out", "DDX", "DOut"}, {}, {"DOutNew", "DDOut"}); +} + +KernelSignature TanhTripleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("tanh_triple_grad", + {"Out", "DDX", "DOut", "D_DDOut", "D_DOut_New"}, + {}, + {"D_OutNew", "D_DOut", "D_DDx"}); +} + +KernelSignature SigmoidDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "sigmoid_double_grad", {"Out", "DDX", "DOut"}, {}, {"DOutNew", "DDOut"}); +} + +KernelSignature SigmoidTripleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("sigmoid_triple_grad", + {"Out", "DDX", "DOut", "D_DDOut", "D_DOut_New"}, + {}, + {"D_OutNew", "D_DOut", "D_DDx"}); +} + +KernelSignature LeakyReluDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "leaky_relu_double_grad", {"X", "DDX"}, {"alpha"}, {"DDOut"}); +} + +KernelSignature LeakyReluOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("leaky_relu", {"X"}, {"alpha"}, {"Out"}); +} + +KernelSignature EluOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("elu", {"X"}, {"alpha"}, {"Out"}); +} + +KernelSignature EluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("elu_grad", + {"X", "Out", GradVarName("Out")}, + {"alpha"}, + {GradVarName("X")}); +} + +KernelSignature EluDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "elu_double_grad", {"X", "DOut", "DDX"}, {"alpha"}, {"DX", "DDOut"}); +} + } // namespace phi PD_REGISTER_BASE_KERNEL_NAME(relu_grad_grad, relu_double_grad); +PD_REGISTER_BASE_KERNEL_NAME(tanh_grad_grad, tanh_double_grad); +PD_REGISTER_BASE_KERNEL_NAME(leaky_relu_grad_grad, leaky_relu_double_grad); +PD_REGISTER_BASE_KERNEL_NAME(softshrink, soft_shrink); +PD_REGISTER_BASE_KERNEL_NAME(softshrink_grad, soft_shrink_grad); 
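+// The PD_REGISTER_BASE_KERNEL_NAME entries in this block tie legacy fluid op
+// names (left) to phi kernel base names (right); e.g. the "softshrink" op is
+// served by the "soft_shrink"/"soft_shrink_grad" signatures returned by the
+// SoftShrink mappings defined above.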
+PD_REGISTER_BASE_KERNEL_NAME(elu_grad_grad, elu_double_grad); +PD_REGISTER_BASE_KERNEL_NAME(sigmoid_grad_grad, sigmoid_double_grad); PD_REGISTER_ARG_MAPPING_FN(cos_grad, phi::CosGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(tan_grad, phi::TanGradOpArgumentMapping); @@ -65,3 +149,35 @@ PD_REGISTER_ARG_MAPPING_FN(atanh_grad, phi::AtanhGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(relu_grad, phi::ReluGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(relu_grad_grad, phi::ReluDoubleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(tanh_grad, phi::TanhGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(tanh_grad_grad, + phi::TanhDoubleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(tanh_triple_grad, + phi::TanhTripleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(brelu_grad, phi::BReluGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(leaky_relu, phi::LeakyReluOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(leaky_relu_grad, + phi::LeakyReluGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(leaky_relu_grad_grad, + phi::LeakyReluDoubleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(thresholded_relu_grad, + phi::ThresholdedReluGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(softshrink_grad, + phi::SoftShrinkGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(hard_shrink_grad, + phi::HardShrinkGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(tanh_shrink_grad, + phi::TanhShrinkGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(elu, phi::EluOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(elu_grad, phi::EluGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(elu_grad_grad, phi::EluDoubleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(silu_grad, phi::SiluGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sigmoid_grad, phi::SigmoidGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sigmoid_grad_grad, + phi::SigmoidDoubleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sigmoid_triple_grad, + phi::SigmoidTripleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(logsigmoid_grad, + phi::LogSigmoidGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(hard_sigmoid_grad, + phi::HardSigmoidGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/allclose_sig.cc b/paddle/phi/ops/compat/allclose_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..e5c4fc027b54225cfdbcc67498eed18789922bd3 --- /dev/null +++ b/paddle/phi/ops/compat/allclose_sig.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
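+// The mapping below covers the two forms in which allclose can carry its
+// tolerances: as the tensor inputs "Rtol"/"Atol" or as the plain attributes
+// "rtol"/"atol". For example, when only Rtol is passed as a tensor, the
+// resulting signature is roughly:
+//   KernelSignature("allclose", {"Input", "Other"}, {"Rtol", "atol", "equal_nan"}, {"Out"})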
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature AllCloseOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.HasInput("Rtol")) { + if (ctx.HasInput("Atol")) { + return KernelSignature("allclose", + {"Input", "Other"}, + {"Rtol", "Atol", "equal_nan"}, + {"Out"}); + } else { + return KernelSignature("allclose", + {"Input", "Other"}, + {"Rtol", "atol", "equal_nan"}, + {"Out"}); + } + } else { + if (ctx.HasInput("Atol")) { + return KernelSignature("allclose", + {"Input", "Other"}, + {"rtol", "Atol", "equal_nan"}, + {"Out"}); + } else { + return KernelSignature("allclose", + {"Input", "Other"}, + {"rtol", "atol", "equal_nan"}, + {"Out"}); + } + } +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(allclose, phi::AllCloseOpArgumentMapping); diff --git a/paddle/phi/ops/compat/assign_sig.cc b/paddle/phi/ops/compat/assign_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..d149e8e6a9aa04d3cc8d02e370e7e07e3cbebeb0 --- /dev/null +++ b/paddle/phi/ops/compat/assign_sig.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature AssignOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.HasInput("X")) { + if (ctx.IsDenseTensorVectorInput("X")) { + return KernelSignature("assign_array", {"X"}, {}, {"Out"}); + } else if (ctx.IsSelectedRowsInput("X")) { + return KernelSignature("assign_sr", {"X"}, {}, {"Out"}); + } else { + return KernelSignature("assign", {"X"}, {}, {"Out"}); + } + } else { + return KernelSignature("assign", {"X"}, {}, {"Out"}); + } +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(assign, phi::AssignOpArgumentMapping); diff --git a/paddle/phi/ops/compat/cumprod_sig.cc b/paddle/phi/ops/compat/cumprod_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..01084e764ed9e41ffb1e67cda26051f5a61fdeeb --- /dev/null +++ b/paddle/phi/ops/compat/cumprod_sig.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature CumprodGradGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("cumprod_grad", + {"X", "Out", GradVarName("Out")}, + {"dim"}, + {GradVarName("X")}); +} + +} // namespace phi +PD_REGISTER_ARG_MAPPING_FN(cumprod_grad, phi::CumprodGradGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/deformable_conv_sig.cc b/paddle/phi/ops/compat/deformable_conv_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..e2a21673634c30988c64e74ffdb1f489a2392f63 --- /dev/null +++ b/paddle/phi/ops/compat/deformable_conv_sig.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature DeformableConvOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("deformable_conv", + {"Input", "Offset", "Filter", "Mask"}, + {"strides", + "paddings", + "dilations", + "deformable_groups", + "groups", + "im2col_step"}, + {"Output"}); +} + +} // namespace phi +PD_REGISTER_ARG_MAPPING_FN(deformable_conv, + phi::DeformableConvOpArgumentMapping); diff --git a/paddle/phi/ops/compat/determinant_sig.cc b/paddle/phi/ops/compat/determinant_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..7bcd30ec5d79b9e137c3dc3fa38f0498e9fe01de --- /dev/null +++ b/paddle/phi/ops/compat/determinant_sig.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature DeterminantGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("determinant_grad", + {"Input", "Out", GradVarName("Out")}, + {}, + {GradVarName("Input")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(determinant_grad, + phi::DeterminantGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/diag_sig.cc b/paddle/phi/ops/compat/diag_sig.cc index 0a14b9095c8343f47e1d6aa039c9aced963984ce..f3245b922c0d913a87b58f813bd0ca142ecb6287 100644 --- a/paddle/phi/ops/compat/diag_sig.cc +++ b/paddle/phi/ops/compat/diag_sig.cc @@ -20,8 +20,15 @@ KernelSignature DiagOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("diag", {"X"}, {"offset", "padding_value"}, {"Out"}); } +KernelSignature DiagGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "diag_grad", {"X", GradVarName("Out")}, {"offset"}, {GradVarName("X")}); +} + } // namespace phi PD_REGISTER_BASE_KERNEL_NAME(diag_v2, diag); +PD_REGISTER_BASE_KERNEL_NAME(diag_v2_grad, diag_grad); PD_REGISTER_ARG_MAPPING_FN(diag_v2, phi::DiagOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(diag_v2_grad, phi::DiagGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/elementwise_sig.cc b/paddle/phi/ops/compat/elementwise_sig.cc index fc890fa3a4923aaf452af20fd586c82d506ea1a7..1d2aaa04f05d205483dbda5c738c7499ad068881 100644 --- a/paddle/phi/ops/compat/elementwise_sig.cc +++ b/paddle/phi/ops/compat/elementwise_sig.cc @@ -114,6 +114,14 @@ KernelSignature ElementwiseDivGradOpArgumentMapping( {GradVarName("X"), GradVarName("Y")}); } +KernelSignature ElementwiseFMinGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("elementwise_fmin_grad", + {"X", "Y", GradVarName("Out")}, + {"axis"}, + {GradVarName("X"), GradVarName("Y")}); +} + KernelSignature ElementwiseDivDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("divide_double_grad", @@ -130,6 +138,14 @@ KernelSignature ElementwiseMulGradOpArgumentMapping( {GradVarName("X"), GradVarName("Y")}); } +KernelSignature ElementwiseFMaxGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("elementwise_fmax_grad", + {"X", "Y", GradVarName("Out")}, + {"axis"}, + {GradVarName("X"), GradVarName("Y")}); +} + KernelSignature ElementwiseMulDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("multiply_double_grad", @@ -192,3 +208,9 @@ PD_REGISTER_ARG_MAPPING_FN(elementwise_mul_grad_grad, phi::ElementwiseMulDoubleGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elementwise_mul_triple_grad, phi::ElementwiseMulTripleGradOpArgumentMapping); + +PD_REGISTER_ARG_MAPPING_FN(elementwise_fmax_grad, + phi::ElementwiseFMaxGradOpArgumentMapping); + +PD_REGISTER_ARG_MAPPING_FN(elementwise_fmin_grad, + phi::ElementwiseFMinGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/gather_sig.cc b/paddle/phi/ops/compat/gather_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..6c47bbe48b8ee18527cfef41fad3488bef6c1dd9 --- /dev/null +++ b/paddle/phi/ops/compat/gather_sig.cc @@ -0,0 +1,44 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature GatherOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.HasInput("Axis")) { + return KernelSignature("gather", {"X", "Index"}, {"Axis"}, {"Out"}); + } else { + return KernelSignature("gather", {"X", "Index"}, {"axis"}, {"Out"}); + } +} + +KernelSignature GatherGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.HasInput("Axis")) { + return KernelSignature("gather_grad", + {"X", "Index", GradVarName("Out")}, + {"Axis", "overwrite"}, + {GradVarName("X")}); + } else { + return KernelSignature("gather_grad", + {"X", "Index", GradVarName("Out")}, + {"axis", "overwrite"}, + {GradVarName("X")}); + } +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(gather, phi::GatherOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(gather_grad, phi::GatherGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/gelu_sig.cc b/paddle/phi/ops/compat/gelu_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..bf4b47bcf5fa9c1fb9d03f6b332c0c867211f5ac --- /dev/null +++ b/paddle/phi/ops/compat/gelu_sig.cc @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature GeluOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("gelu", {"X"}, {"approximate"}, {"Out"}); +} + +KernelSignature GeluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("gelu_grad", + {"X", GradVarName("Out")}, + {"approximate"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(gelu_grad, phi::GeluGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(gelu, phi::GeluOpArgumentMapping); diff --git a/paddle/phi/ops/compat/grid_sampler_sig.cc b/paddle/phi/ops/compat/grid_sampler_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..b76a9770d4dede5ea604f69858201c2fb035070d --- /dev/null +++ b/paddle/phi/ops/compat/grid_sampler_sig.cc @@ -0,0 +1,43 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature GridSamplerOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("grid_sample", + {"X", "Grid"}, + {"mode", "padding_mode", "align_corners"}, + {"Output"}); +} + +KernelSignature GridSamplerGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("grid_sample_grad", + {"X", "Grid", GradVarName("Output")}, + {"mode", "padding_mode", "align_corners"}, + {GradVarName("X"), GradVarName("Grid")}); +} + +} // namespace phi + +// use Python API name as kernel name +PD_REGISTER_BASE_KERNEL_NAME(grid_sampler, grid_sample); +PD_REGISTER_BASE_KERNEL_NAME(grid_sampler_grad, grid_sample_grad); + +PD_REGISTER_ARG_MAPPING_FN(grid_sampler, phi::GridSamplerOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(grid_sampler_grad, + phi::GridSamplerGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc b/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..20183d1a9b06634c38f9aa57a31cd58363e0095b --- /dev/null +++ b/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc @@ -0,0 +1,83 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature HierarchicalSigmoidOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("hierarchical_sigmoid", + {"X", "W", "Label", "PathTable", "PathCode", "Bias"}, + {"num_classes", + "remote_prefetch", + "trainer_id", + "height_sections", + "epmap", + "table_names", + "is_sparse"}, + {"Out", "PreOut", "W_Out"}); +} + +KernelSignature HierarchicalSigmoidGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + if (ctx.IsDenseTensorOutput(GradVarName("W"))) { + return KernelSignature( + "hierarchical_sigmoid_grad", + {"X", + "W", + "Label", + "PreOut", + GradVarName("Out"), + "PathTable", + "PathCode", + "Bias"}, + {"num_classes", + "remote_prefetch", + "trainer_id", + "height_sections", + "epmap", + "table_names", + "is_sparse"}, + {GradVarName("X"), GradVarName("W"), GradVarName("Bias")}); + } else if (ctx.IsSelectedRowsOutput(GradVarName("W"))) { + return KernelSignature( + "hierarchical_sigmoid_grad_sr", + {"X", + "W", + "Label", + "PreOut", + GradVarName("Out"), + "PathTable", + "PathCode", + "Bias"}, + {"num_classes", + "remote_prefetch", + "trainer_id", + "height_sections", + "epmap", + "table_names", + "is_sparse"}, + {GradVarName("X"), GradVarName("W"), GradVarName("Bias")}); + } else { + return KernelSignature("unregistered", {}, {}, {}); + } +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(hierarchical_sigmoid, + phi::HierarchicalSigmoidOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(hierarchical_sigmoid_grad, + phi::HierarchicalSigmoidGradOpArgumentMapping); diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu b/paddle/phi/ops/compat/index_select_sig.cc similarity index 50% rename from paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu rename to paddle/phi/ops/compat/index_select_sig.cc index a578c9f7d81083c533028b9c8912a24006ed0292..53eff1bbcd7ed5269299ccfe41631a699e3d0a32 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu +++ b/paddle/phi/ops/compat/index_select_sig.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,14 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. -// .part used to speed up nvcc compile -#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h" +#include "paddle/phi/core/compat/op_utils.h" -template -using CUDAReduceMeanGradKernel = - ops::ReduceCudaGradKernel; +namespace phi { -REGISTER_OP_CUDA_KERNEL(reduce_mean_grad, CUDAReduceMeanGradKernel, - CUDAReduceMeanGradKernel, - CUDAReduceMeanGradKernel, - CUDAReduceMeanGradKernel); +KernelSignature IndexSelectGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("index_select_grad", + {"X", "Index", GradVarName("Out")}, + {"dim"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(index_select_grad, + phi::IndexSelectGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/isclose_sig.cc b/paddle/phi/ops/compat/isclose_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..08632e990958dd5e19d46c7c5a0ba093c10e65f1 --- /dev/null +++ b/paddle/phi/ops/compat/isclose_sig.cc @@ -0,0 +1,50 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature IscloseOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.HasInput("Rtol")) { + if (ctx.HasInput("Atol")) { + return KernelSignature("isclose", + {"Input", "Other"}, + {"Rtol", "Atol", "equal_nan"}, + {"Out"}); + + } else { + return KernelSignature("isclose", + {"Input", "Other"}, + {"Rtol", "atol", "equal_nan"}, + {"Out"}); + } + } else { + if (ctx.HasInput("Atol")) { + return KernelSignature("isclose", + {"Input", "Other"}, + {"rtol", "Atol", "equal_nan"}, + {"Out"}); + } else { + return KernelSignature("isclose", + {"Input", "Other"}, + {"rtol", "atol", "equal_nan"}, + {"Out"}); + } + } +} + +} // namespace phi +PD_REGISTER_ARG_MAPPING_FN(isclose, phi::IscloseOpArgumentMapping); diff --git a/paddle/phi/ops/compat/kldiv_loss_sig.cc b/paddle/phi/ops/compat/kldiv_loss_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..22d2f074e9f13c7ba65c6bcbb4b5542881d4128c --- /dev/null +++ b/paddle/phi/ops/compat/kldiv_loss_sig.cc @@ -0,0 +1,30 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature KLDivLossGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("kldiv_loss_grad", + {"X", "Target", GradVarName("Loss")}, + {"reduction"}, + {GradVarName("X")}); +} + +} // namespace phi +PD_REGISTER_ARG_MAPPING_FN(kldiv_loss_grad, + phi::KLDivLossGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/kron_sig.cc b/paddle/phi/ops/compat/kron_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..06b6545f58e7c12964f82fd8b6199270c519c16a --- /dev/null +++ b/paddle/phi/ops/compat/kron_sig.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature KronGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("kron_grad", + {"X", "Y", GradVarName("Out")}, + {}, + {GradVarName("X"), GradVarName("Y")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(kron_grad, phi::KronGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/kthvalue_sig.cc b/paddle/phi/ops/compat/kthvalue_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..e59e9de1e43822ed8d50b8c1d1888e0d1d14540f --- /dev/null +++ b/paddle/phi/ops/compat/kthvalue_sig.cc @@ -0,0 +1,29 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature KthvalueGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("kthvalue_grad", + {GradVarName("Out"), "X", "Indices"}, + {"k", "axis", "keepdim"}, + {GradVarName("X")}); +} + +} // namespace phi +PD_REGISTER_ARG_MAPPING_FN(kthvalue_grad, phi::KthvalueGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/layer_norm_sig.cc b/paddle/phi/ops/compat/layer_norm_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..17a81e9ec012f2c116762ff2d653bb96f0e1c4f4 --- /dev/null +++ b/paddle/phi/ops/compat/layer_norm_sig.cc @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature LayerNormOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("layer_norm", + {"X", "Scale", "Bias"}, + {"epsilon", "begin_norm_axis", "is_test"}, + {"Y", "Mean", "Variance"}); +} + +KernelSignature LayerNormGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "layer_norm_grad", + {"X", "Mean", "Variance", "Scale", "Bias", GradVarName("Y")}, + {"epsilon", "begin_norm_axis", "is_test"}, + {GradVarName("X"), GradVarName("Scale"), GradVarName("Bias")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(layer_norm, phi::LayerNormOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(layer_norm_grad, + phi::LayerNormGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/lgamma_sig.cc b/paddle/phi/ops/compat/lgamma_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..968ad4923ba7b4410f0643335c275059e6ea7bea --- /dev/null +++ b/paddle/phi/ops/compat/lgamma_sig.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature LgammaGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "lgamma_grad", {GradVarName("Out"), "X"}, {}, {GradVarName("X")}); +} + +} // namespace phi +PD_REGISTER_ARG_MAPPING_FN(lgamma_grad, phi::LgammaGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/log_softmax_sig.cc b/paddle/phi/ops/compat/log_softmax_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..b1ecc6d56768f069c208a0230722929200f1dfe0 --- /dev/null +++ b/paddle/phi/ops/compat/log_softmax_sig.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature LogSoftmaxGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("log_softmax_grad", + {"Out", GradVarName("Out")}, + {"axis"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(log_softmax_grad, + phi::LogSoftmaxGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/matrix_rank_sig.cc b/paddle/phi/ops/compat/matrix_rank_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..40dc29579b40194f57911df0bfb426de4369d9b3 --- /dev/null +++ b/paddle/phi/ops/compat/matrix_rank_sig.cc @@ -0,0 +1,38 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +// we have to return every specific KernelSignature for infrt now +KernelSignature MatrixRankOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.HasInput("TolTensor")) { + return KernelSignature("matrix_rank_tol", + {"X", "TolTensor"}, + {"use_default_tol", "hermitian"}, + {"Out"}); + } else { + return KernelSignature("matrix_rank", + {"X"}, + { + "tol", "use_default_tol", "hermitian", + }, + {"Out"}); + } +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(matrix_rank, phi::MatrixRankOpArgumentMapping); diff --git a/paddle/phi/ops/compat/mode_sig.cc b/paddle/phi/ops/compat/mode_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..20994c08aa73c33328568e334d258c44eef68171 --- /dev/null +++ b/paddle/phi/ops/compat/mode_sig.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature ModeOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "mode", {"X"}, {"axis", "keepdim"}, {"Out", "Indices"}); +} + +KernelSignature ModeGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("mode_grad", + {"X", "Indices", GradVarName("Out")}, + {"axis", "keepdim"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(mode, phi::ModeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(mode_grad, phi::ModeGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/multiplex_sig.cc b/paddle/phi/ops/compat/multiplex_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..9dab4655d172312a7389d0bb243e31ee39ef5981 --- /dev/null +++ b/paddle/phi/ops/compat/multiplex_sig.cc @@ -0,0 +1,32 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature MultiplexOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("multiplex", {"X", "Ids"}, {}, {"Out"}); +} + +KernelSignature MultiplexGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "multiplex_grad", {"Ids", GradVarName("Out")}, {}, {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(multiplex, phi::MultiplexOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(multiplex_grad, phi::MultiplexGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/one_hot_sig.cc b/paddle/phi/ops/compat/one_hot_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..655969093c889aa32ae780f1de3c9c7c81a78eb1 --- /dev/null +++ b/paddle/phi/ops/compat/one_hot_sig.cc @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature OneHotOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.HasInput("depth_tensor")) { + return KernelSignature("one_hot_raw", + {"X"}, + {"depth_tensor", "dtype", "allow_out_of_range"}, + {"Out"}); + } else { + return KernelSignature("one_hot_raw", + {"X"}, + {"depth", "dtype", "allow_out_of_range"}, + {"Out"}); + } +} + +} // namespace phi + +PD_REGISTER_BASE_KERNEL_NAME(one_hot_v2, one_hot); + +PD_REGISTER_ARG_MAPPING_FN(one_hot_v2, phi::OneHotOpArgumentMapping); diff --git a/paddle/phi/ops/compat/pool_sig.cc b/paddle/phi/ops/compat/pool_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..390d3db5e785ba7642213e9b7a8db2b718ff19f0 --- /dev/null +++ b/paddle/phi/ops/compat/pool_sig.cc @@ -0,0 +1,154 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature Pool2dOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("pool2d", + {"X"}, + {"ksize", + "strides", + "paddings", + "ceil_mode", + "exclusive", + "data_format", + "pooling_type", + "global_pooling", + "adaptive", + "padding_algorithm"}, + {"Out"}); +} + +KernelSignature Pool2dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("pool2d_grad", + {"X", "Out", GradVarName("Out")}, + {"ksize", + "strides", + "paddings", + "ceil_mode", + "exclusive", + "data_format", + "pooling_type", + "global_pooling", + "adaptive", + "padding_algorithm"}, + {GradVarName("X")}); +} + +KernelSignature Pool2dDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("pool2d_double_grad", + {"X"}, + {"ksize", + "strides", + "paddings", + "ceil_mode", + "exclusive", + "data_format", + "pooling_type", + "global_pooling", + "adaptive", + "padding_algorithm"}, + {"Out"}); +} + +KernelSignature MaxPool2dWithIndexOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "max_pool2d_with_index", + {"X"}, + {"ksize", "strides", "paddings", "global_pooling", "adaptive"}, + {"Out", "Mask"}); +} + +KernelSignature MaxPool2dWithIndexGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "max_pool2d_with_index_grad", + {"X", "Mask", GradVarName("Out")}, + {"ksize", "strides", "paddings", "global_pooling", "adaptive"}, + {GradVarName("X")}); +} + +KernelSignature Pool3dOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("pool3d", + {"X"}, + {"ksize", + "strides", + "paddings", + "ceil_mode", + "exclusive", + "data_format", + "pooling_type", + "global_pooling", + "adaptive", + "padding_algorithm"}, + {"Out"}); +} + +KernelSignature Pool3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("pool3d_grad", + {"X", "Out", GradVarName("Out")}, + {"ksize", + "strides", + "paddings", + "ceil_mode", 
+ "exclusive", + "data_format", + "pooling_type", + "global_pooling", + "adaptive", + "padding_algorithm"}, + {GradVarName("X")}); +} + +KernelSignature MaxPool3dWithIndexOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "max_pool3d_with_index", + {"X"}, + {"ksize", "strides", "paddings", "global_pooling", "adaptive"}, + {"Out", "Mask"}); +} + +KernelSignature MaxPool3dWithIndexGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "max_pool3d_with_index_grad", + {"X", "Mask", GradVarName("Out")}, + {"ksize", "strides", "paddings", "global_pooling", "adaptive"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(pool2d, phi::Pool2dOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(pool2d_grad, phi::Pool2dGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(pool2d_double_grad, + phi::Pool2dDoubleGradOpArgumentMapping); + +PD_REGISTER_ARG_MAPPING_FN(max_pool2d_with_index, + phi::MaxPool2dWithIndexOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(max_pool2d_with_index_grad, + phi::MaxPool2dWithIndexGradOpArgumentMapping); + +PD_REGISTER_ARG_MAPPING_FN(pool3d, phi::Pool3dOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(pool3d_grad, phi::Pool3dGradOpArgumentMapping); + +PD_REGISTER_ARG_MAPPING_FN(max_pool3d_with_index, + phi::MaxPool3dWithIndexOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(max_pool3d_with_index_grad, + phi::MaxPool3dWithIndexGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/prelu_sig.cc b/paddle/phi/ops/compat/prelu_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..bd296c5e95318332523a3cf07e85f1afd6f8a95c --- /dev/null +++ b/paddle/phi/ops/compat/prelu_sig.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature PReluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("prelu_grad", + {"X", "Alpha", GradVarName("Out")}, + {"mode", "data_format"}, + {GradVarName("X"), GradVarName("Alpha")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(prelu_grad, phi::PReluGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/psroi_pool_sig.cc b/paddle/phi/ops/compat/psroi_pool_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..4d694d9a7759d9e3cdf0c385164a367260f2a020 --- /dev/null +++ b/paddle/phi/ops/compat/psroi_pool_sig.cc @@ -0,0 +1,40 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature PsroiPoolOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "psroi_pool", + {"X", "ROIs", "RoisNum"}, + {"pooled_height", "pooled_width", "output_channels", "spatial_scale"}, + {"Out"}); +} + +KernelSignature PsroiPoolGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "psroi_pool_grad", + {"X", "ROIs", "RoisNum", GradVarName("Out")}, + {"pooled_height", "pooled_width", "output_channels", "spatial_scale"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(psroi_pool, phi::PsroiPoolOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(psroi_pool_grad, + phi::PsroiPoolGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/qr_sig.cc b/paddle/phi/ops/compat/qr_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..dd424d590ee113adfab0e9643c3c7ffc519f86e4 --- /dev/null +++ b/paddle/phi/ops/compat/qr_sig.cc @@ -0,0 +1,25 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature QrOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("qr", {"X"}, {"mode"}, {"Q", "R"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(qr, phi::QrOpArgumentMapping); diff --git a/paddle/phi/ops/compat/reduce_sig.cc b/paddle/phi/ops/compat/reduce_sig.cc index 997f1505bd08d991aa3f13f1ad831c0107664b2f..4bca0523801c1a94f90197c93cc495c2c4f56eeb 100644 --- a/paddle/phi/ops/compat/reduce_sig.cc +++ b/paddle/phi/ops/compat/reduce_sig.cc @@ -41,8 +41,7 @@ KernelSignature ReduceMeanOpArgumentMapping(const ArgumentMappingContext& ctx) { // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in // InferShape, so we must return the "mean_raw" KernelSignature. // And the InferMeta function(i.e. 
ReduceInferMetaBase) is accordance with - // the - // "mean_raw" KernelSignature + // the "mean_raw" KernelSignature if (ctx.IsForInferShape() || reduce_all) { return KernelSignature( "mean_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); @@ -53,8 +52,19 @@ KernelSignature ReduceMeanOpArgumentMapping(const ArgumentMappingContext& ctx) { } KernelSignature ReduceProdOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature( - "reduce_prod", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); + if (ctx.IsDenseTensorInput("X")) { + bool reduce_all = paddle::any_cast<bool>(ctx.Attr("reduce_all")); + // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in + // InferShape, so we must return the "prod_raw" KernelSignature. + // And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with + // the "prod_raw" KernelSignature + if (ctx.IsForInferShape() || reduce_all) { + return KernelSignature( + "prod_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); + } + return KernelSignature("prod", {"X"}, {"dim", "keep_dim"}, {"Out"}); + } + return KernelSignature("unregistered", {}, {}, {}); } KernelSignature ReduceMaxOpArgumentMapping(const ArgumentMappingContext& ctx) { @@ -63,8 +73,7 @@ KernelSignature ReduceMaxOpArgumentMapping(const ArgumentMappingContext& ctx) { // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in // InferShape, so we must return the "max_raw" KernelSignature. // And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with - // the - // "max_raw" KernelSignature + // the "max_raw" KernelSignature if (ctx.IsForInferShape() || reduce_all) { return KernelSignature( "max_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); @@ -74,6 +83,50 @@ KernelSignature ReduceMaxOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("unregistered", {}, {}, {}); } +KernelSignature ReduceMinOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.IsDenseTensorInput("X")) { + bool reduce_all = paddle::any_cast<bool>(ctx.Attr("reduce_all")); + // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in + // InferShape, so we must return the "min_raw" KernelSignature. + // And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with + // the "min_raw" KernelSignature + if (ctx.IsForInferShape() || reduce_all) { + return KernelSignature( + "min_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); + } + return KernelSignature("min", {"X"}, {"dim", "keep_dim"}, {"Out"}); + } + return KernelSignature("unregistered", {}, {}, {}); +} + +KernelSignature ReduceAnyOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.IsDenseTensorInput("X")) { + bool reduce_all = paddle::any_cast<bool>(ctx.Attr("reduce_all")); + // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in + // InferShape, so we must return the "any_raw" KernelSignature. + // And the InferMeta function(i.e.
ReduceInferMetaBase) is accordance with + // the "any_raw" KernelSignature + if (ctx.IsForInferShape() || reduce_all) { + return KernelSignature( + "any_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); + } + return KernelSignature("any", {"X"}, {"dim", "keep_dim"}, {"Out"}); + } + return KernelSignature("unregistered", {}, {}, {}); +} + +KernelSignature ReduceAllOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.IsDenseTensorInput("X")) { + bool reduce_all = paddle::any_cast<bool>(ctx.Attr("reduce_all")); + if (ctx.IsForInferShape() || reduce_all) { + return KernelSignature( + "all_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); + } + return KernelSignature("all", {"X"}, {"dim", "keep_dim"}, {"Out"}); + } + return KernelSignature("unregistered", {}, {}, {}); +} + KernelSignature ReduceSumGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( @@ -83,16 +136,73 @@ KernelSignature ReduceSumGradOpArgumentMapping( {GradVarName("X")}); } +KernelSignature ReduceMeanGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "mean_grad", + {"X", GradVarName("Out")}, + {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"}, + {GradVarName("X")}); +} + +KernelSignature ReduceMaxGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "max_grad", + {"X", GradVarName("Out"), "Out"}, + {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"}, + {GradVarName("X")}); +} + +KernelSignature ReduceMinGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "min_grad", + {"X", GradVarName("Out"), "Out"}, + {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"}, + {GradVarName("X")}); +} + +KernelSignature ReduceProdGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "prod_grad", + {"X", GradVarName("Out"), "Out"}, + {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"}, + {GradVarName("X")}); +} + } // namespace phi PD_REGISTER_BASE_KERNEL_NAME(reduce_sum, sum); PD_REGISTER_BASE_KERNEL_NAME(reduce_mean, mean); PD_REGISTER_BASE_KERNEL_NAME(reduce_max, max); +PD_REGISTER_BASE_KERNEL_NAME(reduce_min, min); +PD_REGISTER_BASE_KERNEL_NAME(reduce_prod, prod); +PD_REGISTER_BASE_KERNEL_NAME(reduce_all, all); +PD_REGISTER_BASE_KERNEL_NAME(reduce_any, any); + PD_REGISTER_BASE_KERNEL_NAME(reduce_sum_grad, sum_grad); +PD_REGISTER_BASE_KERNEL_NAME(reduce_mean_grad, mean_grad); +PD_REGISTER_BASE_KERNEL_NAME(reduce_prod_grad, prod_grad); +PD_REGISTER_BASE_KERNEL_NAME(reduce_max_grad, max_grad); +PD_REGISTER_BASE_KERNEL_NAME(reduce_min_grad, min_grad); PD_REGISTER_ARG_MAPPING_FN(reduce_sum, phi::ReduceSumOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(reduce_mean, phi::ReduceMeanOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(reduce_prod, phi::ReduceProdOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(reduce_max, phi::ReduceMaxOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_min, phi::ReduceMinOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_all, phi::ReduceAllOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_any, phi::ReduceAnyOpArgumentMapping); + PD_REGISTER_ARG_MAPPING_FN(reduce_sum_grad, phi::ReduceSumGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_mean_grad, + phi::ReduceMeanGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_prod_grad, + phi::ReduceProdGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_max_grad, + phi::ReduceMaxGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(reduce_min_grad, + phi::ReduceMinGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/roi_align_sig.cc b/paddle/phi/ops/compat/roi_align_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..1717ec8f788091fc5eae59c40a32a30c355760e8 --- /dev/null +++ b/paddle/phi/ops/compat/roi_align_sig.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature RoiAlignOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("roi_align", + {"X", "ROIs", "RoisNum"}, + {"pooled_height", + "pooled_width", + "spatial_scale", + "sampling_ratio", + "aligned"}, + {"Out"}); +} + +KernelSignature RoiAlignGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("roi_align_grad", + {"X", "ROIs", "RoisNum", GradVarName("Out")}, + {"pooled_height", + "pooled_width", + "spatial_scale", + "sampling_ratio", + "aligned"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(roi_align, phi::RoiAlignOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(roi_align_grad, phi::RoiAlignGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/roi_pool_sig.cc b/paddle/phi/ops/compat/roi_pool_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..d04c645f183c6e1ac91e4bf6003427008a24fe42 --- /dev/null +++ b/paddle/phi/ops/compat/roi_pool_sig.cc @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature RoiPoolOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("roi_pool", + {"X", "ROIs", "RoisNum"}, + {"pooled_height", "pooled_width", "spatial_scale"}, + {"Out", "Argmax"}); +} + +KernelSignature RoiPoolOpGradArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("roi_pool_grad", + {"X", "ROIs", "RoisNum", "Argmax", GradVarName("Out")}, + {"pooled_height", "pooled_width", "spatial_scale"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(roi_pool, phi::RoiPoolOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(roi_pool_grad, phi::RoiPoolOpGradArgumentMapping); diff --git a/paddle/phi/ops/compat/roll_sig.cc b/paddle/phi/ops/compat/roll_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..a144f0e8e8a90eee0bf0a8a80455b1e19611880c --- /dev/null +++ b/paddle/phi/ops/compat/roll_sig.cc @@ -0,0 +1,36 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature RollOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.HasInput("ShiftsTensor")) { + return KernelSignature("roll", {"X"}, {"ShiftsTensor", "axis"}, {"Out"}); + } + return KernelSignature("roll", {"X"}, {"shifts", "axis"}, {"Out"}); +} + +KernelSignature RollGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("roll_grad", + {"X", GradVarName("Out")}, + {"shifts", "axis"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(roll, phi::RollOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(roll_grad, phi::RollGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/set_value_sig.cc b/paddle/phi/ops/compat/set_value_sig.cc index eacfff26d53cf1ea73c33e4c603253c58be60222..5feff54b028ba437125d65e4a6709254704164d8 100644 --- a/paddle/phi/ops/compat/set_value_sig.cc +++ b/paddle/phi/ops/compat/set_value_sig.cc @@ -19,9 +19,9 @@ namespace phi { KernelSignature SetValueOpArgumentMapping(const ArgumentMappingContext& ctx) { if (ctx.IsDenseTensorInput("Input")) { - if (ctx.HasInput("StartsTensorList")) { - if (ctx.HasInput("EndsTensorList")) { - if (ctx.HasInput("StepsTensorList")) { + if (ctx.InputSize("StartsTensorList") > 0) { + if (ctx.InputSize("EndsTensorList") > 0) { + if (ctx.InputSize("StepsTensorList") > 0) { if (ctx.HasInput("ValueTensor")) { return KernelSignature("set_value_with_tensor", {"Input", "ValueTensor"}, @@ -197,7 +197,7 @@ KernelSignature SetValueOpArgumentMapping(const ArgumentMappingContext& ctx) { } } } else { - if (ctx.HasInput("StepsTensorList")) { + if (ctx.InputSize("StepsTensorList") > 0) { if (ctx.HasInput("ValueTensor")) { return KernelSignature("set_value_with_tensor", {"Input", "ValueTensor"}, @@ -374,8 +374,8 @@ KernelSignature SetValueOpArgumentMapping(const ArgumentMappingContext& ctx) { } } } else { - if 
(ctx.HasInput("EndsTensorList")) { - if (ctx.HasInput("StepsTensorList")) { + if (ctx.InputSize("EndsTensorList") > 0) { + if (ctx.InputSize("StepsTensorList") > 0) { if (ctx.HasInput("ValueTensor")) { return KernelSignature("set_value_with_tensor", {"Input", "ValueTensor"}, @@ -551,7 +551,7 @@ KernelSignature SetValueOpArgumentMapping(const ArgumentMappingContext& ctx) { } } } else { - if (ctx.HasInput("StepsTensorList")) { + if (ctx.InputSize("StepsTensorList") > 0) { if (ctx.HasInput("ValueTensor")) { return KernelSignature("set_value_with_tensor", {"Input", "ValueTensor"}, @@ -731,6 +731,108 @@ KernelSignature SetValueOpArgumentMapping(const ArgumentMappingContext& ctx) { } return KernelSignature("unregistered", {}, {}, {}); } + +KernelSignature SetValueGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + if (ctx.InputSize("StartsTensorList") > 0) { + if (ctx.InputSize("EndsTensorList") > 0) { + if (ctx.InputSize("StepsTensorList") > 0) { + return KernelSignature( + "set_value_grad", + {GradVarName("Out")}, + {"StartsTensorList", + "EndsTensorList", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes"}, + {GradVarName("Input"), GradVarName("ValueTensor")}); + } else { + return KernelSignature( + "set_value_grad", + {GradVarName("Out")}, + {"StartsTensorList", + "EndsTensorList", + "steps", + "axes", + "decrease_axes", + "none_axes"}, + {GradVarName("Input"), GradVarName("ValueTensor")}); + } + } else { + if (ctx.InputSize("StepsTensorList") > 0) { + return KernelSignature( + "set_value_grad", + {GradVarName("Out")}, + {"StartsTensorList", + "ends", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes"}, + {GradVarName("Input"), GradVarName("ValueTensor")}); + } else { + return KernelSignature( + "set_value_grad", + {GradVarName("Out")}, + {"StartsTensorList", + "ends", + "steps", + "axes", + "decrease_axes", + "none_axes"}, + {GradVarName("Input"), GradVarName("ValueTensor")}); + } + } + } else { + if (ctx.InputSize("EndsTensorList") > 0) { + if (ctx.InputSize("StepsTensorList") > 0) { + return KernelSignature( + "set_value_grad", + {GradVarName("Out")}, + {"starts", + "EndsTensorList", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes"}, + {GradVarName("Input"), GradVarName("ValueTensor")}); + } else { + return KernelSignature( + "set_value_grad", + {GradVarName("Out")}, + {"starts", + "EndsTensorList", + "steps", + "axes", + "decrease_axes", + "none_axes"}, + {GradVarName("Input"), GradVarName("ValueTensor")}); + } + } else { + if (ctx.InputSize("StepsTensorList") > 0) { + return KernelSignature( + "set_value_grad", + {GradVarName("Out")}, + {"starts", + "ends", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes"}, + {GradVarName("Input"), GradVarName("ValueTensor")}); + } else { + return KernelSignature( + "set_value_grad", + {GradVarName("Out")}, + {"starts", "ends", "steps", "axes", "decrease_axes", "none_axes"}, + {GradVarName("Input"), GradVarName("ValueTensor")}); + } + } + } +} + } // namespace phi PD_REGISTER_ARG_MAPPING_FN(set_value, phi::SetValueOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(set_value_grad, phi::SetValueGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/tile_sig.cc b/paddle/phi/ops/compat/tile_sig.cc index 49a6d02225d931f1dc2d3324cb13c2c620f5dfe6..ca3fa5fe1f86ac13252c04c05c0508c47feded42 100644 --- a/paddle/phi/ops/compat/tile_sig.cc +++ b/paddle/phi/ops/compat/tile_sig.cc @@ -20,6 +20,11 @@ KernelSignature TileOpArgumentMapping(const ArgumentMappingContext& ctx) { if 
(ctx.HasInput("RepeatTimes")) { return KernelSignature("tile", {"X"}, {"RepeatTimes"}, {"Out"}); } else if (ctx.InputSize("repeat_times_tensor") > 0) { + const auto& repeat_times = + paddle::any_cast>(ctx.Attr("repeat_times")); + if (!ctx.IsRuntime() && !repeat_times.empty()) { + return KernelSignature("tile", {"X"}, {"repeat_times"}, {"Out"}); + } return KernelSignature("tile", {"X"}, {"repeat_times_tensor"}, {"Out"}); } else { return KernelSignature("tile", {"X"}, {"repeat_times"}, {"Out"}); diff --git a/paddle/phi/ops/compat/tril_triu_sig.cc b/paddle/phi/ops/compat/tril_triu_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..4f79f8650decfc6556287be2caefa6d1074ecf7f --- /dev/null +++ b/paddle/phi/ops/compat/tril_triu_sig.cc @@ -0,0 +1,34 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature TrilTriuOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("tril_triu", {"X"}, {"diagonal", "lower"}, {"Out"}); +} + +KernelSignature TrilTriuGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("tril_triu_grad", + {GradVarName("Out")}, + {"diagonal", "lower"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(tril_triu, phi::TrilTriuOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(tril_triu_grad, phi::TrilTriuGradOpArgumentMapping); diff --git a/paddle/phi/tests/api/test_sparse_conv_api.cc b/paddle/phi/tests/api/test_sparse_conv_api.cc index 76cb01d8a8b98b070d89ba4a3887275f00e228f3..7c4aa164259071667e3d90994759c05454f407ff 100644 --- a/paddle/phi/tests/api/test_sparse_conv_api.cc +++ b/paddle/phi/tests/api/test_sparse_conv_api.cc @@ -78,7 +78,7 @@ void TestConv3dBase(const std::vector& indices, if (!std::is_same::value) { auto outs = paddle::experimental::sparse::conv3d( - x, weight, paddings, dilations, strides, 1); + x, weight, paddings, dilations, strides, 1, false); auto out = std::dynamic_pointer_cast( std::get<0>(outs).impl()); diff --git a/paddle/phi/tests/api/test_sparse_utils_api.cc b/paddle/phi/tests/api/test_sparse_utils_api.cc index 8595782be35ab677ef40eb31b8a09237e90f359a..da66334ced78ac92b85bc2effc749888ec3da4ad 100644 --- a/paddle/phi/tests/api/test_sparse_utils_api.cc +++ b/paddle/phi/tests/api/test_sparse_utils_api.cc @@ -53,8 +53,7 @@ TEST(API, to_sparse_coo) { // 1. 
test dense_to_sparse_coo paddle::experimental::Tensor x(dense_x); - auto out = paddle::experimental::sparse::to_sparse_coo( - x, phi::Backend::CPU, sparse_dim); + auto out = paddle::experimental::sparse::to_sparse_coo(x, sparse_dim); auto coo = std::dynamic_pointer_cast(out.impl()); ASSERT_EQ(coo->nnz(), non_zero_num); int cmp_indices = memcmp(coo->non_zero_indices().data(), @@ -91,8 +90,7 @@ TEST(API, to_sparse_coo) { auto csr = std::make_shared(crows, cols, values, dense_dims); paddle::experimental::Tensor csr_x(csr); - auto out2 = paddle::experimental::sparse::to_sparse_coo( - csr_x, phi::Backend::CPU, sparse_dim); + auto out2 = paddle::experimental::sparse::to_sparse_coo(csr_x, sparse_dim); auto coo2 = std::dynamic_pointer_cast(out.impl()); ASSERT_EQ(coo2->nnz(), non_zero_num); @@ -132,7 +130,7 @@ TEST(API, to_sparse_csr) { // 1. test dense_to_sparse_csr paddle::experimental::Tensor x(dense_x); - auto out = paddle::experimental::sparse::to_sparse_csr(x, phi::Backend::CPU); + auto out = paddle::experimental::sparse::to_sparse_csr(x); auto csr = std::dynamic_pointer_cast(out.impl()); auto check = [&](const phi::SparseCsrTensor& csr) { ASSERT_EQ(csr.non_zero_cols().numel(), non_zero_num); @@ -170,8 +168,7 @@ TEST(API, to_sparse_csr) { auto coo = std::make_shared(indices, values, dense_dims); paddle::experimental::Tensor coo_x(coo); - auto out2 = - paddle::experimental::sparse::to_sparse_csr(coo_x, phi::Backend::CPU); + auto out2 = paddle::experimental::sparse::to_sparse_csr(coo_x); auto csr2 = std::dynamic_pointer_cast(out.impl()); check(*csr2); @@ -212,7 +209,7 @@ TEST(API, to_dense) { std::make_shared(indices, values, dense_dims); paddle::experimental::Tensor coo_x(coo); - auto out = paddle::experimental::sparse::to_dense(coo_x, phi::Backend::CPU); + auto out = paddle::experimental::sparse::to_dense(coo_x); auto dense_out = std::dynamic_pointer_cast(out.impl()); int cmp1 = memcmp(dense_out->data(), &dense_data[0][0], 9 * sizeof(float)); @@ -237,7 +234,7 @@ TEST(API, to_dense) { auto csr = std::make_shared(crows, cols, values, dense_dims); paddle::experimental::Tensor csr_x(csr); - auto out2 = paddle::experimental::sparse::to_dense(csr_x, phi::Backend::CPU); + auto out2 = paddle::experimental::sparse::to_dense(csr_x); auto dense_out2 = std::dynamic_pointer_cast(out.impl()); int cmp2 = diff --git a/paddle/phi/tests/core/test_meta_fn_utils.cc b/paddle/phi/tests/core/test_meta_fn_utils.cc index f4288c2aa2f9418eeff489aa53fe685aa4a155ec..399112d09c2ad55364b5035e7b759b53d0abaea8 100644 --- a/paddle/phi/tests/core/test_meta_fn_utils.cc +++ b/paddle/phi/tests/core/test_meta_fn_utils.cc @@ -52,7 +52,7 @@ TEST(MetaFnFactory, InferMetaFnExists) { phi::InferMetaContext ctx; ctx.EmplaceBackInput(shared_meat_x); ctx.EmplaceBackOutput(shared_meta_out); - ctx.SetMetaConfig(/*is_runtime=*/true); + ctx.SetMetaConfig({/*is_runtime =*/true, /*is_run_mkldnn_kernel=*/false}); phi::MetaFnFactory::Instance().Get("sign")(&ctx); EXPECT_EQ(dense_out1.dims().size(), dense_out2.dims().size()); @@ -78,7 +78,7 @@ TEST(MetaFnFactory, CopyInferMetaFn) { ctx.EmplaceBackAttr(Backend::CPU); ctx.EmplaceBackAttr(false); ctx.EmplaceBackOutput(shared_meta_out); - ctx.SetMetaConfig(/*is_runtime=*/true); + ctx.SetMetaConfig({/*is_runtime =*/true, /*is_run_mkldnn_kernel=*/false}); phi::MetaFnFactory::Instance().Get("copy_to")(&ctx); EXPECT_EQ(dense_out1.dims().size(), dense_out2.dims().size()); @@ -105,7 +105,7 @@ TEST(MetaFnFactory, SplitInferMetaFn) { ctx.EmplaceBackAttr(num_or_sections); ctx.EmplaceBackAttr(axis); 
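// A short sketch of the updated SetMetaConfig call shape, assuming
// phi::MetaConfig aggregates the two flags spelled out in the inline comments
// above (is_runtime first, is_run_mkldnn_kernel second); the header name below
// is likewise an assumption, not taken from the patch.
#include "paddle/phi/core/infermeta_utils.h"

static void ConfigureInferMetaForRuntime(phi::InferMetaContext* ctx) {
  // Replaces the old single-flag SetMetaConfig(/*is_runtime=*/true).
  ctx->SetMetaConfig({/*is_runtime=*/true, /*is_run_mkldnn_kernel=*/false});
}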
ctx.EmplaceBackOutputs(out); - ctx.SetMetaConfig(/*is_runtime=*/true); + ctx.SetMetaConfig({/*is_runtime =*/true, /*is_run_mkldnn_kernel=*/false}); phi::MetaFnFactory::Instance().Get("split")(&ctx); ASSERT_EQ(dense_out1.dims().size(), 2); diff --git a/paddle/phi/tests/kernels/CMakeLists.txt b/paddle/phi/tests/kernels/CMakeLists.txt index 317dcce92c8edd1bb76b080cdb578d37eb8b1f58..3897c182e481ce3ae81c406c35e138adf2f7071f 100644 --- a/paddle/phi/tests/kernels/CMakeLists.txt +++ b/paddle/phi/tests/kernels/CMakeLists.txt @@ -14,6 +14,7 @@ cc_test(test_concat_dev_api SRCS test_concat_dev_api.cc DEPS phi phi_api_utils) cc_test(test_split_dev_api SRCS test_split_dev_api.cc DEPS phi phi_api_utils) cc_test(test_sparse_utils_dev_api SRCS test_sparse_utils_dev_api.cc DEPS phi phi_api_utils) cc_test(test_sparse_conv3d_dev_api SRCS test_sparse_conv3d_dev_api.cc DEPS phi phi_api_utils) +cc_test(test_sparse_pool_dev_api SRCS test_sparse_pool_dev_api.cc DEPS phi phi_api_utils) cc_test(test_math_function SRCS test_math_function.cc DEPS math_function) if(WITH_GPU) diff --git a/paddle/phi/tests/kernels/test_copy_dev_api.cc b/paddle/phi/tests/kernels/test_copy_dev_api.cc index d69c7b2174f726d5757ea707678ddb383cf19d68..460d85f83133f9ecef83daa4e6a446e53485cd0e 100644 --- a/paddle/phi/tests/kernels/test_copy_dev_api.cc +++ b/paddle/phi/tests/kernels/test_copy_dev_api.cc @@ -61,6 +61,10 @@ TEST(DEV_API, copy) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); + dev_ctx.SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); dev_ctx.Init(); phi::Copy( dev_ctx, *(dense_src.get()), phi::CPUPlace(), false, dense_dst.get()); diff --git a/paddle/phi/tests/kernels/test_elementwise_dev_api.cc b/paddle/phi/tests/kernels/test_elementwise_dev_api.cc index 3e5f96507415624750eb297953719f397e294230..9552c02976f30d11601967034815545f94ff1f97 100644 --- a/paddle/phi/tests/kernels/test_elementwise_dev_api.cc +++ b/paddle/phi/tests/kernels/test_elementwise_dev_api.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/phi/api/lib/utils/allocator.h" diff --git a/paddle/phi/tests/kernels/test_flatten_dev_api.cc b/paddle/phi/tests/kernels/test_flatten_dev_api.cc index dc283728ee5f761e79c9c396d63121d555139dee..e3f2e8b57e3df48d860734f164f41be95f6f3d96 100644 --- a/paddle/phi/tests/kernels/test_flatten_dev_api.cc +++ b/paddle/phi/tests/kernels/test_flatten_dev_api.cc @@ -58,6 +58,10 @@ TEST(DEV_API, flatten) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); + dev_ctx.SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); dev_ctx.Init(); // 2. test API diff --git a/paddle/phi/tests/kernels/test_mean_dev_api.cc b/paddle/phi/tests/kernels/test_mean_dev_api.cc index 23edfeacaf81436d6381be674c72a27ae96e0b41..ce31b2021e01a4130038e7e26bc37fd3e13ef27a 100644 --- a/paddle/phi/tests/kernels/test_mean_dev_api.cc +++ b/paddle/phi/tests/kernels/test_mean_dev_api.cc @@ -15,7 +15,7 @@ limitations under the License. 
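// A condensed sketch (not from the patch) of the CPU context setup these
// device-API tests now share: a host allocator is registered alongside the
// device allocator before Init(), presumably because the kernels under test
// (copy, flatten, reshape here) stage intermediate data on the host.
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/phi/backends/cpu/cpu_context.h"

static void InitTestCPUContext(phi::CPUContext* dev_ctx) {
  auto* allocator = paddle::memory::allocation::AllocatorFacade::Instance()
                        .GetAllocator(paddle::platform::CPUPlace())
                        .get();
  dev_ctx->SetAllocator(allocator);      // device-side (CPU) allocations
  dev_ctx->SetHostAllocator(allocator);  // host-side staging allocations
  dev_ctx->Init();
}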
*/ #include #include -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/reduce_kernel.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/phi/api/lib/utils/allocator.h" diff --git a/paddle/phi/tests/kernels/test_reshape_dev_api.cc b/paddle/phi/tests/kernels/test_reshape_dev_api.cc index 16ad4fc341be0ac68c571b29ffe182ae5d4c625f..7de039372fa9c2b46d5b6f9b430a816382072449 100644 --- a/paddle/phi/tests/kernels/test_reshape_dev_api.cc +++ b/paddle/phi/tests/kernels/test_reshape_dev_api.cc @@ -50,6 +50,10 @@ TEST(DEV_API, reshape) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); + dev_ctx.SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); dev_ctx.Init(); auto out = phi::Reshape(dev_ctx, dense_x, shape); // 3. check result diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc index c1a8b853b32e38cb32e2081727e102164ffddb08..4800e1402ba56f2956c207f44f2656a71d50b92c 100644 --- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc @@ -64,7 +64,8 @@ void TestConv3dBase(const std::vector& indices, const float diff = 1e-3, const bool backward = false, const std::vector features_grad = {}, - const std::vector kernel_grad = {}) { + const std::vector kernel_grad = {}, + const bool subm = false) { phi::CPUContext dev_ctx_cpu; dev_ctx_cpu.SetAllocator( paddle::memory::allocation::AllocatorFacade::Instance() @@ -114,6 +115,7 @@ void TestConv3dBase(const std::vector& indices, dilations, strides, 1, + subm, &rulebook); ASSERT_EQ(correct_out_dims.size(), out.dims().size()); @@ -130,15 +132,17 @@ void TestConv3dBase(const std::vector& indices, f_verify(out.non_zero_elements().data(), correct_out_features); if (backward) { - std::vector grads = sparse::Conv3dGrad(dev_ctx_cpu, - x_tensor, - rulebook, - kernel_tensor, - out, - paddings, - dilations, - strides, - 1); + std::vector grads = + sparse::Conv3dGrad(dev_ctx_cpu, + x_tensor, + rulebook, + kernel_tensor, + out.non_zero_elements(), + paddings, + dilations, + strides, + 1, + subm); f_verify(grads[0].data(), features_grad); f_verify(grads[1].data(), kernel_grad); } @@ -191,6 +195,7 @@ void TestConv3dBase(const std::vector& indices, dilations, strides, 1, + subm, &d_rulebook); ASSERT_EQ(correct_out_dims.size(), d_out.dims().size()); @@ -227,15 +232,17 @@ void TestConv3dBase(const std::vector& indices, f_verify(h_features_tensor.data(), correct_out_features); if (backward) { - std::vector grads = sparse::Conv3dGrad(dev_ctx_gpu, - d_x_tensor, - d_rulebook, - d_kernel_tensor, - d_out, - paddings, - dilations, - strides, - 1); + std::vector grads = + sparse::Conv3dGrad(dev_ctx_gpu, + d_x_tensor, + d_rulebook, + d_kernel_tensor, + d_out.non_zero_elements(), + paddings, + dilations, + strides, + 1, + subm); DenseTensor h_features_grad = phi::Empty( dev_ctx_cpu, DenseTensorMeta(grads[0].dtype(), grads[0].dims(), grads[0].layout())); @@ -266,7 +273,8 @@ void TestConv3d(const std::vector& indices, const float diff = 1e-3, const bool backward = false, const std::vector features_grad = {}, - const std::vector kernel_grad = {}) { + const std::vector kernel_grad = {}, + const bool subm = false) { // test float TestConv3dBase(indices, features, @@ -283,7 +291,8 @@ void TestConv3d(const std::vector& indices, diff, backward, 
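// Two behavioural changes run through the sparse conv test below: Conv3dGrad
// now receives the output values (out.non_zero_elements()) rather than the
// whole sparse tensor, and a trailing `subm` flag selects submanifold
// convolution. The CPU-side call from the hunk, restated with the template
// argument (assumed <T>) and return type (assumed std::vector<DenseTensor>)
// that the extraction dropped:
//
//   std::vector<DenseTensor> grads =
//       sparse::Conv3dGrad<T>(dev_ctx_cpu,
//                             x_tensor,
//                             rulebook,
//                             kernel_tensor,
//                             out.non_zero_elements(),
//                             paddings,
//                             dilations,
//                             strides,
//                             1,
//                             subm);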
features_grad, - kernel_grad); + kernel_grad, + subm); // test double TestConv3dBase(indices, cast(features), @@ -300,7 +309,8 @@ void TestConv3d(const std::vector& indices, diff, backward, cast(features_grad), - cast(kernel_grad)); + cast(kernel_grad), + subm); } TEST(DEV_API, sparse_conv3d) { @@ -661,5 +671,101 @@ TEST(DEV_API, sparse_conv3d_backward) { kernel_grad); } +TEST(DEV_API, sparse_conv2d_subm) { + const int in_channels = 1; + const int out_channels = 1; + DDim x_dims = {1, 1, 4, 5, in_channels}; + DDim kernel_dims = {1, 3, 3, in_channels, out_channels}; + DDim out_dims = {1, 1, 4, 5, out_channels}; + std::vector paddings = {0, 1, 1}; + std::vector strides = {1, 1, 1}; + std::vector dilations = {1, 1, 1}; + + const int non_zero_num = 4; + std::vector indices_flatten = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 3, 3, 2, 2, 3}; + + std::vector features = {0.8854, 0.6505, -0.1999, 0.3583}; + // 3*3*3=27 + std::vector kernel = { + 0.9364, 0.9460, 0.6564, 0.7999, 0.2013, 0.3812, 0.5474, 0.1016, 0.3368}; + + std::vector out_indices_flatten = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 3, 3, 2, 2, 3}; + + std::vector out_features = {0.1782, 0.2313, 0.7117, 0.5214}; + + std::vector features_grad = {0.0359, 1.2080, 0.5838, 0.4541}; + std::vector kernel_grad = { + 0.3391, 0.4630, 0.0000, -0.1042, 0.3528, 0.2550, 0.0000, -0.0462, 0.0829}; + + TestConv3d(indices_flatten, + features, + x_dims, + kernel, + kernel_dims, + out_indices_flatten, + out_features, + out_dims, + non_zero_num, + paddings, + strides, + dilations, + 1e-3, + true, + features_grad, + kernel_grad, + true); +} + +TEST(DEV_API, sparse_conv3d_subm) { + const int in_channels = 1; + const int out_channels = 1; + DDim x_dims = {1, 4, 4, 5, in_channels}; + DDim kernel_dims = {3, 3, 3, in_channels, out_channels}; + DDim out_dims = {1, 4, 4, 5, out_channels}; + std::vector paddings = {1, 1, 1}; + std::vector strides = {1, 1, 1}; + std::vector dilations = {1, 1, 1}; + + const int non_zero_num = 3; + std::vector indices_flatten = {0, 0, 0, 1, 3, 3, 2, 0, 2, 0, 3, 1}; + + std::vector features = {-0.9578, 0.1572, 0.1036}; + // 3*3*3=27 + std::vector kernel = { + 0.1367, 0.4534, 0.2138, 0.8264, 0.7534, 0.3270, 0.2880, 0.1562, 0.7770, + 0.6902, 0.1981, 0.1369, 0.6582, 0.7582, 0.5640, 0.8894, 0.7350, 0.1845, + 0.6892, 0.3654, 0.6076, 0.0326, 0.8412, 0.5289, 0.9824, 0.8235, 0.9802}; + + std::vector out_indices_flatten = {0, 0, 0, 1, 3, 3, 2, 0, 2, 0, 3, 1}; + + std::vector out_features = {-0.7262, 0.1192, 0.0785}; + + std::vector features_grad = {-0.5506, 0.0904, 0.0595}; + std::vector kernel_grad = { + 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, + 0.0000, 0.0000, 0.0000, 0.0000, 0.7224, 0.0000, 0.0000, 0.0000, 0.0000, + 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000}; + + TestConv3d(indices_flatten, + features, + x_dims, + kernel, + kernel_dims, + out_indices_flatten, + out_features, + out_dims, + non_zero_num, + paddings, + strides, + dilations, + 1e-3, + true, + features_grad, + kernel_grad, + true); +} + } // namespace tests } // namespace phi diff --git a/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc new file mode 100644 index 0000000000000000000000000000000000000000..27673704168c9eace0958db770a2309d10da648c --- /dev/null +++ b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc @@ -0,0 +1,391 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
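// In both subm test cases above the expected output indices equal the input
// indices -- the defining property of submanifold convolution, which pins the
// output sparsity pattern to the input's. A standalone check to that effect
// (illustrative only, not part of the patch):
#include <cassert>
#include <vector>

static void CheckSubmKeepsSparsityPattern(
    const std::vector<int>& in_indices_flatten,
    const std::vector<int>& out_indices_flatten) {
  // Submanifold conv writes outputs only at the input's non-zero positions.
  assert(in_indices_flatten == out_indices_flatten);
}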
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h" +#include "paddle/phi/kernels/sparse/sparse_pool_kernel.h" + +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/phi/api/lib/utils/allocator.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +namespace tests { + +template +std::vector cast(const std::vector& in) { + std::vector out(in.size()); + for (uint64_t i = 0; i < in.size(); i++) { + out[i] = static_cast(in[i]); + } + return out; +} +template +void TestMaxPoolBase(const std::vector& indices, + const std::vector& features, + const DDim& x_dims, + const std::vector& correct_out_indices, + const std::vector& correct_out_features, + const DDim& correct_out_dims, + const int non_zero_num, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& strides, + const std::vector& dilations, + const float diff = 1e-3, + const bool backward = false, + const std::vector features_grad = {}) { + phi::CPUContext dev_ctx_cpu; + dev_ctx_cpu.SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx_cpu.Init(); + + const int in_channels = x_dims[4]; + const int out_channels = in_channels; + + DenseTensor indices_tensor = phi::Empty( + dev_ctx_cpu, + DenseTensorMeta(DataType::INT32, {4, non_zero_num}, DataLayout::NCHW)); + memcpy( + indices_tensor.data(), indices.data(), indices.size() * sizeof(int)); + DenseTensor features_tensor = phi::Empty( + dev_ctx_cpu, + DenseTensorMeta(paddle::experimental::CppTypeToDataType::Type(), + {non_zero_num, in_channels}, + DataLayout::NHWC)); + memcpy( + features_tensor.data(), features.data(), features.size() * sizeof(T)); + + SparseCooTensor x_tensor(indices_tensor, features_tensor, x_dims); + + auto f_verify = [&](const T* real_data, const std::vector& correct_data) { + for (uint64_t i = 0; i < correct_data.size(); i++) { + float tmp = std::fabs(static_cast(correct_data[i] - real_data[i])); + ASSERT_LT(tmp, diff); + } + }; + + if (!std::is_same::value) { + DenseTensor rulebook = phi::Empty( + dev_ctx_cpu, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW)); + SparseCooTensor out = sparse::MaxPool(dev_ctx_cpu, + x_tensor, + kernel_sizes, + paddings, + dilations, + strides, + &rulebook); + + ASSERT_EQ(correct_out_dims.size(), out.dims().size()); + for (int i = 0; i < correct_out_dims.size(); i++) { + ASSERT_EQ(correct_out_dims[i], out.dims()[i]); + } + ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, out.nnz()); + + int cmp_indices = memcmp(correct_out_indices.data(), + out.non_zero_indices().data(), + correct_out_indices.size() * sizeof(int)); + ASSERT_EQ(cmp_indices, 0); + + f_verify(out.non_zero_elements().data(), correct_out_features); + + if (backward) { + DenseTensor x_grad = 
sparse::MaxPoolGrad(dev_ctx_cpu, + x_tensor, + rulebook, + out, + out.non_zero_elements(), + kernel_sizes); + f_verify(x_grad.data(), features_grad); + } + } + +// test gpu +#if defined(PADDLE_WITH_CUDA) + phi::GPUContext dev_ctx_gpu; + dev_ctx_gpu.PartialInitWithoutAllocator(); + dev_ctx_gpu.SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(dev_ctx_gpu.GetPlace(), dev_ctx_gpu.stream()) + .get()); + dev_ctx_gpu.SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::CPUPlace()) + .get()); + dev_ctx_gpu.PartialInitWithAllocator(); + + DenseTensor d_indices_tensor = phi::Empty( + dev_ctx_gpu, + DenseTensorMeta(DataType::INT32, {4, non_zero_num}, DataLayout::NCHW)); + phi::Copy( + dev_ctx_gpu, indices_tensor, phi::GPUPlace(), true, &d_indices_tensor); + + DenseTensor d_features_tensor = phi::Empty( + dev_ctx_gpu, + DenseTensorMeta(paddle::experimental::CppTypeToDataType::Type(), + {non_zero_num, in_channels}, + DataLayout::NHWC)); + phi::Copy( + dev_ctx_gpu, features_tensor, phi::GPUPlace(), true, &d_features_tensor); + + SparseCooTensor d_x_tensor(d_indices_tensor, d_features_tensor, x_dims); + + DenseTensor d_rulebook = phi::Empty( + dev_ctx_gpu, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW)); + SparseCooTensor d_out = sparse::MaxPool(dev_ctx_gpu, + d_x_tensor, + kernel_sizes, + paddings, + dilations, + strides, + &d_rulebook); + + ASSERT_EQ(correct_out_dims.size(), d_out.dims().size()); + ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, d_out.nnz()); + for (int i = 0; i < correct_out_dims.size(); i++) { + ASSERT_EQ(correct_out_dims[i], d_out.dims()[i]); + } + + DenseTensor h_indices_tensor = phi::Empty( + dev_ctx_cpu, + DenseTensorMeta(DataType::INT32, {4, d_out.nnz()}, DataLayout::NCHW)); + phi::Copy(dev_ctx_gpu, + d_out.non_zero_indices(), + phi::CPUPlace(), + true, + &h_indices_tensor); + + int cmp_indices2 = memcmp(correct_out_indices.data(), + h_indices_tensor.data(), + correct_out_indices.size() * sizeof(int)); + ASSERT_EQ(cmp_indices2, 0); + + DenseTensor h_features_tensor = phi::Empty( + dev_ctx_cpu, + DenseTensorMeta(paddle::experimental::CppTypeToDataType::Type(), + {d_out.nnz()}, + d_out.layout())); + + phi::Copy(dev_ctx_gpu, + d_out.non_zero_elements(), + phi::CPUPlace(), + true, + &h_features_tensor); + f_verify(h_features_tensor.data(), correct_out_features); + + if (backward) { + DenseTensor x_grad = sparse::MaxPoolGrad(dev_ctx_gpu, + d_x_tensor, + d_rulebook, + d_out, + d_out.non_zero_elements(), + kernel_sizes); + DenseTensor h_features_grad = phi::Empty( + dev_ctx_cpu, + DenseTensorMeta(x_grad.dtype(), x_grad.dims(), x_grad.layout())); + phi::Copy(dev_ctx_gpu, x_grad, phi::CPUPlace(), true, &h_features_grad); + f_verify(h_features_grad.data(), features_grad); + } +#endif +} + +void TestMaxPool(const std::vector& indices, + const std::vector& features, + const DDim& x_dims, + const std::vector& correct_out_indices, + const std::vector& correct_out_features, + const DDim& correct_out_dims, + const int non_zero_num, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& strides, + const std::vector& dilations, + const float diff = 1e-3, + const bool backward = false, + const std::vector features_grad = {}) { + // test float + TestMaxPoolBase(indices, + features, + x_dims, + correct_out_indices, + correct_out_features, + correct_out_dims, + non_zero_num, + kernel_sizes, + paddings, + strides, + dilations, + diff, + backward, + features_grad); 
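// The forward/backward calls exercised in this new test, restated with the
// template argument (assumed <T>) that the extraction dropped. `rulebook` is
// an out-parameter filled by the forward pass and consumed by the backward
// pass; in this test the pooled values themselves are passed back in as the
// upstream gradient:
//
//   SparseCooTensor out = sparse::MaxPool<T>(
//       dev_ctx_cpu, x_tensor, kernel_sizes, paddings, dilations, strides,
//       &rulebook);
//   DenseTensor x_grad = sparse::MaxPoolGrad<T>(
//       dev_ctx_cpu, x_tensor, rulebook, out, out.non_zero_elements(),
//       kernel_sizes);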
+ // test double + TestMaxPoolBase(indices, + cast(features), + x_dims, + correct_out_indices, + cast(correct_out_features), + correct_out_dims, + non_zero_num, + kernel_sizes, + paddings, + strides, + dilations, + diff, + backward, + cast(features_grad)); +} + +TEST(DEV_API, sparse_maxpool) { + const int channels = 1; + DDim x_dims = {1, 1, 4, 4, channels}; + DDim out_dims = {1, 1, 2, 2, channels}; + std::vector kernel_sizes = {1, 3, 3}; + std::vector paddings = {0, 0, 0}; + std::vector strides = {1, 1, 1}; + std::vector dilations = {1, 1, 1}; + + const int non_zero_num = 3; + std::vector indices = {0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 1, 2}; + std::vector features = {1, 2, 3}; + std::vector out_indices = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, + }; + std::vector out_features = {2, 2, 3, 3}; + std::vector x_grad = {0, 4, 6}; + + TestMaxPool(indices, + features, + x_dims, + out_indices, + out_features, + out_dims, + non_zero_num, + kernel_sizes, + paddings, + strides, + dilations, + 1e-6, + true, + x_grad); +} + +TEST(DEV_API, sparse_maxpool_stride) { + const int channels = 1; + DDim x_dims = {1, 1, 4, 4, channels}; + DDim out_dims = {1, 1, 1, 1, channels}; + std::vector kernel_sizes = {1, 3, 3}; + std::vector paddings = {0, 0, 0}; + std::vector strides = {2, 2, 2}; + std::vector dilations = {1, 1, 1}; + + const int non_zero_num = 3; + std::vector indices = {0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 1, 2}; + std::vector features = {1, 2, 3}; + std::vector out_indices = {0, 0, 0, 0}; + std::vector out_features = {2}; + std::vector x_grad = {0, 2, 0}; + + TestMaxPool(indices, + features, + x_dims, + out_indices, + out_features, + out_dims, + non_zero_num, + kernel_sizes, + paddings, + strides, + dilations, + 1e-6, + true, + x_grad); +} + +TEST(DEV_API, sparse_maxpool_channel) { + const int channels = 2; + DDim x_dims = {1, 1, 4, 4, channels}; + DDim out_dims = {1, 1, 2, 2, channels}; + std::vector kernel_sizes = {1, 3, 3}; + std::vector paddings = {0, 0, 0}; + std::vector strides = {1, 1, 1}; + std::vector dilations = {1, 1, 1}; + + const int non_zero_num = 3; + std::vector indices = {0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 1, 2}; + std::vector features = {1, 1, 2, 2, 3, 3}; + std::vector out_indices = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, + }; + std::vector out_features = {2, 2, 2, 2, 3, 3, 3, 3}; + std::vector x_grad = {0, 0, 4, 4, 6, 6}; + + TestMaxPool(indices, + features, + x_dims, + out_indices, + out_features, + out_dims, + non_zero_num, + kernel_sizes, + paddings, + strides, + dilations, + 1e-6, + true, + x_grad); +} + +TEST(DEV_API, sparse_maxpool3d) { + const int channels = 2; + DDim x_dims = {1, 5, 4, 4, channels}; + DDim out_dims = {1, 3, 2, 2, channels}; + std::vector kernel_sizes = {3, 3, 3}; + std::vector paddings = {0, 0, 0}; + std::vector strides = {1, 1, 1}; + std::vector dilations = {1, 1, 1}; + + const int non_zero_num = 3; + std::vector indices = {0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 1, 2}; + std::vector features = {1, 1, 2, 2, 3, 3}; + std::vector out_indices = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, + }; + std::vector out_features = {2, 2, 2, 2, 3, 3, 3, 3}; + std::vector x_grad = {0, 0, 4, 4, 6, 6}; + + TestMaxPool(indices, + features, + x_dims, + out_indices, + out_features, + out_dims, + non_zero_num, + kernel_sizes, + paddings, + strides, + dilations, + 1e-6, + true, + x_grad); +} + +} // namespace tests +} // namespace phi diff --git a/paddle/phi/tests/kernels/test_sum_dev_api.cc b/paddle/phi/tests/kernels/test_sum_dev_api.cc index 
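// Reading the sparse_maxpool expectations above: indices are stored as four
// rows of length nnz -- apparently (batch, depth, height, width) for the
// NDHWC input of shape {1, 1, 4, 4, 1} -- so the three non-zeros sit at
// (h, w) = (0, 0), (1, 1) and (3, 2) with values 1, 2 and 3. A 1x3x3 window
// with stride 1 and no padding yields a 2x2 output:
//   out(0,0) = max(1, 2) = 2   out(0,1) = max(2)    = 2
//   out(1,0) = max(2, 3) = 3   out(1,1) = max(2, 3) = 3
// which matches out_features = {2, 2, 3, 3}. Because the test feeds the
// pooled values back in as the upstream gradient, each input accumulates the
// output value of every window it wins: value 1 never wins (grad 0), value 2
// wins two windows (2 + 2 = 4), value 3 wins two windows (3 + 3 = 6), i.e.
// x_grad = {0, 4, 6}.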
dfec291bc072f023dd09dba768cdeeb6e4cc3a34..82fa90c1574bd5c358d9e2325349811d43f5d973 100644 --- a/paddle/phi/tests/kernels/test_sum_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sum_dev_api.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/reduce_kernel.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/phi/api/lib/utils/allocator.h" diff --git a/paddle/phi/tests/ops/test_op_signature.cc b/paddle/phi/tests/ops/test_op_signature.cc index 88c9193a8f8949bd6f315c9c4bdf89d6029a8696..36923972ea4145a63101f84eeb5da76d73ffce75 100644 --- a/paddle/phi/tests/ops/test_op_signature.cc +++ b/paddle/phi/tests/ops/test_op_signature.cc @@ -484,5 +484,98 @@ TEST(ARG_MAP, set_value) { "set_value"); } +TEST(ARG_MAP, set_value_grad) { + TestArgumentMappingContext arg_case( + {"Out@GRAD", "StartsTensorList", "EndsTensorList"}, + {}, + {}, + {"Input@GRAD", "ValueTensor@GRAD"}, + {}); + ASSERT_EQ(OpUtilsMap::Instance() + .GetArgumentMappingFn("set_value_grad")(arg_case) + .name, + "set_value_grad"); + + TestArgumentMappingContext arg_case1( + {"Out@GRAD", "StartsTensorList", "StepsTensorList"}, + {}, + {}, + {"Input@GRAD", "ValueTensor@GRAD"}, + {}); + ASSERT_EQ(OpUtilsMap::Instance() + .GetArgumentMappingFn("set_value_grad")(arg_case1) + .name, + "set_value_grad"); + + TestArgumentMappingContext arg_case2({"Out@GRAD", "StartsTensorList"}, + {}, + {}, + {"Input@GRAD", "ValueTensor@GRAD"}, + {}); + ASSERT_EQ(OpUtilsMap::Instance() + .GetArgumentMappingFn("set_value_grad")(arg_case2) + .name, + "set_value_grad"); + + TestArgumentMappingContext arg_case3( + {"Out@GRAD", "EndsTensorList", "StepsTensorList"}, + {}, + {}, + {"Input@GRAD", "ValueTensor@GRAD"}, + {}); + ASSERT_EQ(OpUtilsMap::Instance() + .GetArgumentMappingFn("set_value_grad")(arg_case3) + .name, + "set_value_grad"); + + TestArgumentMappingContext arg_case4({"Out@GRAD", "EndsTensorList"}, + {}, + {}, + {"Input@GRAD", "ValueTensor@GRAD"}, + {}); + ASSERT_EQ(OpUtilsMap::Instance() + .GetArgumentMappingFn("set_value_grad")(arg_case4) + .name, + "set_value_grad"); + + TestArgumentMappingContext arg_case5({"Out@GRAD", "StepsTensorList"}, + {}, + {}, + {"Input@GRAD", "ValueTensor@GRAD"}, + {}); + ASSERT_EQ(OpUtilsMap::Instance() + .GetArgumentMappingFn("set_value_grad")(arg_case5) + .name, + "set_value_grad"); +} + +TEST(ARG_MAP, allclose) { + TestArgumentMappingContext arg_case1( + {"Input", "Other", "Rtol"}, + {}, + {{"atol", paddle::any(std::string{"1e-8"})}, + {"equal_nan", paddle::any(false)}}, + {"Out"}, + {}); + auto signature1 = + OpUtilsMap::Instance().GetArgumentMappingFn("allclose")(arg_case1); + ASSERT_EQ(signature1.name, "allclose"); + auto attr_names1 = std::get<1>(signature1.args); + ASSERT_EQ(attr_names1[0], "Rtol"); + + TestArgumentMappingContext arg_case2( + {"Input", "Other", "Atol"}, + {}, + {{"rtol", paddle::any(std::string{"1e-5"})}, + {"equal_nan", paddle::any(false)}}, + {"Out"}, + {}); + auto signature2 = + OpUtilsMap::Instance().GetArgumentMappingFn("allclose")(arg_case2); + ASSERT_EQ(signature2.name, "allclose"); + auto attr_names2 = std::get<1>(signature2.args); + ASSERT_EQ(attr_names2[1], "Atol"); +} + } // namespace tests } // namespace phi diff --git a/paddle/phi/tests/ops/test_op_signature.h b/paddle/phi/tests/ops/test_op_signature.h index 06048f33d940a28ddf9e3aa488a6e24a9e4a93b6..8468dad10eb64a066cc11dafa125dde3174b7e30 100644 --- a/paddle/phi/tests/ops/test_op_signature.h +++ 
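// An illustrative drop-in for the same test file (not part of the patch),
// exercising one of the mappings registered earlier in this diff with the
// harness used above. Judging from the existing cases, the brace lists of
// TestArgumentMappingContext are dense inputs, selected-rows inputs,
// attributes, outputs, and selected-rows outputs, in that order -- an
// assumption, not a documented guarantee.
TEST(ARG_MAP, roll_with_shifts_tensor_sketch) {
  TestArgumentMappingContext arg_case(
      {"X", "ShiftsTensor"}, {}, {}, {"Out"}, {});
  ASSERT_EQ(
      OpUtilsMap::Instance().GetArgumentMappingFn("roll")(arg_case).name,
      "roll");
}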
b/paddle/phi/tests/ops/test_op_signature.h @@ -72,6 +72,11 @@ class TestArgumentMappingContext : public phi::ArgumentMappingContext { return selected_rows_inputs.count(name) > 0; } + // add member if needed + bool IsDenseTensorVectorInput(const std::string& name) const override { + return false; + } + bool IsDenseTensorOutput(const std::string& name) const override { return dense_tensor_outputs.count(name) > 0; } diff --git a/paddle/scripts/infrt_build.sh b/paddle/scripts/infrt_build.sh index 76b45ff89f1869cc5e401b5c1b4151ad14158259..1b259023f94df7279066533bb6c182a644b4e9c2 100755 --- a/paddle/scripts/infrt_build.sh +++ b/paddle/scripts/infrt_build.sh @@ -44,6 +44,9 @@ function update_pd_ops() { cd ${PADDLE_ROOT}/tools/infrt/ python3 generate_pd_op_dialect_from_paddle_op_maker.py python3 generate_phi_kernel_dialect.py + # generate test model + cd ${PADDLE_ROOT} + python3 paddle/infrt/tests/model/abs_model.py ${PADDLE_ROOT}/build/paddle/infrt/tests/abs } function init() { @@ -93,7 +96,7 @@ function infrt_gen_and_build() { exit 7; fi - make -j ${parallel_number} infrt infrtopt infrtexec test_infrt_exec trt-exec phi-ir-exec phi-exec infrt_lib_dist paddle-mlir-convert;build_error=$? + make -j ${parallel_number} infrt infrtopt infrtexec test_infrt_exec trt-exec phi-exec infrt_lib_dist paddle-mlir-convert;build_error=$? if [ "$build_error" != 0 ];then exit 7; fi diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 35b2ce751b18fff2aac8dedfd09e5fe209d95533..78a863040ade1a43e9de660bff59f5179535abef 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -55,7 +55,6 @@ wmic process where name="python.exe" call terminate 2>NUL rem ------initialize common variable------ if not defined GENERATOR set GENERATOR="Visual Studio 15 2017 Win64" -if not defined BRANCH set BRANCH=develop if not defined WITH_TENSORRT set WITH_TENSORRT=ON if not defined TENSORRT_ROOT set TENSORRT_ROOT=D:/TensorRT if not defined CUDA_ARCH_NAME set CUDA_ARCH_NAME=Auto @@ -66,10 +65,10 @@ if not defined WITH_TESTING set WITH_TESTING=ON if not defined MSVC_STATIC_CRT set MSVC_STATIC_CRT=ON if not defined WITH_PYTHON set WITH_PYTHON=ON if not defined ON_INFER set ON_INFER=ON +if not defined WITH_ONNXRUNTIME set WITH_ONNXRUNTIME=OFF if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=ON if not defined WITH_STATIC_LIB set WITH_STATIC_LIB=ON if not defined WITH_TPCACHE set WITH_TPCACHE=OFF -if not defined WITH_CLCACHE set WITH_CLCACHE=OFF if not defined WITH_CACHE set WITH_CACHE=OFF if not defined WITH_SCCACHE set WITH_SCCACHE=OFF if not defined WITH_UNITY_BUILD set WITH_UNITY_BUILD=OFF @@ -144,17 +143,6 @@ if %day_now% NEQ %day_before% ( echo %day_now% > %cache_dir%\day.txt type %cache_dir%\day.txt rmdir %BUILD_DIR% /s/q - - : clear third party cache every once in a while - if %day_now% EQU 21 ( - rmdir %cache_dir%\third_party /s/q - ) - if %day_now% EQU 11 ( - rmdir %cache_dir%\third_party /s/q - ) - if %day_now% EQU 01 ( - rmdir %cache_dir%\third_party /s/q - ) goto :mkbuild ) @@ -211,6 +199,7 @@ echo There is not sccache in this PC, will install sccache. 
echo Download package from https://paddle-ci.gz.bcebos.com/window_requirement/sccache.exe %PYTHON_ROOT%\python.exe -c "import wget;wget.download('https://paddle-ci.gz.bcebos.com/window_requirement/sccache.exe')" xcopy sccache.exe %PYTHON_ROOT%\ /Y +del sccache.exe goto:eof rem -------Caching strategy 2: End -------------------------------- @@ -231,13 +220,12 @@ set WITH_AVX=ON set MSVC_STATIC_CRT=OFF set ON_INFER=OFF set WITH_TENSORRT=ON +set WITH_INFERENCE_API_TEST=OFF call :cmake || goto cmake_error call :build || goto build_error call :test_whl_pacakage || goto test_whl_pacakage_error call :test_unit || goto test_unit_error -:: call :test_inference || goto test_inference_error -:: call :check_change_of_unittest || goto check_change_of_unittest_error goto:success rem ------PR CI windows check for OPENBLAS/CPU------ @@ -253,8 +241,6 @@ call :cmake || goto cmake_error call :build || goto build_error call :test_whl_pacakage || goto test_whl_pacakage_error call :test_unit || goto test_unit_error -:: call :test_inference || goto test_inference_error -:: call :check_change_of_unittest || goto check_change_of_unittest_error goto:success rem ------PR CI windows check for unittests and inference in CUDA11-MKL-AVX---------- @@ -264,7 +250,6 @@ set WITH_GPU=ON set WITH_AVX=ON set MSVC_STATIC_CRT=ON set ON_INFER=ON -set WITH_TESTING=ON set WITH_TENSORRT=ON set WITH_INFERENCE_API_TEST=ON @@ -273,7 +258,8 @@ call :build || goto build_error call :test_whl_pacakage || goto test_whl_pacakage_error call :test_unit || goto test_unit_error ::call :test_inference || goto test_inference_error -:: call :check_change_of_unittest || goto check_change_of_unittest_error +::call :test_inference_ut || goto test_inference_ut_error +call :check_change_of_unittest || goto check_change_of_unittest_error goto:success rem ------Build windows avx whl package------ @@ -364,18 +350,6 @@ if "%WITH_GPU%"=="ON" ( nvidia-smi 2>NUL ) -rem ------pre install clcache and init config---------- -rem pip install clcache --user -pip uninstall -y clcache -:: set USE_CLCACHE to enable clcache -rem set USE_CLCACHE=1 -:: In some scenarios, CLCACHE_HARDLINK can save one file copy. 
-rem set CLCACHE_HARDLINK=1 -:: If it takes more than 1000s to obtain the right to use the cache, an error will be reported -rem set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000 -:: set maximum cache size to 20G -rem clcache.exe -M 21474836480 - rem ------set third_party cache dir------ if "%WITH_TPCACHE%"=="OFF" ( @@ -383,6 +357,25 @@ if "%WITH_TPCACHE%"=="OFF" ( goto :cmake_impl ) +rem clear third party cache every ten days +for /F %%# in ('wmic os get localdatetime^|findstr 20') do set datetime=%%# +set day_now=%datetime:~6,2% +set day_before=-1 +set /p day_before=< %cache_dir%\day_third_party.txt +if %day_now% NEQ %day_before% ( + echo %day_now% > %cache_dir%\day_third_party.txt + type %cache_dir%\day_third_party.txt + if %day_now% EQU 21 ( + rmdir %cache_dir%\third_party /s/q + ) + if %day_now% EQU 11 ( + rmdir %cache_dir%\third_party /s/q + ) + if %day_now% EQU 01 ( + rmdir %cache_dir%\third_party /s/q + ) +) + echo set -ex > cache.sh echo md5_content=$(cat %work_dir:\=/%/cmake/external/*.cmake ^|md5sum ^| awk '{print $1}') >> cache.sh echo echo ${md5_content}^>md5.txt >> cache.sh @@ -534,11 +527,7 @@ echo Build Paddle the %build_times% time: if %GENERATOR% == "Ninja" ( ninja all ) else ( - if "%WITH_CLCACHE%"=="OFF" ( - MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:TrackFileAccess=false /p:Configuration=Release /verbosity:%LOG_LEVEL% ALL_BUILD.vcxproj - ) else ( - MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:TrackFileAccess=false /p:CLToolExe=clcache.exe /p:CLToolPath=%PYTHON_ROOT%\Scripts /p:Configuration=Release /verbosity:%LOG_LEVEL% ALL_BUILD.vcxproj - ) + MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:TrackFileAccess=false /p:Configuration=Release /verbosity:%LOG_LEVEL% ALL_BUILD.vcxproj ) if %ERRORLEVEL% NEQ 0 ( @@ -757,7 +746,7 @@ for /F %%i in ("%libsize%") do ( ) cd /d %work_dir%\paddle\fluid\inference\api\demo_ci -%cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo %WITH_TENSORRT% %TENSORRT_ROOT% %MSVC_STATIC_CRT% +%cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo %WITH_TENSORRT% %TENSORRT_ROOT% %WITH_ONNXRUNTIME% %MSVC_STATIC_CRT% goto:eof :test_inference_error @@ -773,77 +762,8 @@ echo ======================================== echo Step 6. Check whether deleting a unit test ... echo ======================================== -cd /d %work_dir%\%BUILD_DIR% -echo set -e> check_change_of_unittest.sh -echo set +x>> check_change_of_unittest.sh -echo GITHUB_API_TOKEN=%GITHUB_API_TOKEN% >> check_change_of_unittest.sh -echo GIT_PR_ID=%AGILE_PULL_ID% >> check_change_of_unittest.sh -echo BRANCH=%BRANCH%>> check_change_of_unittest.sh -echo if [ "${GITHUB_API_TOKEN}" == "" ] ^|^| [ "${GIT_PR_ID}" == "" ];then>> check_change_of_unittest.sh -echo exit 0 >> check_change_of_unittest.sh -echo fi>> check_change_of_unittest.sh -echo set -x>> check_change_of_unittest.sh -echo cat ^<^> check_change_of_unittest.sh -echo ============================================ >> check_change_of_unittest.sh -echo Generate unit tests.spec of this PR. 
>> check_change_of_unittest.sh -echo ============================================ >> check_change_of_unittest.sh -echo EOF>> check_change_of_unittest.sh -echo spec_path=$(pwd)/UNITTEST_PR.spec>> check_change_of_unittest.sh -echo ctest -N ^| awk -F ':' '{print $2}' ^| sed '/^^$/d' ^| sed '$d' ^> ${spec_path}>> check_change_of_unittest.sh -echo num=$(awk 'END{print NR}' ${spec_path})>> check_change_of_unittest.sh -echo echo "Windows 1 card TestCases count is $num">> check_change_of_unittest.sh -echo echo ipipe_log_param_Windows_1_Card_TestCases_Count: $num>> check_change_of_unittest.sh -echo UPSTREAM_URL='https://github.com/PaddlePaddle/Paddle'>> check_change_of_unittest.sh -echo origin_upstream_url=`git remote -v ^| awk '{print $1, $2}' ^| uniq ^| grep upstream ^| awk '{print $2}'`>> check_change_of_unittest.sh -echo if [ "$origin_upstream_url" == "" ]; then>> check_change_of_unittest.sh -echo git remote add upstream $UPSTREAM_URL.git>> check_change_of_unittest.sh -echo elif [ "$origin_upstream_url" ^!= "$UPSTREAM_URL" ] ^\>> check_change_of_unittest.sh -echo ^&^& [ "$origin_upstream_url" ^!= "$UPSTREAM_URL.git" ]; then>> check_change_of_unittest.sh -echo git remote remove upstream>> check_change_of_unittest.sh -echo git remote add upstream $UPSTREAM_URL.git>> check_change_of_unittest.sh -echo fi>> check_change_of_unittest.sh -echo if [ ! -e "$(pwd)/../.git/refs/remotes/upstream/$BRANCH" ]; then>> check_change_of_unittest.sh -echo git fetch upstream $BRANCH # develop is not fetched>> check_change_of_unittest.sh -echo fi>> check_change_of_unittest.sh -echo git checkout -b origin_pr >> check_change_of_unittest.sh -echo git checkout -f $BRANCH >> check_change_of_unittest.sh -echo cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ --DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DON_INFER=%ON_INFER% ^ --DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ --DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^ --DWITH_TENSORRT=%WITH_TENSORRT% -DTENSORRT_ROOT="%TENSORRT_ROOT%" -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT% ^ --DWITH_UNITY_BUILD=%WITH_UNITY_BUILD% -DCUDA_ARCH_NAME=%CUDA_ARCH_NAME% >> check_change_of_unittest.sh -echo cat ^<^> check_change_of_unittest.sh -echo ============================================ >> check_change_of_unittest.sh -echo Generate unit tests.spec of develop. 
>> check_change_of_unittest.sh -echo ============================================ >> check_change_of_unittest.sh -echo EOF>> check_change_of_unittest.sh -echo spec_path=$(pwd)/UNITTEST_DEV.spec>> check_change_of_unittest.sh -echo ctest -N ^| awk -F ':' '{print $2}' ^| sed '/^^$/d' ^| sed '$d' ^> ${spec_path}>> check_change_of_unittest.sh -echo unittest_spec_diff=`python $(pwd)/../tools/diff_unittest.py $(pwd)/UNITTEST_DEV.spec $(pwd)/UNITTEST_PR.spec`>> check_change_of_unittest.sh -echo if [ "$unittest_spec_diff" ^!= "" ]; then>> check_change_of_unittest.sh -echo set +x>> check_change_of_unittest.sh -echo approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000`>> check_change_of_unittest.sh -echo set -x>> check_change_of_unittest.sh -echo if [ "$approval_line" ^!= "" ]; then>> check_change_of_unittest.sh -echo APPROVALS=`echo ${approval_line} ^|python $(pwd)/../tools/check_pr_approval.py 1 22165420 52485244 6836917`>> check_change_of_unittest.sh -echo echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}">> check_change_of_unittest.sh -echo if [ "${APPROVALS}" == "FALSE" ]; then>> check_change_of_unittest.sh -echo echo "************************************" >> check_change_of_unittest.sh -echo echo -e "It is forbidden to disable or delete the unit-test.\n" >> check_change_of_unittest.sh -echo echo -e "If you must delete it temporarily, please add it to[https://github.com/PaddlePaddle/Paddle/wiki/Temporarily-disabled-Unit-Test]." >> check_change_of_unittest.sh -echo echo -e "Then you must have one RD (kolinwei(recommended) or zhouwei25) approval for the deletion of unit-test. \n" >> check_change_of_unittest.sh -echo echo -e "If you have any problems about deleting unit-test, please read the specification [https://github.com/PaddlePaddle/Paddle/wiki/Deleting-unit-test-is-forbidden]. \n" >> check_change_of_unittest.sh -echo echo -e "Following unit-tests are deleted in this PR: \n ${unittest_spec_diff} \n" >> check_change_of_unittest.sh -echo echo "************************************" >> check_change_of_unittest.sh -echo exit 1 >> check_change_of_unittest.sh -echo fi>> check_change_of_unittest.sh -echo else>> check_change_of_unittest.sh -echo exit 1 >> check_change_of_unittest.sh -echo fi>> check_change_of_unittest.sh -echo fi>> check_change_of_unittest.sh -echo git checkout -f origin_pr >> check_change_of_unittest.sh -%cache_dir%\tools\busybox64.exe bash check_change_of_unittest.sh +%cache_dir%\tools\busybox64.exe bash %work_dir%\tools\windows\check_change_of_unittest.sh + goto:eof :check_change_of_unittest_error @@ -857,7 +777,7 @@ echo Step 7. Testing fluid library with infer_ut for inference ... 
echo ======================================== cd /d %work_dir%\paddle\fluid\inference\tests\infer_ut -%cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo %TENSORRT_ROOT% %MSVC_STATIC_CRT% +%cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo %TENSORRT_ROOT% %WITH_ONNXRUNTIME% %MSVC_STATIC_CRT% goto:eof :test_inference_ut_error diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 84f7a57999fd66a6c24ae3ccf88c93f9beaa97e5..39676b916e50470ac9774f3564b4bdc3a8fcb20f 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -229,6 +229,7 @@ function cmake_base() { -DWITH_CNCL=${WITH_CNCL:-OFF} -DWITH_XPU=${WITH_XPU:-OFF} -DWITH_MLU=${WITH_MLU:-OFF} + -DWITH_IPU=${WITH_IPU:-OFF} -DLITE_GIT_TAG=release/v2.10 -DWITH_UNITY_BUILD=${WITH_UNITY_BUILD:-OFF} -DWITH_XPU_BKCL=${WITH_XPU_BKCL:-OFF} @@ -280,6 +281,7 @@ EOF -DLITE_GIT_TAG=release/v2.10 \ -DWITH_XPU=${WITH_XPU:-OFF} \ -DWITH_MLU=${WITH_MLU:-OFF} \ + -DWITH_IPU=${WITH_IPU:-OFF} \ -DWITH_CNCL=${WITH_CNCL:-OFF} \ -DXPU_SDK_ROOT=${XPU_SDK_ROOT:-""} \ -DWITH_LITE=${WITH_LITE:-OFF} \ @@ -948,8 +950,17 @@ function generate_upstream_develop_api_spec() { git checkout . git checkout -b develop_base_pr upstream/$BRANCH startTime_firstBuild=`date +%s` - cmake_gen $1 - build $2 + + dev_commit=`git log -1|head -1|awk '{print $2}'` + dev_url="https://xly-devops.bj.bcebos.com/PR/build_whl/0/${dev_commit}/paddlepaddle_gpu-0.0.0-cp37-cp37m-linux_x86_64.whl" + url_return=`curl -s -m 5 -IL ${dev_url} |awk 'NR==1{print $2}'` + if [ "$url_return" == '200' ];then + mkdir -p ${PADDLE_ROOT}/build/python/dist && wget -q -P ${PADDLE_ROOT}/build/python/dist ${dev_url} + else + cmake_gen $1 + build $2 + fi + cp ${PADDLE_ROOT}/python/requirements.txt /tmp pr_whl_size=`du -m ${PADDLE_ROOT}/build/python/dist/*.whl|awk '{print $1}'` echo "pr_whl_size: ${pr_whl_size}" @@ -1274,6 +1285,8 @@ function card_test() { CUDA_DEVICE_COUNT=$(rocm-smi -i | grep GPU | wc -l) elif [ "${WITH_MLU}" == "ON" ];then CUDA_DEVICE_COUNT=1 + elif [ "${WITH_IPU}" == "ON" ];then + CUDA_DEVICE_COUNT=1 else CUDA_DEVICE_COUNT=$(nvidia-smi -L | wc -l) fi @@ -2231,6 +2244,130 @@ set -ex fi } +function parallel_test_base_ipu() { + mkdir -p ${PADDLE_ROOT}/build + cd ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/ipu + if [ ${WITH_TESTING:-ON} == "ON" ] ; then + cat <> ${PADDLE_ROOT}/build/build_summary.txt + ut_actual_total_endTime_s=`date +%s` + echo "ipipe_log_param_actual_TestCases_Total_Time: $[ $ut_actual_total_endTime_s - $ut_actual_total_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt + if [[ "$EXIT_CODE" != "0" ]]; then + show_ut_retry_result + fi +set -ex + fi +} + function parallel_test() { mkdir -p ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build @@ -2248,6 +2385,8 @@ function parallel_test() { parallel_test_base_npu elif [ "$WITH_MLU" == "ON" ];then parallel_test_base_mlu + elif [ "$WITH_IPU" == "ON" ];then + parallel_test_base_ipu else parallel_test_base_cpu ${PROC_RUN:-1} fi @@ -3013,6 +3152,11 @@ function main() { parallel_test check_coverage ;; + check_ipu_coverage) + cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} + parallel_test + check_coverage + ;; reuse_so_cicheck_py35) reuse_so_cache parallel_test diff --git a/python/paddle/autograd/backward_mode.py b/python/paddle/autograd/backward_mode.py index 36ca048c51210ff7c12679731653ce026206b3c6..6fc6f7d3d494a28b822c3044716ec66867538a3d 100644 --- 
a/python/paddle/autograd/backward_mode.py +++ b/python/paddle/autograd/backward_mode.py @@ -81,15 +81,14 @@ def backward(tensors, grad_tensors=None, retain_graph=False): if isinstance(in_out_list, (list, tuple)): assert len(in_out_list) > 0, "{} connot be empyt".format(name) for each_var in in_out_list: - assert isinstance( - each_var, paddle. - Tensor), "Elements of {} must be paddle.Tensor".format(name) + assert isinstance(each_var, ( + paddle.Tensor, core.eager.Tensor + )), "Elements of {} must be paddle.Tensor".format(name) return in_out_list else: - assert isinstance( - in_out_list, - paddle.Tensor), "{} must be Tensor or list of Tensor".format( - name) + assert isinstance(in_out_list, ( + paddle.Tensor, core.eager.Tensor + )), "{} must be Tensor or list of Tensor".format(name) return [in_out_list] tensors = check_tensors(tensors, "tensors") @@ -105,10 +104,13 @@ def backward(tensors, grad_tensors=None, retain_graph=False): for each_tensor in grad_tensors: if each_tensor is not None: assert isinstance( - each_tensor, paddle.Tensor + each_tensor, (paddle.Tensor, core.eager.Tensor) ), "The argument 'grad_tensors' of paddle.autograd.backward is invalid, it can be 'None', 'paddle.Tensor' or 'list[None/paddle.Tensor]'." else: - grad_tensors = [None] * len(tensors) + if core._in_eager_mode(): + grad_tensors = [] + else: + grad_tensors = [None] * len(tensors) if len(grad_tensors) > 0: assert len(tensors) == len( @@ -116,5 +118,8 @@ def backward(tensors, grad_tensors=None, retain_graph=False): assert isinstance(retain_graph, bool), "retain_graph must be True or False" - core.dygraph_run_backward(tensors, grad_tensors, retain_graph, - framework._dygraph_tracer()) + if core._in_eager_mode(): + core.eager.run_backward(tensors, grad_tensors, retain_graph) + else: + core.dygraph_run_backward(tensors, grad_tensors, retain_graph, + framework._dygraph_tracer()) diff --git a/python/paddle/distributed/auto_parallel/completion.py b/python/paddle/distributed/auto_parallel/completion.py index ae2d9163435b906f17e9b28a680302d2bd305bbc..e303ce1216822b26bb58813c37239ae3e3fec043 100644 --- a/python/paddle/distributed/auto_parallel/completion.py +++ b/python/paddle/distributed/auto_parallel/completion.py @@ -21,11 +21,12 @@ from paddle.fluid import framework from .utils import print_program_with_dist_attr from .operators import find_best_compatible_distributed_operator_impl -from .dist_context import get_default_distributed_context +from .dist_context import get_default_distributed_context, _node_id from .dist_tensor import DistributedTensor from .dist_op import DistributedOperator from .dist_attribute import TensorDistributedAttribute from .dist_attribute import OperatorDistributedAttribute +from .process_mesh import ProcessMesh from paddle.distributed.fleet.meta_optimizers.common import OpRole @@ -108,6 +109,20 @@ def compute_compatible_dims_mapping(dims_mapping_list): return compatible_result +def merge_process_mesh_two(pm1, pm2): + process_set1 = set() + process_set2 = set() + if pm1 is None and pm2 is None: + return None + if pm1 is not None: + process_set1 = set(pm1.processes) + if pm2 is not None: + process_set2 = set(pm2.processes) + merged_process_set = process_set1.union(process_set2) + merged_process_mesh = ProcessMesh(list(merged_process_set)) + return merged_process_mesh + + class Completer: def __init__(self, dist_context): assert dist_context is not None @@ -119,7 +134,9 @@ class Completer: return False tensor_desc = tensor_node.var() # Skip reader tensor - if tensor_desc.type() == 
core.VarDesc.VarType.READER: + if tensor_desc.type() == core.VarDesc.VarType.READER \ + or tensor_desc.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ + or tensor_desc.type == core.VarDesc.VarType.STEP_SCOPES: return False tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph( tensor_node) @@ -185,7 +202,7 @@ class Completer: op_dist_attr = dist_op.dist_attr if fwd: for tensor_node in op_node.inputs: - if tensor_node.var() is not None: + if tensor_node.is_var() and tensor_node.var() is not None: if tensor_node.var().type() == core.VarDesc.VarType.READER: continue tensor_desc = tensor_node.var() @@ -208,19 +225,19 @@ class Completer: # Find the most compatible implemenetations from the distributed operator op_dist_impl = find_best_compatible_distributed_operator_impl( dist_op, fwd=True) - assert op_dist_impl is not None, "Cannot find the dist op implementation." - dim_changed = op_dist_impl.update_dims_mapping(dist_op) - if dim_changed: - changed = True - if op_dist_impl.is_auto_compatible(dist_op): - if op_dist_impl.type == "elementwise": - op_dist_attr.impl_type = "default" - else: - op_dist_attr.impl_type = op_dist_impl.type - op_dist_attr.impl_idx = op_dist_impl.idx + if op_dist_impl is not None: + dim_changed = op_dist_impl.update_dims_mapping(dist_op) + if dim_changed: + changed = True + if op_dist_impl.is_auto_compatible(dist_op): + if op_dist_impl.type == "elementwise": + op_dist_attr.impl_type = "default" + else: + op_dist_attr.impl_type = op_dist_impl.type + op_dist_attr.impl_idx = op_dist_impl.idx else: for tensor_node in op_node.outputs: - if tensor_node.var() is not None: + if tensor_node.is_var() and tensor_node.var() is not None: if tensor_node.var().type() == core.VarDesc.VarType.READER: continue tensor_desc = tensor_node.var() @@ -243,61 +260,38 @@ class Completer: # Find the most compatible implemenetations from the distributed operator op_dist_impl = find_best_compatible_distributed_operator_impl( dist_op, fwd=False) - assert op_dist_impl is not None, "Cannot find the dist op implementation." 
- dim_changed = op_dist_impl.update_dims_mapping(dist_op) - if dim_changed: - changed = True - if op_dist_impl.is_auto_compatible(dist_op): - if op_dist_impl.type == "elementwise": - op_dist_attr.impl_type = "default" - else: - op_dist_attr.impl_type = op_dist_impl.type - op_dist_attr.impl_idx = op_dist_impl.idx + if op_dist_impl is not None: + dim_changed = op_dist_impl.update_dims_mapping(dist_op) + if dim_changed: + changed = True + if op_dist_impl.is_auto_compatible(dist_op): + if op_dist_impl.type == "elementwise": + op_dist_attr.impl_type = "default" + else: + op_dist_attr.impl_type = op_dist_impl.type + op_dist_attr.impl_idx = op_dist_impl.idx return changed - def _update_process_mesh(self): - def _find_nearset_node(nodes, idx): - for node in reversed(nodes[:idx]): - node_dist_attr = self._dist_context.get_dist_attr_for_graph( - node) - if node_dist_attr.process_mesh is not None: - return node - - total_reach_fix_point = False - while not total_reach_fix_point: - total_changed = False - for is_fwd in [True, False]: - all_nodes = self._dist_context.serial_ordered_nodes \ - if is_fwd else reversed(self._dist_context.serial_ordered_nodes) - reach_fix_point = False - while not reach_fix_point: - changed = False - for idx, node in enumerate(all_nodes): - nearest_node = _find_nearset_node( - self._dist_context.serial_ordered_nodes, idx) - if nearest_node is None: - continue - nearest_node_dis_attr = self._dist_context.get_dist_attr_for_graph( - nearest_node) - nearest_process_mesh = nearest_node_dis_attr.process_mesh - cur_node_dist_attr = self._dist_context.get_dist_attr_for_graph( - node) - cur_process_mesh = cur_node_dist_attr.process_mesh - compatible_process_mesh = compute_compatible_process_mesh( - [cur_process_mesh, nearest_process_mesh]) - if compatible_process_mesh is not None \ - and cur_process_mesh != compatible_process_mesh: - cur_node_dist_attr.process_mesh = compatible_process_mesh - changed = True - if changed: - reach_fix_point = False - total_changed = True - else: - reach_fix_point = True - if total_changed: - total_reach_fix_point = False - else: - total_reach_fix_point = True + def _update_dims_mapping_between_graphs(self): + changed = False + for parent_node, child_node in self._node_pairs_between_graphs: + parent_node_dist_attr = self._dist_context.get_dist_attr_for_graph( + parent_node) + child_node_dist_attr = self._dist_context.get_dist_attr_for_graph( + child_node) + parent_node_dims_mapping = parent_node_dist_attr.dims_mapping + child_node_dims_mapping = child_node_dist_attr.dims_mapping + compatible_dims_mapping = compute_compatible_dims_mapping( + [parent_node_dims_mapping, child_node_dims_mapping]) + if (compatible_dims_mapping is not None) \ + and (compatible_dims_mapping != parent_node_dims_mapping): + parent_node_dist_attr.dims_mapping = compatible_dims_mapping + changed = True + if (compatible_dims_mapping is not None) \ + and (compatible_dims_mapping != child_node_dims_mapping): + parent_node_dist_attr.dims_mapping = compatible_dims_mapping + changed = True + return changed def _update_dims_mapping(self): # Complete dims_mapping for each node @@ -318,11 +312,314 @@ class Completer: node, fwd=is_fwd) if op_changed: changed = True + graph_changed = self._update_dims_mapping_between_graphs() + if graph_changed: + changed = True if changed: reach_fix_point = False else: reach_fix_point = True + def _update_process_mesh_by_nearest(self, op_node, nearest_op_node): + op_dist_attr = self._dist_context.get_dist_attr_for_graph(op_node) + # Set the process mesh of 
the op node by its nearest op node + if not op_dist_attr.is_annotated("process_mesh"): + process_mesh = op_dist_attr.process_mesh + nearest_op_dis_attr = self._dist_context.get_dist_attr_for_graph( + nearest_op_node) + nearest_process_mesh = nearest_op_dis_attr.process_mesh + compatible_process_mesh = compute_compatible_process_mesh( + [process_mesh, nearest_process_mesh]) + if compatible_process_mesh is not None \ + and process_mesh != compatible_process_mesh: + op_dist_attr.process_mesh = compatible_process_mesh + # Skip the process_mesh setting of inputs and outputs of while_op + if op_dist_attr.op_type == "while": + return + # Set the process mesh of the op node's leaf-inputs + for tensor_node in op_node.inputs: + if tensor_node.is_var() and tensor_node.var() is not None: + tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph( + tensor_node) + if tensor_dist_attr.is_annotated("process_mesh"): + continue + # Skip the non-leaf var node + if len(tensor_node.inputs) != 0: + continue + compatible_process_mesh = compute_compatible_process_mesh( + [tensor_dist_attr.process_mesh, op_dist_attr.process_mesh]) + if compatible_process_mesh is not None \ + and tensor_dist_attr.process_mesh != compatible_process_mesh: + tensor_dist_attr.process_mesh = compatible_process_mesh + # Set the process mesh of the op node's outputs + for tensor_node in op_node.outputs: + if tensor_node.is_var() and tensor_node.var() is not None: + tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph( + tensor_node) + if tensor_dist_attr.is_annotated("process_mesh"): + continue + compatible_process_mesh = compute_compatible_process_mesh( + [tensor_dist_attr.process_mesh, op_dist_attr.process_mesh]) + if compatible_process_mesh is not None \ + and tensor_dist_attr.process_mesh != compatible_process_mesh: + tensor_dist_attr.process_mesh = compatible_process_mesh + + def _update_process_mesh_for_specials(self): + def _find_nearest_tensor_node_before(nodes, idx, var_name): + for node in reversed(nodes[:idx]): + if node.is_var() and node.var() is not None \ + and node.var().name() == var_name: + return node + + def _find_nearest_tensor_node_after(nodes, idx, var_name): + for node in nodes[idx + 1:]: + if node.is_var() and node.var() is not None \ + and node.var().name() == var_name: + return node + + def _find_nodes_related_to_cond(source_node): + related_nodes = [] + visited = set() + frontier = list() + frontier.append(source_node) + # BFS + while len(frontier) != 0: + cur = frontier[0] + frontier = frontier[1:] + if _node_id(cur) in visited: + continue + # TODO: need more restrictions + for node in cur.inputs: + if node.is_var() and node.var() is not None: + if node.var().type() != core.VarDesc.VarType.READER \ + and len(node.var().shape()) == 1: + frontier.append(node) + related_nodes.append(node) + if node.is_op() and node.op() is not None: + flag = True + if node.op().type() == "create_py_reader" \ + or node.op().type() == "create_double_buffer_reader" \ + or node.op().type() == "read": + flag = False + for tensor_node in node.inputs: + if tensor_node.is_var() and tensor_node.var( + ) is not None: + if tensor_node.var().type() == core.VarDesc.VarType.READER \ + or len(tensor_node.var().shape()) != 1: + flag = False + break + for tensor_node in node.outputs: + if tensor_node.is_var() and tensor_node.var( + ) is not None: + if tensor_node.var().type() == core.VarDesc.VarType.READER \ + or len(tensor_node.var().shape()) != 1: + flag = False + break + if flag: + frontier.append(node) + 
related_nodes.append(node) + visited.add(_node_id(cur)) + return related_nodes + + # Amend the process meshes related to while_op + for while_op_node, while_op_node_idx in self._while_op_nodes.values(): + sub_graph_id = while_op_node.op()._block_attr_id("sub_block") + sub_graph = self._dist_context._serial_graph.get_sub_graph( + sub_graph_id) + sub_graph_nodes = list(sub_graph.all_nodes()) + while_dist_op = self._dist_context.get_dist_op_for_graph( + while_op_node) + while_op_dist_attr = while_dist_op.dist_attr + + # Step 1: set the process mesh of while_op to the merged process mesh of its subblock + merged_process_mesh = while_op_dist_attr.process_mesh + for node in sub_graph_nodes: + if (node.is_var() and node.var() is not None) \ + or (node.is_op() and node.op() is not None): + dist_attr = self._dist_context.get_dist_attr_for_graph(node) + merged_process_mesh = merge_process_mesh_two( + merged_process_mesh, dist_attr.process_mesh) + while_op_dist_attr.process_mesh = merged_process_mesh + + # Step 2: set the related nodes of while_op to the process mesh of while_op + # Step 2.1: Find related nodes of cond var the graph of while_op + cond_tensor_related_nodes = [] + cond_tensor_name = while_op_node.op().input("Condition")[0] + cond_tensor_node = None + for node in while_op_node.inputs: + if node.is_var() and node.var() is not None \ + and node.var().name() == cond_tensor_name: + cond_tensor_node = node + cond_tensor_related_nodes.append(cond_tensor_node) + break + + cond_tensor_related_nodes.extend( + _find_nodes_related_to_cond(cond_tensor_node)) + + # Step 2.2: Find related nodes of cond var in the subgraph of while_op + cond_tensor_node = None + for node in reversed(sub_graph_nodes): + if node.is_var() and node.var() is not None \ + and node.var().name() == cond_tensor_name \ + and len(node.outputs) == 0: + cond_tensor_node = node + break + + cond_tensor_related_nodes.extend( + _find_nodes_related_to_cond(cond_tensor_node)) + # Step 2.3: Add the StepScops output of while_op + stepscopes_tensor_name = while_op_node.op().output("StepScopes")[0] + stepscopes_tensor_node = None + for output_node in while_op_node.outputs: + if output_node.is_var() and output_node.var() is not None \ + and output_node.var().name() == stepscopes_tensor_name: + stepscopes_tensor_node = output_node + cond_tensor_related_nodes.append(stepscopes_tensor_node) + # Step 2.4: Set the process meshes of all nodes related to cond var to the process mesh of while op + for node in cond_tensor_related_nodes: + tensor_dist_attr = self._dist_context.get_dist_attr_for_graph( + node) + tensor_dist_attr.process_mesh = merged_process_mesh + + # Step 3: set the process meshes of the inputs in while_op to the process meshes of the outside input nodes + while_op_inputs_dist_attrs = while_op_dist_attr.inputs_dist_attrs + for tensor_name, tensor_dist_attr in while_op_inputs_dist_attrs.items( + ): + nearest_tensor_node = _find_nearest_tensor_node_before( + self._dist_context.serial_ordered_nodes, while_op_node_idx, + tensor_name) + nearest_tensor_dist_attr = self._dist_context.get_dist_attr_for_graph( + nearest_tensor_node) + tensor_dist_attr.process_mesh = nearest_tensor_dist_attr.process_mesh + + # Step 4: set the process meshes of the outputs in while_op to the process meshes of the outside output nodes + while_op_outputs_dist_attrs = while_op_dist_attr.outputs_dist_attrs + for tensor_name, tensor_dist_attr in while_op_outputs_dist_attrs.items( + ): + nearest_tensor_node = _find_nearest_tensor_node_before( + 
self._dist_context.serial_ordered_nodes, while_op_node_idx, + tensor_name) + if nearest_tensor_node is None: + nearest_tensor_node = _find_nearest_tensor_node_after( + self._dist_context.serial_ordered_nodes, + while_op_node_idx, tensor_name) + nearest_tensor_dist_attr = self._dist_context.get_dist_attr_for_graph( + nearest_tensor_node) + tensor_dist_attr.process_mesh = nearest_tensor_dist_attr.process_mesh + + # Amend the process meshes related to array + for array_node_list in self._array_nodes.values(): + merged_process_mesh = None + for array_node in array_node_list: + dist_attr = self._dist_context.get_dist_attr_for_graph( + array_node) + merged_process_mesh = merge_process_mesh_two( + merged_process_mesh, dist_attr.process_mesh) + for array_node in array_node_list: + dist_attr = self._dist_context.get_dist_attr_for_graph( + array_node) + dist_attr.process_mesh = merged_process_mesh + + def _update_process_mesh(self): + ordered_op_nodes = self._dist_context._serial_ordered_op_nodes + + # Step 1: Set the annotated process meshes from tensors to the first ops using them + ordered_tensor_nodes = self._dist_context._serial_ordered_tensor_nodes + for tensor_node in ordered_tensor_nodes: + tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph( + tensor_node) + if not tensor_dist_attr.is_annotated("process_mesh"): + continue + first_op_node = None + for op_node in ordered_op_nodes: + # TODO: Need a better rule for the control flow ops. + # For now, do not set the process mesh of while_op from its inputs + if op_node.op().type() == "while": + continue + for input_tensor_node in op_node.inputs: + if _node_id(tensor_node) == _node_id(input_tensor_node): + first_op_node = op_node + break + if first_op_node is not None: + break + if first_op_node is None: + continue + op_dist_attr = self._dist_context.get_dist_attr_for_graph( + first_op_node) + if op_dist_attr is not None and not op_dist_attr.is_annotated( + "process_mesh"): + compatible_process_mesh = compute_compatible_process_mesh( + [tensor_dist_attr.process_mesh, op_dist_attr.process_mesh]) + if compatible_process_mesh is not None \ + and op_dist_attr.process_mesh != compatible_process_mesh: + op_dist_attr.process_mesh = compatible_process_mesh + + # Step 2: set the process meshes of ops with the nearest op before them + # Step 2.1: find the first op node which has the process mesh + idx_of_first_op_node_has_process_mesh = -1 + for idx, op_node in enumerate(ordered_op_nodes): + op_dist_attr = self._dist_context.get_dist_attr_for_graph(op_node) + if op_dist_attr.process_mesh is not None \ + and idx_of_first_op_node_has_process_mesh == -1: + idx_of_first_op_node_has_process_mesh = idx + # Reuse the following method to set the related tensors for same op node + self._update_process_mesh_by_nearest(op_node, op_node) + # Step 2.2: set the process meshes of ops by the nearest op node after the first op node + if idx_of_first_op_node_has_process_mesh + 1 > len(ordered_op_nodes): + return None + for idx, op_node in enumerate(ordered_op_nodes[ + idx_of_first_op_node_has_process_mesh + 1:]): + original_idx = idx_of_first_op_node_has_process_mesh + +idx + 1 + nearest_op_node = ordered_op_nodes[original_idx - 1] + nearest_op_dist_attr = self._dist_context.get_dist_attr_for_graph( + nearest_op_node) + op_dist_attr = self._dist_context.get_dist_attr_for_graph(op_node) + assert nearest_op_dist_attr.process_mesh is not None + self._update_process_mesh_by_nearest(op_node, nearest_op_node) + # Step 2.3: set the process meshes of ops by the 
nearest op node before the first op node + nearest_op_node = ordered_op_nodes[ + idx_of_first_op_node_has_process_mesh] + for op_node in ordered_op_nodes[:idx_of_first_op_node_has_process_mesh]: + self._update_process_mesh_by_nearest(op_node, nearest_op_node) + + # Step 3: adjust the process meshes for special ops + self._update_process_mesh_for_specials() + + def _prepare(self): + self._while_op_nodes = {} + self._array_nodes = {} + self._node_pairs_between_graphs = [] + all_nodes = self._dist_context.serial_ordered_nodes + for idx, node in enumerate(all_nodes): + if node.is_op(): + if node.op().type() == "while": + self._while_op_nodes[_node_id(node)] = (node, idx) + if node.op().type() == "read_from_array": + array_var_name = node.op().input("X")[0] + if self._array_nodes.get(array_var_name, None) is None: + self._array_nodes[array_var_name] = [] + self._array_nodes[array_var_name].append(node) + if node.op().type() == "write_to_array": + array_var_name = node.op().output("Out")[0] + if self._array_nodes.get(array_var_name, None) is None: + self._array_nodes[array_var_name] = [] + self._array_nodes[array_var_name].append(node) + self._array_nodes[array_var_name].append(node.outputs[0]) + if node.is_var() and node.var() is not None: + if node.node.graph_id() != 0: + for before_node in reversed(all_nodes[:idx]): + if before_node.is_var() and before_node.var() is not None \ + and before_node.node.graph_id() == node.node.graph_id() - 1 \ + and before_node.var().name() == node.var().name(): + self._node_pairs_between_graphs.append( + (before_node, node)) + for after_node in all_nodes[idx + 1:]: + if after_node.is_var() and after_node.var() is not None \ + and after_node.node.graph_id() == node.node.graph_id() - 1 \ + and after_node.var().name() == node.var().name(): + self._node_pairs_between_graphs.append( + (after_node, node)) + def complete_forward_annotation(self, serial_main_program): """ Complete annotation for the partial annotated serial_main_program. Arguments: @@ -336,24 +633,24 @@ class Completer: # Initialize distributed attributes for all var and op node in serial_main_program self._dist_context.init_dist_attr_for_program() + # print_program_with_dist_attr(serial_main_program, self._dist_context) # Initialize distributed attributes for all var and op node in graph self._dist_context.init_dist_attr_for_graph() + self._prepare() + self._update_process_mesh() - # Complete dims_mapping for each node self._update_dims_mapping() # Copy the corresponding distributed attribute from graph to serial_main_program self._dist_context.copy_dist_attr_from_graph_to_program() self._dist_context.clear_dist_info_for_graph() - # print_serial_main_program_with_dist_attr(serial_main_program, self._dist_context) # Do the validation check and amend some completion self._dist_context.amend_dist_attr_for_program() - # print_serial_main_program_with_dist_attr(serial_main_program, self._dist_context) self._dist_context.validate_dist_attr_for_program() return serial_main_program diff --git a/python/paddle/distributed/auto_parallel/converter.py b/python/paddle/distributed/auto_parallel/converter.py new file mode 100644 index 0000000000000000000000000000000000000000..d88f9fe7501b56be255448a412fdcc6ec56cd13b --- /dev/null +++ b/python/paddle/distributed/auto_parallel/converter.py @@ -0,0 +1,455 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import warnings +import logging +import numpy as np +from ..utils import get_logger + + +class Converter(object): + """ + Converter is a class object for auto parallel to convert tensors from + one parallel strategy to another one. Tensors will merge and slice value + with their strategy when strategies are different. + """ + + def __init__(self, tensors_dict, pre_strategy, cur_strategy): + """ + Args: + tensors_dict(dict): tensors' value of all ranks that to be converted. + key is tensor's name(str), value is all ranks' data(list(numpy.ndarray)) + pre_strategy(dict): tensors' distributed attribute of last training process. + key is tensor's name(str), value is tensor's distributed attribute in last + training process. + cur_strategy(dict): tensors' distributed attribute of current rank. + key is tensor's name(str), value is tensor's distributed attribute in current + rank. + """ + self._tensors_dict = self._check_tensor_dict(tensors_dict) + self._pre_strategy = self._check_pre_strategy(pre_strategy) + self._cur_strategy = self._check_cur_strategy(cur_strategy) + self._logger = get_logger(logging.INFO) + + def _check_tensor_dict(self, tensors_dict): + if not tensors_dict: + raise ValueError("'tensors_dict' is None, " + "the tensors to be converted cannot be None.") + if not isinstance(tensors_dict, dict): + raise TypeError( + "The type of 'tensors_dict' should be 'dict', but got '{}'.". + format(str(type(tensors_dict)))) + return tensors_dict + + def _check_pre_strategy(self, pre_strategy): + if not pre_strategy: + raise ValueError("'pre_strategy' is None, " + "there are not tensors in pre process.") + if not isinstance(pre_strategy, dict): + raise TypeError("The type of 'pre_strategy' should be 'dict', " + "but got '{}'.".format(str(type(pre_strategy)))) + return pre_strategy + + def _check_cur_strategy(self, cur_strategy): + if not cur_strategy: + warnings.warn("'cur_strategy' is None, " + "there are not tensors in cur process") + if not isinstance(cur_strategy, dict): + raise TypeError("The type of 'cur_strategy' should be 'dict', " + "but got '{}'.".format(str(type(cur_strategy)))) + return cur_strategy + + def convert(self, strict=True): + """ + Convert tensors + + Args: + strict(bool): whether to strict convert tensor with tensor's name. If False, it will + convert tensors by prefix matching. Otherwise, tensors will be converted with + their name strictly. + + Returns: + converted tensors(dict) + + Examples: + .. 
code-block:: python + + import numpy as np + complete_tensors = np.arange(4).reshape([2, 2]) + partitial_tensors = np.split(complete_tensors, 2, axis=0) + name = "tmp_0" + tensors_dict = {name: partitial_tensors} + strategy_1 = { + name: { + "process_shape": [2], + "process_group": [0, 1], + "dims_mapping": [0, -1] + } + } + strategy_2 = { + name: { + "process_shape": [2], + "process_group": [0, 1], + "dims_mapping": [-1, -1] + } + } + converter = Converter(tensors_dict, strategy_1, strategy_2) + result = converter.convert() + # the result's value is equal to `complete_tensors` + """ + tensors_dict = {} + # the name which is in cur_process but not in pre_process + tensor_not_in_pre = [] + # the name which is in pre_process but not in cur_process + tensor_not_in_cur = [] + # the name which is in strategy but not in ckpt files + tensor_not_in_ckpt = [] + self._logger.info("Start to convert tensors.") + for tensor_name in self._cur_strategy: + if tensor_name not in self._pre_strategy: + tensor_not_in_pre.append(tensor_name) + continue + if tensor_name not in self._tensors_dict: + tensor_not_in_ckpt.append(tensor_name) + continue + self._pre_name = tensor_name + self._cur_name = tensor_name + tensor_list = self._tensors_dict[tensor_name] + pre_dist_attr = self._pre_strategy[tensor_name] + cur_dist_attr = self._cur_strategy[tensor_name] + try: + tensors_dict[tensor_name] = Converter.merge_and_slice( + tensor_list, pre_dist_attr, cur_dist_attr) + except ValueError as err: + raise ValueError("Fail to convert tensor '{}'. " + .format(str(tensor_name)) + str(err)) + + for tensor_name in self._pre_strategy: + if tensor_name not in self._cur_strategy: + tensor_not_in_cur.append(tensor_name) + + if not strict: + tensors_dict, tensor_match_with_pre, tensor_match_with_cur = self.convert_with_prefix_match( + tensors_dict, tensor_not_in_pre, tensor_not_in_cur) + else: + tensors_dict, tensor_match_with_pre, tensor_match_with_cur = tensors_dict, [], [] + + tensor_not_in_pre = set(tensor_not_in_pre) - set(tensor_match_with_pre) + tensor_not_in_cur = set(tensor_not_in_cur) - set(tensor_match_with_cur) + if tensor_not_in_pre: + warnings.warn( + "tensors [{}] are not found in last training strategy." + .format(str(tensor_not_in_pre))) + if tensor_not_in_cur: + warnings.warn( + "tensors [{}] are not found in current training strategy." + .format(str(tensor_not_in_cur))) + if tensor_not_in_ckpt: + warnings.warn( + "tensors [{}] are found in pre_strategy, but are not found" + "in checkpoint files, please check your checkpoint files." 
+ .format(str(tensor_not_in_ckpt))) + + return tensors_dict + + def convert_with_prefix_match(self, tensors_dict, tensor_not_in_pre, + tensor_not_in_cur): + # the name which in cur_process and can match with pre_process + tensor_match_with_pre = [] + # the name which in pre_process and can match with cur_process + tensor_match_with_cur = [] + for cur_name in tensor_not_in_pre: + prefix_name = cur_name + while prefix_name.find("_") != -1: + prefix_name = prefix_name[:prefix_name.rfind("_")] + for pre_name in tensor_not_in_cur: + if prefix_name in pre_name: + # 'cur_name' of cur_process can match with 'pre_name' of pre_process + self._pre_name = pre_name + self._cur_name = cur_name + pre_tensor_list = self._tensors_dict[pre_name] + pre_dist_attr = self._pre_strategy[pre_name] + cur_dist_attr = self._cur_strategy[cur_name] + try: + tensors_dict[cur_name] = Converter.merge_and_slice( + pre_tensor_list, pre_dist_attr, cur_dist_attr) + except ValueError as err: + raise ValueError( + "Fail to convert tensor '{}' by '{}'. ".format( + str(cur_name), str(pre_name)) + str(err)) + self._logger.info( + "tensor [{}] is matched with tensor [{}]".format( + cur_name, pre_name)) + tensor_match_with_pre.append(cur_name) + tensor_match_with_cur.append(pre_name) + break + break + + return tensors_dict, tensor_match_with_pre, tensor_match_with_cur + + @staticmethod + def merge_and_slice(tensor_list, pre_dist_attr, cur_dist_attr): + """ + Merge tensors with previous dist_attr and slice tensors with current dist_attr + + Returns: + tensor(numpy.narray): a tensor's value of current rank. + """ + assert isinstance(tensor_list, list) + assert all(isinstance(p, np.ndarray) for p in tensor_list) + + if pre_dist_attr == cur_dist_attr: + # skip merge and slice tensor + rank_id = paddle.distributed.get_rank() + index = cur_dist_attr["process_group"].index(rank_id) + tensor = tensor_list[index] + else: + pre_dims_mapping = pre_dist_attr["dims_mapping"] + cur_dims_mapping = cur_dist_attr["dims_mapping"] + if len(set(pre_dims_mapping)) > 1 or -1 not in pre_dims_mapping: + # merge tensor + tensor = Converter.merge_with_dist_attr(tensor_list, + pre_dist_attr) + else: + # skip merge tensor + tensor = tensor_list[0] + + if len(set(cur_dims_mapping)) > 1 or -1 not in cur_dims_mapping: + # slice tensor + tensor = Converter.slice_with_dist_attr(tensor, cur_dist_attr) + + return tensor + + @staticmethod + def merge_with_dist_attr(tensor_list, dist_attr): + """ Merge tensor with distributed attribute """ + from .reshard import _compute_complete_shape, _compute_partition_index + + dims_mapping = dist_attr["dims_mapping"] + process_shape = dist_attr["process_shape"] + process_group = dist_attr["process_group"] + # get the complete shape of the tensor + complete_shape = _compute_complete_shape(tensor_list[0].shape, + process_shape, dims_mapping) + # merge the tensor with dist_attr + partition_tensor_list = [] + merged_partiton = [] + for process in process_group: + partition_index = _compute_partition_index( + process, complete_shape, dims_mapping, process_shape, + process_group) + index = process_group.index(process) + if partition_index not in merged_partiton: + merged_partiton.append(partition_index) + Converter.merge(partition_tensor_list, tensor_list[index], + partition_index, complete_shape) + + if len(partition_tensor_list) != 1: + raise ValueError("Fail to merge tensor with dist_attr '{}'.".format( + str(dist_attr))) + complete_tensor = partition_tensor_list[0][0] + return complete_tensor + + @staticmethod + def 
slice_with_dist_attr(tensor, dist_attr): + """ Slice tensor with distributed attribute """ + dims_mapping = dist_attr["dims_mapping"] + process_shape = dist_attr["process_shape"] + process_group = dist_attr["process_group"] + # slice the tensor with dist_attr + partition_index_list = Converter._get_split_indices( + tensor.shape, dims_mapping, process_shape, process_group) + sliced_tensor_list = Converter.split(tensor, partition_index_list, + len(partition_index_list)) + # get the current tensor's index in sliced_tensor_list + rank_id = paddle.distributed.get_rank() + sliced_tensor_index = Converter._get_sliced_index( + rank_id, tensor.shape, dims_mapping, process_shape, process_group) + if sliced_tensor_index not in range(len(sliced_tensor_list)): + raise ValueError("Fail to slice tensor with dist_attr '{}'.".format( + str(dist_attr))) + sliced_tensor = sliced_tensor_list[sliced_tensor_index] + return sliced_tensor + + @staticmethod + def merge(partition_tensor_list, tensor, partition_index, complete_shape): + """ + Merge partitial tensors to a complete. + + Returns: + None + + Examples: + .. code-block:: python + + import numpy as np + partition_tensor_list = [(np.array([[[1.11, 1.12]]]), [[0,1],[0,1],[0,2]])] + tensor = np.array([[[1.13, 1.14]]]) + partition_index = [[0,1],[0,1],[2,4]] + + _merge_tensor(partition_tensor_list, tensor, partition_index) + # partition_tensor_list: [(np.array([[[1.11, 1.12, 1.13, 1.14]]]), [[0,1],[0,1],[0,4]])] + """ + from .reshard import _compute_concat_info + + if len(partition_tensor_list) == 1: + is_complete_data = True + for idx, item in enumerate(partition_tensor_list[0][1]): + if item[0] != 0 or item[1] != complete_shape[idx]: + is_complete_data = False + break + if is_complete_data: + return + + if not partition_tensor_list: + partition_tensor_list.append((tensor, partition_index)) + else: + i = 0 + while i < len(partition_tensor_list): + concat_axis, first_order, new_partition = _compute_concat_info( + partition_tensor_list[i][1], partition_index) + if concat_axis != -1: + if first_order == 0: + new_tensor = np.concatenate( + (partition_tensor_list[i][0], tensor), + axis=concat_axis) + else: + new_tensor = np.concatenate( + (tensor, partition_tensor_list[i][0]), + axis=concat_axis) + + partition_tensor_list.pop(i) + Converter.merge(partition_tensor_list, new_tensor, + new_partition, complete_shape) + break + i += 1 + + @staticmethod + def split(complete_tensor, partition_index_list, length): + """ + Slice a complete tensor. + + Returns: + sliced_tensor_list(list): sliced tensors with 'partition_index_list' + + Examples: + .. code-block:: python + + import numpy as np + complete_tensor = np.array([[[1.11, 1.12, 1.13, 1.14, 1.15, 1.16]]]) + rank = 2 + complete_shape = [1, 1, 6] + dims_mapping = [-1, -1, 0] + process_shape = [3] + process_group = [0, 1, 2] + + sliced_tensor_list = split(complete_tensor, [[], [], [2, 4]], 3) + # [array([[[1.11, 1.12]]]), array([[[1.13, 1.14]]]), array([[[1.15, 1.16]]])] + """ + sliced_tensor_list = [] + axis = len(complete_tensor.shape) - length + sliced_tensor = np.split( + complete_tensor, partition_index_list[axis], axis=axis) + if length == 1: + return sliced_tensor + for tensor in sliced_tensor: + sliced_tensor_list.extend( + Converter.split(tensor, partition_index_list, length - 1)) + return sliced_tensor_list + + @staticmethod + def _get_split_indices(complete_shape, dims_mapping, process_shape, + process_group): + """ + Get split indices of every dimension. 
+ + Returns: + split_indices_list(list): the split indices of every dimension of the tensor + + Examples: + .. code-block:: python + + import numpy as np + complete_tensor = np.array([[[1.11, 1.12, 1.13, 1.14, 1.15, 1.16]]]) + complete_shape = [1, 1, 6] + dims_mapping = [-1, -1, 0] + process_shape = [3] + process_group = [0, 1, 2] + + index = _get_split_indices(complete_shape, dims_mapping, process_shape, process_group) + # index: [[], [], [2, 4]] + """ + from .reshard import _compute_partition_index + + split_indices_list = [] + for process in process_group: + partition_index = _compute_partition_index( + process, complete_shape, dims_mapping, process_shape, + process_group) + if split_indices_list: + for dim in range(len(partition_index)): + split_indices_list[dim].extend(partition_index[dim]) + else: + split_indices_list = partition_index + split_indices_list = list( + map(lambda x, y: list(set(x) - set([y]) - set([0])), + split_indices_list, complete_shape)) + split_indices_list = [sorted(x) for x in split_indices_list] + return split_indices_list + + @staticmethod + def _get_sliced_index(rank_id, complete_shape, dims_mapping, process_shape, + process_group): + """ + Get sliced_tensor's index of current rank in all sliced tensors list. + + Returns: + sliced_tensor_index(int): the index of sliced tensor in sliced_tensor_list + + Examples: + .. code-block:: python + + import numpy as np + complete_tensor = np.array([[[1.11, 1.12, 1.13, 1.14, 1.15, 1.16]]]) + rank = 2 + complete_shape = [1, 1, 6] + dims_mapping = [-1, -1, 0] + process_shape = [3] + process_group = [0, 1, 2] + + slice_tensor = _slice_tensor(complete_tensor, [[], [], [2, 4]], 3) + # slice_tensor: + # [array([[[1.11, 1.12]]]), array([[[1.13, 1.14]]]), array([[[1.15, 1.16]]])] + + index = _get_sliced_index(rank, complete_shape, dims_mapping + process_shape, process_group) + # index: 2 + """ + from .reshard import _compute_partition_index + + partition_index = _compute_partition_index( + rank_id, complete_shape, dims_mapping, process_shape, process_group) + sliced_index = 0 + for i, shape in enumerate(complete_shape): + if dims_mapping[i] == -1: + slice_shape = shape + else: + slice_shape = shape // process_shape[dims_mapping[i]] + if shape == 1: + index = 0 + else: + index = (partition_index[i][0] + 1) // slice_shape + sliced_index = sliced_index * (shape // slice_shape) + index + return sliced_index diff --git a/python/paddle/distributed/auto_parallel/dist_attribute.py b/python/paddle/distributed/auto_parallel/dist_attribute.py index b27cd7a37c95626584194ae7bd619ab16a0e5ea7..8ec702ffcb0b65af96833b4d4d2be1c8ff08d788 100644 --- a/python/paddle/distributed/auto_parallel/dist_attribute.py +++ b/python/paddle/distributed/auto_parallel/dist_attribute.py @@ -175,6 +175,7 @@ class TensorDistributedAttribute: class OperatorDistributedAttribute: def __init__(self): self._process_mesh = None + self._op_type = None self._impl_type = None self._impl_idx = None self._inputs_dist_attrs = {} @@ -194,11 +195,23 @@ class OperatorDistributedAttribute: if isinstance(process_mesh, list): process_mesh = ProcessMesh(process_mesh) self._process_mesh = copy.deepcopy(process_mesh) + # In while op, the proess mesh is not shared by all inputs and outputs + if self._op_type == "while": + return None for dist_attr in self._inputs_dist_attrs.values(): dist_attr.process_mesh = process_mesh for dist_attr in self._outputs_dist_attrs.values(): dist_attr.process_mesh = process_mesh + @property + def op_type(self): + return self._op_type + + @op_type.setter + 
def op_type(self, op_type): + if op_type is not None: + self._op_type = op_type + @property def impl_type(self): return self._impl_type @@ -326,6 +339,8 @@ class OperatorDistributedAttribute: assert False, "No setter for {} in args {}.".format( key, dist_attr) # Make sure proscess_meshes in dist op be same + if self.op_type == "while": + return None process_meshes = [] process_meshes.append(self.process_mesh) for tensor_dist_attr in self.inputs_dist_attrs.values(): diff --git a/python/paddle/distributed/auto_parallel/dist_context.py b/python/paddle/distributed/auto_parallel/dist_context.py index 573f23fdca519ae1da10d62ef7eb2da6238805f3..2807c46540ab1e52f7490c850faa34eac00c04db 100644 --- a/python/paddle/distributed/auto_parallel/dist_context.py +++ b/python/paddle/distributed/auto_parallel/dist_context.py @@ -15,6 +15,7 @@ import copy from collections import defaultdict from paddle.fluid import framework +from paddle.fluid.framework import get_flags, set_flags from paddle.fluid import core from .dist_attribute import TensorDistributedAttribute from .dist_attribute import OperatorDistributedAttribute @@ -39,6 +40,10 @@ def set_default_distributed_context(dist_context): _g_default_distributed_context = dist_context +def _node_id(node): + return (node.node.graph_id(), node.node.id()) + + class DistributedContext: """ DistributedContext is used to collect related distributed information for program and graph. @@ -146,7 +151,7 @@ class DistributedContext: return None def get_dist_tensor_for_graph(self, serial_tensor_node): - serial_tensor_node_id = serial_tensor_node.id() + serial_tensor_node_id = _node_id(serial_tensor_node) return self._dist_tensors_for_graph.get(serial_tensor_node_id, None) def get_dist_op_for_program(self, serial_op): @@ -168,7 +173,7 @@ class DistributedContext: del self._dist_ops_for_program[serial_tensor_id] def get_dist_op_for_graph(self, serial_op_node): - serial_op_node_id = serial_op_node.id() + serial_op_node_id = _node_id(serial_op_node) return self._dist_ops_for_graph.get(serial_op_node_id, None) def get_tensor_dist_attr_for_program(self, serial_tensor): @@ -197,7 +202,7 @@ class DistributedContext: self.add_dist_tensor_for_program(dist_tensor) def get_tensor_dist_attr_for_graph(self, serial_tensor_node): - serial_tensor_node_id = serial_tensor_node.id() + serial_tensor_node_id = _node_id(serial_tensor_node) dist_tensor = self._dist_tensors_for_graph.get(serial_tensor_node_id, None) if dist_tensor: @@ -242,7 +247,7 @@ class DistributedContext: self.add_dist_op_for_program(dist_op) def get_op_dist_attr_for_graph(self, serial_op_node): - serial_op_node_id = serial_op_node.id() + serial_op_node_id = _node_id(serial_op_node) dist_op = self._dist_ops_for_graph.get(serial_op_node_id, None) if dist_op: return dist_op.dist_attr @@ -262,7 +267,7 @@ class DistributedContext: def get_dist_attr_for_graph(self, serial_node): if serial_node.is_var() and serial_node.var() is not None: - serial_tensor_node_id = serial_node.id() + serial_tensor_node_id = _node_id(serial_node) dist_tensor = self._dist_tensors_for_graph.get( serial_tensor_node_id, None) if dist_tensor: @@ -270,7 +275,7 @@ class DistributedContext: else: return None if serial_node.is_op() and serial_node.op() is not None: - serial_op_node_id = serial_node.id() + serial_op_node_id = _node_id(serial_node) dist_op = self._dist_ops_for_graph.get(serial_op_node_id, None) if dist_op: return dist_op.dist_attr @@ -311,40 +316,69 @@ class DistributedContext: def order_nodes_by_program_order(self): def _contains(nodes, 
target_node): for node in nodes: - if node.id() == target_node.id(): + if _node_id(node) == _node_id(target_node): return True return False - ordered_tensor_nodes = [] - ordered_op_nodes = [] - all_nodes = self._serial_graph.all_nodes() + serial_ordered_tensor_nodes = [] + serial_ordered_op_nodes = [] + all_nodes = [] + # for idx, graph in enumerate(self._serial_graph.all_sub_graphs()): + for idx, graph in enumerate(self._serial_graph.all_sub_graphs()): + for node in graph.all_nodes(): + all_nodes.append(node) for node in all_nodes: if node.is_var() and node.var() is not None: - ordered_tensor_nodes.append(node) + serial_ordered_tensor_nodes.append(node) if node.is_op() and node.op() is not None: - ordered_op_nodes.append(node) - ordered_tensor_nodes.sort(key=lambda node: node.node.original_desc_id()) - ordered_op_nodes.sort(key=lambda node: node.node.original_desc_id()) - for op_node in ordered_op_nodes: + serial_ordered_op_nodes.append(node) + serial_ordered_tensor_nodes.sort( + key=lambda node: node.node.original_desc_id()) + serial_ordered_op_nodes.sort( + key=lambda node: node.node.original_desc_id()) + num_nodes_before = len(serial_ordered_tensor_nodes) + len( + serial_ordered_op_nodes) + + new_serial_ordered_tensor_nodes = [] + new_serial_ordered_op_nodes = [] + for op_node in serial_ordered_op_nodes: tensor_nodes = [] for tensor_node in op_node.inputs: if tensor_node.is_var() \ and tensor_node.var() is not None \ and not _contains(self._serial_ordered_nodes, tensor_node): tensor_nodes.append(tensor_node) + new_serial_ordered_tensor_nodes.append(tensor_node) tensor_nodes.sort(key=lambda node: node.node.original_desc_id()) self._serial_ordered_nodes.extend(tensor_nodes) self._serial_ordered_nodes.append(op_node) + new_serial_ordered_op_nodes.append(op_node) tensor_nodes = [] for tensor_node in op_node.outputs: if tensor_node.is_var() \ and tensor_node.var() is not None \ and not _contains(self._serial_ordered_nodes, tensor_node): tensor_nodes.append(tensor_node) + new_serial_ordered_tensor_nodes.append(tensor_node) + tensor_nodes.sort(key=lambda node: node.node.original_desc_id()) self._serial_ordered_nodes.extend(tensor_nodes) - num_nodes_before = len(ordered_tensor_nodes) + len(ordered_op_nodes) - assert len(self._serial_ordered_nodes) == num_nodes_before, \ - "The number of nodes before ordering is not the same after ordering." + new_serial_ordered_tensor_nodes.sort( + key=lambda node: node.node.original_desc_id()) + new_serial_ordered_op_nodes.sort( + key=lambda node: node.node.original_desc_id()) + self._serial_ordered_tensor_nodes = new_serial_ordered_tensor_nodes + self._serial_ordered_op_nodes = new_serial_ordered_op_nodes + assert len(self._serial_ordered_nodes) == len( + self._serial_ordered_tensor_nodes) + len( + self._serial_ordered_op_nodes) + self._serial_orphan_tensor_nodes = [] + for tensor_node in serial_ordered_tensor_nodes: + if not _contains(self._serial_ordered_tensor_nodes, tensor_node): + self._serial_orphan_tensor_nodes.append(tensor_node) + if len(self._serial_ordered_nodes) != num_nodes_before: + print( + "WARNING: there are some orphan tensors or ops which are not used in the execution." 
+ ) def init_dist_attr_for_graph(self): assert self._is_initialized_for_program, \ @@ -352,9 +386,9 @@ class DistributedContext: if self._is_initialized_for_graph: return # Convert program to graph + set_flags({"FLAGS_convert_all_blocks": True}) self._serial_graph = framework.IrGraph( core.Graph(self._serial_program.desc)) - all_nodes = self._serial_graph.all_nodes() self.order_nodes_by_program_order() for node in self.serial_ordered_nodes: if node.is_var() and node.var() is not None: @@ -365,10 +399,11 @@ class DistributedContext: if tensor_id == cur_tensor_id \ or tensor_id == cur_dist_tensor.serial_tensor.desc.original_id(): dist_tensor = cur_dist_tensor - self._node_id_to_tensor_id[node.id()] = cur_tensor_id + self._node_id_to_tensor_id[_node_id( + node)] = cur_tensor_id assert dist_tensor is not None, \ "Tensor must have a distributed tensor after the initialization for program." - serial_tensor_node_id = node.id() + serial_tensor_node_id = _node_id(node) new_dist_tensor = DistributedTensor(dist_tensor.serial_tensor, dist_tensor.dist_attr) self._dist_tensors_for_graph[ @@ -381,10 +416,10 @@ class DistributedContext: if op_id == cur_op_id \ or op_id == cur_dist_op.serial_op.desc.original_id(): dist_op = cur_dist_op - self._node_id_to_op_id[node.id()] = cur_op_id + self._node_id_to_op_id[_node_id(node)] = cur_op_id assert dist_op is not None, \ "Operator must have a distributed operator after the initialization for program." - serial_op_node_id = node.id() + serial_op_node_id = _node_id(node) new_dist_op = DistributedOperator(dist_op.serial_op, dist_op.dist_attr) self._dist_ops_for_graph[serial_op_node_id] = new_dist_op @@ -402,10 +437,11 @@ class DistributedContext: assert self._is_initialized_for_program and self._is_initialized_for_graph, \ "Both program and graph must be initialized." updated_tensors = {} - all_nodes = self._serial_graph.all_nodes() + # all_nodes = self._serial_graph.all_nodes() + all_nodes = self._serial_ordered_nodes for node in all_nodes: if node.is_var() and node.var() is not None: - tensor_id = self._node_id_to_tensor_id[node.id()] + tensor_id = self._node_id_to_tensor_id[_node_id(node)] updated = updated_tensors.get(tensor_id, False) # If a var has multiples var nodes in graph, only use the first one for now if not updated: @@ -416,16 +452,31 @@ class DistributedContext: dist_tensor_for_program.dist_attr = tensor_dist_attr_for_graph updated_tensors[tensor_id] = True if node.is_op() and node.op() is not None: - op_id = self._node_id_to_op_id[node.id()] + op_id = self._node_id_to_op_id[_node_id(node)] op_dist_attr_for_graph = self.get_op_dist_attr_for_graph(node) dist_op_for_program = self._dist_ops_for_program[op_id] dist_op_for_program.dist_attr = op_dist_attr_for_graph + # TODO: the completion algorithm will skip orphan tensors, + # here we just set there process_mesh to the first one. 
+ for orphan_node in self._serial_orphan_tensor_nodes: + serial_tensor_id = orphan_node.var().id() + dist_tensor = self._dist_tensors_for_program.get(serial_tensor_id, + None) + if dist_tensor: + dist_tensor.dist_attr.process_mesh = self._process_meshes[0] + else: + serial_tensor_id = orphan_node.var().original_id() + dist_tensor = self._dist_tensors_for_program.get( + serial_tensor_id, None) + dist_tensor.dist_attr.process_mesh = self._process_meshes[0] def amend_dist_attr_for_program(self): for dist_tensor in self._dist_tensors_for_program.values(): serial_tensor = dist_tensor.serial_tensor dist_attr = dist_tensor.dist_attr - if serial_tensor.type == core.VarDesc.VarType.READER: + if serial_tensor.type == core.VarDesc.VarType.READER \ + or serial_tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ + or serial_tensor.type == core.VarDesc.VarType.STEP_SCOPES: tensor_shape = [] else: tensor_shape = serial_tensor.shape @@ -446,6 +497,7 @@ class DistributedContext: tensor_shape = [] else: if dist_op.get_serial_input(arg_name).type == core.VarDesc.VarType.READER \ + or dist_op.get_serial_input(arg_name).type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ or dist_op.serial_op.type == "create_py_reader": tensor_shape = [] else: @@ -459,8 +511,9 @@ class DistributedContext: and process_mesh_shape[dims_mapping[i]] > tensor_shape[i]: dims_mapping[i] = -1 for arg_name in serial_op.output_arg_names: - if dist_op.get_serial_output( - arg_name).type == core.VarDesc.VarType.READER: + if dist_op.get_serial_output(arg_name).type == core.VarDesc.VarType.READER \ + or dist_op.get_serial_output(arg_name).type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ + or dist_op.get_serial_output(arg_name).type == core.VarDesc.VarType.STEP_SCOPES: tensor_shape = [] else: tensor_shape = dist_op.get_serial_output(arg_name).shape @@ -498,7 +551,8 @@ class DistributedContext: for k, v in self.__dict__.items(): if k == "_serial_program" or k == "_serial_graph" \ or k == "_dist_main_programs" or k == "_dist_startup_programs" \ - or k == "_serial_ordered_nodes": + or k == "_serial_ordered_nodes" or k == "_serial_ordered_tensor_nodes" \ + or k == "_serial_ordered_op_nodes": setattr(result, k, v) else: setattr(result, k, copy.deepcopy(v, memo)) diff --git a/python/paddle/distributed/auto_parallel/dist_op.py b/python/paddle/distributed/auto_parallel/dist_op.py index 67de298564afc8caddad90d228131f1795f5707e..a2c2748a8cea390003dfec857a252b7df3ee1b05 100644 --- a/python/paddle/distributed/auto_parallel/dist_op.py +++ b/python/paddle/distributed/auto_parallel/dist_op.py @@ -76,7 +76,8 @@ class DistributedOperator: if tensor is None: tensor_shape = [] else: - if tensor.type == core.VarDesc.VarType.READER: + if tensor.type == core.VarDesc.VarType.READER \ + or tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY: tensor_shape = [] else: tensor_shape = tensor.shape @@ -86,7 +87,9 @@ class DistributedOperator: tensor_dims_mapping) for tensor_name in self._serial_op.output_arg_names: tensor = self._serial_op.block._var_recursive(tensor_name) - if tensor.type == core.VarDesc.VarType.READER or tensor.type == core.VarDesc.VarType.STEP_SCOPES: + if tensor.type == core.VarDesc.VarType.READER \ + or tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ + or tensor.type == core.VarDesc.VarType.STEP_SCOPES: tensor_shape = [] else: tensor_shape = tensor.shape @@ -95,6 +98,8 @@ class DistributedOperator: tensor_dims_mapping = [-1 for _ in range(len(tensor_shape))] self._dist_attr.set_output_dims_mapping(tensor_name, tensor_dims_mapping) + if 
self._dist_attr.op_type is None: + self._dist_attr.op_type = self.serial_op.type if self._dist_attr.impl_type is None: self._dist_attr.impl_type = "default" if self._dist_attr.impl_idx is None: @@ -134,12 +139,16 @@ class DistributedOperator: return new_dist_attr def validate_dist_attr(self): - if "read" in self.serial_op.type: + if "read" in self.serial_op.type or "while" == self.serial_op.type: return True for name in self.serial_op.input_arg_names: input_dist_attr = self.dist_attr.get_input_dist_attr(name) dims_mapping = input_dist_attr.dims_mapping - shape = self.get_serial_input(name).shape + if self.get_serial_input( + name).type == core.VarDesc.VarType.LOD_TENSOR_ARRAY: + shape = [] + else: + shape = self.get_serial_input(name).shape if len(shape) != len(dims_mapping): return False for i in range(len(dims_mapping)): @@ -155,7 +164,11 @@ class DistributedOperator: for name in self.serial_op.output_arg_names: output_dist_attr = self.dist_attr.get_output_dist_attr(name) dims_mapping = output_dist_attr.dims_mapping - shape = self.get_serial_output(name).shape + if self.get_serial_output(name).type == core.VarDesc.VarType.LOD_TENSOR_ARRAY\ + or self.get_serial_output(name).type == core.VarDesc.VarType.STEP_SCOPES: + shape = [] + else: + shape = self.get_serial_output(name).shape if len(shape) != len(dims_mapping): return False for i in range(len(dims_mapping)): @@ -241,14 +254,14 @@ class DistributedModule: def __call__(self, *args, **kwargs): from .dist_context import get_default_distributed_context - main_prog = paddle.fluid.default_main_program() - main_block = main_prog.global_block() - op_size = len(main_block.ops) + default_prog = paddle.fluid.default_main_program() + cur_block = default_prog.current_block() + op_size = len(cur_block.ops) output = self._serial_module(*args, **kwargs) - new_op_size = len(main_block.ops) + new_op_size = len(cur_block.ops) default_dist_ctx = get_default_distributed_context() for idx in range(op_size, new_op_size): - op = main_block.ops[idx] + op = cur_block.ops[idx] dist_op = DistributedOperator(op, self._dist_attr) dist_op.dist_attr.mark_annotated_as(self._dist_attr) default_dist_ctx.add_dist_op_for_program(dist_op) diff --git a/python/paddle/distributed/auto_parallel/dist_tensor.py b/python/paddle/distributed/auto_parallel/dist_tensor.py index 5e3c852699ab6f8dcb92b386989338e5ca3d2c1f..a42ce863492b3511e0e7ddfaa3a04b67f57e1157 100644 --- a/python/paddle/distributed/auto_parallel/dist_tensor.py +++ b/python/paddle/distributed/auto_parallel/dist_tensor.py @@ -184,7 +184,9 @@ class DistributedTensor: def _init_default_dist_attr(self): if self._dist_attr.dims_mapping is None: - if self.serial_tensor.type == core.VarDesc.VarType.READER: + if self.serial_tensor.type == core.VarDesc.VarType.READER \ + or self.serial_tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ + or self.serial_tensor.type == core.VarDesc.VarType.STEP_SCOPES: tensor_shape = [] else: tensor_shape = self._serial_tensor.shape @@ -192,7 +194,9 @@ class DistributedTensor: self._dist_attr.dims_mapping = tensor_dims_mapping def validate_dist_attr(self): - if self.serial_tensor.type == core.VarDesc.VarType.READER: + if self.serial_tensor.type == core.VarDesc.VarType.READER \ + or self.serial_tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ + or self.serial_tensor.type == core.VarDesc.VarType.STEP_SCOPES: return True tensor_shape = self.serial_tensor.shape if len(tensor_shape) != len(self.dist_attr.dims_mapping): diff --git a/python/paddle/distributed/auto_parallel/engine.py 
b/python/paddle/distributed/auto_parallel/engine.py index 56beb8957415d3c3c401fdbf754cb17fc5e253a7..6bd1c5527a99e73ddcde1ada5f2a5a496c0d9933 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -259,7 +259,7 @@ class Engine: "train_" + name: val for name, val in logs.items() } - self._logger.info(logs) + self._logger.info(train_logs) def _train_step(self, data): logs = {} diff --git a/python/paddle/distributed/auto_parallel/operators/common.py b/python/paddle/distributed/auto_parallel/operators/common.py index 4b079e7b6b575a6bcfd372782529ccc2958cf5db..47f76353e465529f1d29a05852a952d151c76c93 100644 --- a/python/paddle/distributed/auto_parallel/operators/common.py +++ b/python/paddle/distributed/auto_parallel/operators/common.py @@ -17,7 +17,9 @@ from ..dist_attribute import OperatorDistributedAttribute _g_distributed_operator_impl_containers = {} -_g_elementwise_ops = ["elementwise_add", "gelu", "dropout", "cast"] +_g_elementwise_ops = [ + "elementwise_add", "gelu", "dropout", "cast", "gather", "concat" +] BACKWARD_ONLY_DIST_OPS = {'check_finite_and_unscale', 'update_loss_scaling'} diff --git a/python/paddle/distributed/auto_parallel/operators/dist_default.py b/python/paddle/distributed/auto_parallel/operators/dist_default.py index 4e977007261a73e9b24a051f84e6e30f2bf9d860..de6d018d60521564ebc98b8df03e4b1356b846c8 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_default.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_default.py @@ -55,9 +55,14 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): op_dist_attr = dist_op.dist_attr for arg_name in op_desc.input_arg_names(): serial_tensor = dist_op.get_serial_input(arg_name) - if serial_tensor.is_parameter: - continue dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) + if serial_tensor.is_parameter: + for mapping in dims_mapping: + if mapping != -1: + return False + # continue + # if len(dims_mapping) < 1: + # continue if len(dims_mapping) > 1: for mapping in dims_mapping[1:]: if mapping != -1: @@ -73,9 +78,14 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): xshape_arg_names = op_desc.output("XShape") for arg_name in op_desc.output_arg_names(): serial_tensor = dist_op.get_serial_output(arg_name) - if serial_tensor.is_parameter: - continue dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) + if serial_tensor.is_parameter: + for mapping in dims_mapping: + if mapping != -1: + return False + # continue + # if len(dims_mapping) < 1: + # continue if arg_name not in xshape_arg_names: if len(dims_mapping) > 1: for mapping in dims_mapping[1:]: @@ -104,7 +114,8 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): for mapping in dims_mapping[1:]: if mapping != -1: return False - batch_dim_mappings.append(dims_mapping[0]) + if len(dims_mapping) >= 1: + batch_dim_mappings.append(dims_mapping[0]) # Check output compatibility output_names = op_desc.output_names() @@ -121,7 +132,8 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): for mapping in dims_mapping[1:]: if mapping != -1: return False - batch_dim_mappings.append(dims_mapping[0]) + if len(dims_mapping) >= 1: + batch_dim_mappings.append(dims_mapping[0]) else: if dims_mapping[0] != -1: return False @@ -129,7 +141,8 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): for mapping in dims_mapping[2:]: if mapping != -1: return False - batch_dim_mappings.append(dims_mapping[1]) + if len(dims_mapping) >= 2: + batch_dim_mappings.append(dims_mapping[1]) 
# Check batch dim mapping compatibility if not all(batch_dim_mappings[0] == dim_mapping @@ -143,7 +156,9 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): op_desc = dist_op.serial_op.desc op_dist_attr = dist_op.dist_attr # The following statement will be replaced by a more elegent way - if op_desc.type() == "shape" or op_desc.type() == "slice": + if op_desc.type() == "shape" \ + or op_desc.type() == "slice" \ + or op_desc.type() == "while": return False output_names = op_desc.output_names() xshape_arg_names = [] @@ -155,17 +170,22 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): if serial_tensor.is_parameter: continue dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) - batch_dim_mappings.append(dims_mapping[0]) + if len(dims_mapping) >= 1: + batch_dim_mappings.append(dims_mapping[0]) for arg_name in op_desc.output_arg_names(): serial_tensor = dist_op.get_serial_output(arg_name) if serial_tensor.is_parameter: continue dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) if arg_name not in xshape_arg_names: - batch_dim_mappings.append(dims_mapping[0]) + if len(dims_mapping) >= 1: + batch_dim_mappings.append(dims_mapping[0]) else: batch_dim_mappings.append(dims_mapping[1]) + if not batch_dim_mappings: + return changed + compatible_dim_mapping = compute_compatible_dim_mapping( batch_dim_mappings) assert compatible_dim_mapping is not None, "There is no compatible dim mapping." @@ -174,7 +194,8 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): if serial_tensor.is_parameter: continue dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) - if compatible_dim_mapping != dims_mapping[0]: + if len(dims_mapping + ) >= 1 and compatible_dim_mapping != dims_mapping[0]: dims_mapping[0] = compatible_dim_mapping changed = True for arg_name in op_desc.output_arg_names(): @@ -183,11 +204,13 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): continue dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) if arg_name not in xshape_arg_names: - if compatible_dim_mapping != dims_mapping[0]: + if len(dims_mapping + ) >= 1 and compatible_dim_mapping != dims_mapping[0]: dims_mapping[0] = compatible_dim_mapping changed = True else: - if compatible_dim_mapping != dims_mapping[1]: + if len(dims_mapping + ) >= 2 and compatible_dim_mapping != dims_mapping[1]: dims_mapping[1] = compatible_dim_mapping changed = True diff --git a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py index 058ae1d0a9fd5c25ec83ea15ed9c2e479322957c..c92142cf7384d2b0c76c1a5cb3b4e6ac257303a2 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py @@ -1432,7 +1432,6 @@ class DistributedMatmulV2Impl2(DistributedOperatorImpl): if is_valid_list_index(y_dims_mapping, -2) and is_dim_shard(y_dims_mapping[-2]): return False - return True def is_output_compatible(self, dist_op): diff --git a/python/paddle/distributed/auto_parallel/tuner/__init__.py b/python/paddle/distributed/auto_parallel/tuner/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..513558501a0eb218b772a8c02142d3c320675710 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/tuner/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/paddle/distributed/auto_parallel/tuner/recorder.py b/python/paddle/distributed/auto_parallel/tuner/recorder.py new file mode 100644 index 0000000000000000000000000000000000000000..140336566a146776f805f7b546fe6bb39c267861 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/tuner/recorder.py @@ -0,0 +1,214 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + + +class MetricRecord(object): + """ + One record for a single metric at a given execution step. + """ + + def __init__(self, value, step): + self._value = value + self._step = step + + @property + def value(self): + return self._value + + @value.setter + def value(self, value): + self._value = value + + @property + def step(self): + return self._step + + @step.setter + def step(self, step): + self._step = step + + def mean(self): + return np.mean(self.value) + + def get_state(self): + return {"value": self.value, "step": self.step} + + @classmethod + def from_state(cls, state): + return cls(**state) + + def __eq__(self, other): + if not isinstance(other, MetricRecord): + return False + return other.value == self.value and other.step == self.step + + def __repr__(self): + return "MetricRecord(value={}, step={})".format(self.value, self.step) + + +class MetricRecords(object): + """ + Records of a single metric across different executions. 
+ """ + + def __init__(self, direction="min"): + if direction not in {"min", "max"}: + raise ValueError( + "direction should be one of {min, max}, but got: {}.".format( + direction)) + self._direction = direction + self._records = {} + + @property + def records(self): + return sorted(self._records.values(), key=lambda r: r.step) + + @records.setter + def records(self, records): + for r in records: + self.update(r.value, step=r.step) + + @property + def direction(self): + return self._direction + + @direction.setter + def direction(self, direction): + self._direction = direction + + def update(self, value, step=0): + if step in self._records: + self._records[step].set_value(value) + else: + self._records[step] = MetricRecord(value, step=step) + + def get_best_value(self): + values = list(r.mean() for r in self._records.values()) + if not values: + return None + if self._direction == "min": + return np.nanmin(values) + return np.nanmax(values) + + def get_best_step(self): + best_value = self.get_best_value() + if best_value is None: + return None + for r in self._records.values(): + if r.mean() == best_value: + return r.step + + def get_statistics(self): + records = self.records + records_values = [r.mean() for r in records] + if not len(records_values): + return {} + return { + "min": float(np.nanmin(records_values)), + "max": float(np.nanmax(records_values)), + "mean": float(np.nanmean(records_values)), + "median": float(np.nanmedian(records_values)), + "var": float(np.nanvar(records_values)), + "std": float(np.nanstd(records_values)), + } + + def get_state(self): + state = {} + state["direction"] = self._direction + state["records"] = [r.get_state() for r in self.records] + return state + + @classmethod + def from_state(cls, state): + records = cls(state["direction"]) + records.records = [MetricRecord.from_state(r) for r in state["records"]] + print("here 1", records.records) + return records + + +class MetricsRecorder(object): + """ + Record the values for all metrics. 
+ """ + + def __init__(self, metrics=None): + self._records = {} + self.register_metrics(metrics) + + @property + def records(self): + return self._records + + def exists(self, name): + return name in self._records + + def register_metrics(self, metrics=None): + metrics = metrics or [] + for metric in metrics: + self.register(metric.name) + + def register(self, name, direction=None): + if self.exists(name): + raise ValueError("Metric {} have been registered.".format(name)) + if direction is None: + direction = "min" + self._records[name] = MetricRecords(direction) + + def update(self, name, value, step=0): + value = float(value) + if not self.exists(name): + self.register(name) + + prev_best = self._records[name].get_best_value() + self._records[name].update(value, step=step) + new_best = self._records[name].get_best_value() + + improved = new_best != prev_best + return improved + + def get_records(self, name): + return self._records[name].records + + def set_records(self, name, records): + if not self.exists(name): + self.register(name) + self._records[name].records = records + + def get_best_value(self, name): + return self._records[name].get_best_value() + + def get_best_step(self, name): + return self._records[name].get_best_step() + + def get_statistics(self, name): + return self._records[name].get_statistics() + + def get_state(self): + return { + "metrics": { + name: metric_records.get_state() + for name, metric_records in self._records.items() + } + } + + @classmethod + def from_state(cls, state): + recorder = cls() + recorder._records = { + name: MetricRecords.from_state(metric_records) + for name, metric_records in state["metrics"].items() + } + return recorder diff --git a/python/paddle/distributed/auto_parallel/tuner/storable.py b/python/paddle/distributed/auto_parallel/tuner/storable.py new file mode 100644 index 0000000000000000000000000000000000000000..d61e53a02724088c89f2e8cfafc91ca0047aa967 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/tuner/storable.py @@ -0,0 +1,36 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json + + +class Storable(object): + def get_state(self): + raise NotImplementedError + + def set_state(self, state): + raise NotImplementedError + + def save(self, path): + state = self.get_state() + state_json = json.dumps(state) + with open(path, "w") as f: + f.write(state_json) + return str(path) + + def load(self, path): + with open(path, "r") as f: + state_data = f.read() + state = json.loads(state_data) + self.set_state(state) diff --git a/python/paddle/distributed/auto_parallel/tuner/trial.py b/python/paddle/distributed/auto_parallel/tuner/trial.py new file mode 100644 index 0000000000000000000000000000000000000000..22a6638c5ca63b953dcac3d62c564acf6087a305 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/tuner/trial.py @@ -0,0 +1,114 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import hashlib +import random +import time +from enum import Enum + +from .storable import Storable +from .recorder import MetricsRecorder +from .tunable_space import TunableSpace + + +class TrialStatus: + RUNNING = "RUNNING" + COMPLETED = "COMPLETED" + STOPPED = "STOPPED" + INVALID = "INVALID" + + +class Trial(Storable): + def __init__(self, tunable_space, trial_id=None, + status=TrialStatus.RUNNING): + self._id = _generate_trial_id() if trial_id is None else trial_id + self._space = tunable_space + self._recorder = MetricsRecorder() + self._score = None + self._best_step = None + self._status = status + + @property + def id(self): + return self._id + + @property + def space(self): + return self._space + + @property + def recorder(self): + return self._recorder + + @property + def score(self): + return self._score + + @score.setter + def score(self, score): + self._score = score + + @property + def best_step(self): + return self._best_step + + @best_step.setter + def best_step(self, best_step): + self._best_step = best_step + + @property + def status(self): + return self._status + + @status.setter + def status(self, status): + self._status = status + + def summary(self): + print("Tunable space:") + if self.space.values: + for tv, value in self.space.values.items(): + print(tv + ":", value) + + if self.score is not None: + print("Score: {}".format(self.score)) + + def get_state(self): + return { + "id": self.id, + "space": self.space.get_state(), + "recorder": self.recorder.get_state(), + "score": self.score, + "best_step": self.best_step, + "status": self.status, + } + + def set_state(self, state): + self._id = state["id"] + self._space = TunableSpace.from_state(state["space"]) + self._recorder = MetricsRecorder.from_state(state["recorder"]) + self._score = state["score"] + self._best_step = state["best_step"] + self._status = state["status"] + + @classmethod + def from_state(cls, state): + trial = cls(tunable_space=None) + trial.set_state(state) + return trial + + +def _generate_trial_id(): + s = str(time.time()) + str(random.randint(1, int(1e7))) + return hashlib.sha256(s.encode("utf-8")).hexdigest()[:32] diff --git a/python/paddle/distributed/auto_parallel/tuner/tunable_space.py b/python/paddle/distributed/auto_parallel/tuner/tunable_space.py new file mode 100644 index 0000000000000000000000000000000000000000..f63364c5b75ef03a81d8b293515f3bc5a55fce78 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/tuner/tunable_space.py @@ -0,0 +1,151 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import contextlib +import copy +import math +import random +import numpy as np + +from .tunable_variable import Boolean +from .tunable_variable import Fixed +from .tunable_variable import Choice +from .tunable_variable import IntRange +from .tunable_variable import FloatRange + + +class TunableSpace(object): + """ + A TunableSpace is constructed by the tunable variables. + """ + + def __init__(self): + # Tunable variables for this tunable variables + self._variables = {} + # Specific values coresponding to each tunable variable + self._values = {} + + @property + def variables(self): + return self._variables + + @property + def values(self): + return self._values + + def get_value(self, name): + if name in self.values: + return self.values[name] + else: + raise KeyError("{} does not exist.".format(name)) + + def set_value(self, name, value): + if name in self.values: + self.values[name] = value + else: + raise KeyError("{} does not exist.".format(name)) + + def _exists(self, name): + if name in self._variables: + return True + return False + + def _retrieve(self, tv): + tv = tv.__class__.from_state(tv.get_state()) + if self._exists(tv.name): + return self.get_value(tv.name) + return self._register(tv) + + def _register(self, tv): + self._variables[tv.name] = tv + if tv.name not in self.values: + self.values[tv.name] = tv.default + return self.values[tv.name] + + def __getitem__(self, name): + return self.get_value(name) + + def __setitem__(self, name, value): + self.set_value(name, value) + + def __contains__(self, name): + try: + self.get_value(name) + return True + except (KeyError, ValueError): + return False + + def fixed(self, name, default): + tv = Fixed(name=name, default=default) + return self._retrieve(tv) + + def boolean(self, name, default=False): + tv = Boolean(name=name, default=default) + return self._retrieve(tv) + + def choice(self, name, values, default=None): + tv = Choice(name=name, values=values, default=default) + return self._retrieve(tv) + + def int_range(self, name, start, stop, step=1, default=None): + tv = IntRange( + name=name, start=start, stop=stop, step=step, default=default) + return self._retrieve(tv) + + def float_range(self, name, start, stop, step=None, default=None): + tv = FloatRange( + name=name, start=start, stop=stop, step=step, default=default) + return self._retrieve(tv) + + def get_state(self): + return { + "variables": [{ + "class_name": v.__class__.__name__, + "state": v.get_state() + } for v in self._variables.values()], + "values": dict((k, v) for (k, v) in self.values.items()) + } + + @classmethod + def from_state(cls, state): + ts = cls() + for v in state["variables"]: + v = _deserialize_tunable_variable(v) + ts._variables[v.name] = v + ts._values = dict((k, v) for (k, v) in state["values"].items()) + return ts + + +def _deserialize_tunable_variable(state): + classes = (Boolean, Fixed, Choice, IntRange, FloatRange) + cls_name_to_cls = {cls.__name__: cls for cls in classes} + + if isinstance(state, classes): + return state + + if (not isinstance(state, dict) or "class_name" not in state or + "state" not in 
state): + raise ValueError( + "Expect state to be a python dict containing class_name and state as keys, but found {}" + .format(state)) + + cls_name = state["class_name"] + cls = cls_name_to_cls[cls_name] + if cls is None: + raise ValueError("Unknown class name {}".format(cls_name)) + + cls_state = state["state"] + deserialized_object = cls.from_state(cls_state) + return deserialized_object diff --git a/python/paddle/distributed/auto_parallel/tuner/tunable_variable.py b/python/paddle/distributed/auto_parallel/tuner/tunable_variable.py new file mode 100644 index 0000000000000000000000000000000000000000..9549b44c48ecb0b04ac22fafa6dcf5b6ff9aa0ae --- /dev/null +++ b/python/paddle/distributed/auto_parallel/tuner/tunable_variable.py @@ -0,0 +1,242 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + + +class TunableVariable(object): + """ + Tunablevariable base class. + """ + + def __init__(self, name, default=None): + self.name = name + self._default = default + + @property + def default(self): + return self._default + + def get_state(self): + return {"name": self.name, "default": self.default} + + @classmethod + def from_state(cls, state): + return cls(**state) + + +class Fixed(TunableVariable): + """ + Fixed variable which cannot be changed. + """ + + def __init__(self, name, default): + super(Fixed, self).__init__(name=name, default=default) + self.name = name + if not isinstance(default, (str, int, float, bool)): + raise ValueError( + "Fixed must be an str, int, float or bool, but found {}" + .format(default)) + self._default = default + + def random(self, seed=None): + return self._default + + def __repr__(self): + return "Fixed(name: {}, value: {})".format(self.name, self.default) + + +class Boolean(TunableVariable): + """ + Choice between True and False. + """ + + def __init__(self, name, default=False): + super(Boolean, self).__init__(name=name, default=default) + if default not in {True, False}: + raise ValueError( + "default must be a Python boolean, but got {}".format(default)) + + def random(self, seed=None): + rng = np.random.default_rng(seed) + return rng.choice((True, False)) + + def __repr__(self): + return 'Boolean(name: "{}", default: {})'.format(self.name, + self.default) + + +class Choice(TunableVariable): + def __init__(self, name, values, default=None): + super(Choice, self).__init__(name=name, default=default) + + types = set(type(v) for v in values) + if len(types) > 1: + raise TypeError( + "Choice can contain only one type of value, but found values: {} with types: {}." 
+ .format(str(values), str(types))) + + if isinstance(values[0], str): + values = [str(v) for v in values] + if default is not None: + default = str(default) + elif isinstance(values[0], int): + values = [int(v) for v in values] + if default is not None: + default = int(default) + elif isinstance(values[0], float): + values = [float(v) for v in values] + if default is not None: + default = float(default) + elif isinstance(values[0], bool): + values = [bool(v) for v in values] + if default is not None: + default = bool(default) + else: + raise TypeError( + "Choice can only contain str, int, float, or boll, but found: {} " + .format(str(values))) + self.values = values + + if default is not None and default not in values: + raise ValueError( + "The default value should be one of the choices {}, but found {}". + format(values, default)) + self._default = default + + @property + def default(self): + if self._default is None: + if None in self.values: + return None + return self.values[0] + return self._default + + def random(self, seed=None): + rng = np.random.default_rng(seed) + return rng.choice(self.values) + + def get_state(self): + state = super(Choice, self).get_state() + state["values"] = self.values + return state + + def __repr__(self): + return 'Choice(name: "{}", values: {}, default: {})'.format( + self.name, self.values, self.default) + + +class IntRange(TunableVariable): + """ + Integer range. + """ + + def __init__(self, name, start, stop, step=1, default=None, endpoint=False): + super(IntRange, self).__init__(name=name, default=default) + self.start = self._check_int(start) + self.stop = self._check_int(stop) + self.step = self._check_int(step) + self._default = default + self.endpoint = endpoint + + @property + def default(self): + if self._default is not None: + return self._default + return self.start + + def random(self, seed=None): + rng = np.random.default_rng(seed) + value = (self.stop - self.start) * rng.random() + self.start + if self.step is not None: + if self.endpoint: + values = np.arange(self.start, self.stop + 1e-7, step=self.step) + else: + values = np.arange(self.start, self.stop, step=self.step) + closest_index = np.abs(values - value).argmin() + value = values[closest_index] + return int(value) + + def get_state(self): + state = super(IntRange, self).get_state() + state["start"] = self.start + state["stop"] = self.stop + state["step"] = self.step + state["default"] = self._default + return state + + def _check_int(self, val): + int_val = int(val) + if int_val != val: + raise ValueError("Expects val is an int, but found: {}.".format( + str(val))) + return int_val + + def __repr__(self): + return "IntRange(name: {}, start: {}, stop: {}, step: {}, default: {})".format( + self.name, self.start, self.stop, self.step, self.default) + + +class FloatRange(TunableVariable): + """ + Float range. 
+ """ + + def __init__(self, + name, + start, + stop, + step=None, + default=None, + endpoint=False): + super(FloatRange, self).__init__(name=name, default=default) + self.stop = float(stop) + self.start = float(start) + if step is not None: + self.step = float(step) + else: + self.step = None + self._default = default + self.endpoint = endpoint + + @property + def default(self): + if self._default is not None: + return self._default + return self.start + + def random(self, seed=None): + rng = np.random.default_rng(seed) + value = (self.stop - self.start) * rng.random() + self.start + if self.step is not None: + if self.endpoint: + values = np.arange(self.start, self.stop + 1e-7, step=self.step) + else: + values = np.arange(self.start, self.stop, step=self.step) + closest_index = np.abs(values - value).argmin() + value = values[closest_index] + return value + + def get_state(self): + state = super(FloatRange, self).get_state() + state["start"] = self.start + state["stop"] = self.stop + state["step"] = self.step + state["endpoint"] = self.endpoint + return state + + def __repr__(self): + return "FloatRange(name: {}, start: {}, stop: {}, step: {}, default: {}, endpoint: {})".format( + self.name, self.start, self.stop, self.step, self.default, + self.endpoint) diff --git a/python/paddle/distributed/auto_parallel/utils.py b/python/paddle/distributed/auto_parallel/utils.py index 241eadcbace22cf36504e2c0ed36566fa94b9e4b..86c274cb45cc323dab60968571837e82619e6987 100644 --- a/python/paddle/distributed/auto_parallel/utils.py +++ b/python/paddle/distributed/auto_parallel/utils.py @@ -1271,7 +1271,6 @@ def get_all_distributed_main_program(serial_program_info, dist_context, used_dist_context._dist_op_context = DistributedOperatorContext() _, _, dist_startup_program, dist_main_program, _ = copied_parallelizer._get_dist_program( rank_id, used_dist_context) - # print("dist_main_program: ", dist_main_program) all_dist_main_program.append(dist_main_program) return all_dist_main_program diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py index b42f21989abd77679993a1c8b52681351e4dfb40..1a3a8a4883d8beb84181609740d2b836f548bc2c 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py @@ -900,11 +900,12 @@ def save_persistables(exe, dirname, main_program, filename=None): def is_opt_vars(var): # NOTE(JZ-LIANG): The checks should be updated when add new compatible optimizer - # now only Momentum and adam are compatible with sharding - # support EMA optimizer + # now only Momentum and adam are compatible with sharding, + # support EMA optimizer with '_ema_0', + # support offload with '@offload_0' and '.cast_fp16' checks = [ "_moment1_0", "_moment2_0", "_beta1_pow_acc_0", "_beta2_pow_acc_0", - "_velocity_0", "_ema_0" + "_velocity_0", "_ema_0", "@offload_0", ".cast_fp16" ] for check in checks: if var.name.endswith(check) and var.persistable: diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py index 9886ca4e2deace4c625ead51852841e7c761be21..f96273cc84caf46f4f02c62e648ce70445b52d28 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py @@ -912,7 +912,6 @@ def _device2cpu(trans_param, convert_dtype=False): def 
_cpu2device(param): tmp_p = param.fw_storage.cuda(DEV_ID) - param.fw_storage._clear() if tmp_p.dtype == Type.fp32.value and param2dtype[ param.name] == Type.fp16.value: tmp_p = paddle.cast(tmp_p, Type.fp16.value) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py index 89b59254e5b9105a55c68f3ef871396de1bd9199..6a30276e02ba238a0f4ee838164a5bf9976f7d84 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py @@ -89,7 +89,7 @@ class ShardingClipGrad: global_norm_fp16 = paddle.cast( global_norm_fp16, dtype=paddle.float32) - # global norm of non-distributed FP16 params_and_grads for slice parameter + # global norm of non-distributed FP16 params_and_grads for unslice parameter if len(unslice_params_fp16) == 0: global_unslice_fp16 = paddle.to_tensor([0.], dtype=paddle.float32) else: @@ -104,21 +104,20 @@ class ShardingClipGrad: [0.], dtype=paddle.float32) global_norm_fp32 = layers.reduce_sum(global_norm_fp32) - # global norm of non-distributed FP32 params_and_grads for slice parameter + # global norm of non-distributed FP32 params_and_grads for unslice parameter global_unslice_fp32 = layers.concat(unslice_params_fp32) if len( unslice_params_fp32) != 0 else paddle.to_tensor( [0.], dtype=paddle.float32) global_unslice_fp32 = layers.reduce_sum(global_unslice_fp32) global_unslice_var = global_unslice_fp16 + global_unslice_fp32 - global_norm_var = global_norm_fp16 + global_norm_fp32 + global_norm_var = global_norm_fp16 + global_norm_fp32 + 1.0 / self._group.nranks * global_unslice_var # add all reduce to get global norm of distributed params_and_grads dev_id = int(self._device.split(":")[1]) with device_guard(dev_id, "gpu"): paddle.distributed.all_reduce(global_norm_var, group=self._group) - global_norm_var += global_unslice_var global_norm_var = layers.sqrt(global_norm_var) max_global_norm = layers.fill_constant( shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm) diff --git a/python/paddle/distributed/models/__init__.py b/python/paddle/distributed/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e1663029ef1f844676ce9484f724dc253d625386 --- /dev/null +++ b/python/paddle/distributed/models/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/paddle/distributed/models/moe/__init__.py b/python/paddle/distributed/models/moe/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e1663029ef1f844676ce9484f724dc253d625386 --- /dev/null +++ b/python/paddle/distributed/models/moe/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/paddle/distributed/models/moe/utils.py b/python/paddle/distributed/models/moe/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..fd98c64318c60e2e67af320c51b24e39a3132c43 --- /dev/null +++ b/python/paddle/distributed/models/moe/utils.py @@ -0,0 +1,55 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.fluid import core +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.framework import in_dygraph_mode + + +def _number_count(gate_idx, upper_range): + """ + calculate the expert count according to the gate index. + Args: + gate_idx (Tensor): Tensor. The input gate index whose data type should be int32 or int64. + upper_range (int): The number of the experts. + Returns: + out (Tensor): The output expert count. + Examples: + .. code-block:: python + # required: distributed + import paddle + + gate_idx = [ + [0, 2], + [0, 2] + ] + upper_range = 6 + gate_idx = paddle.to_tensor(gate_idx, dtype="int32") + number_count = paddle.distributed.utils.number_count(gate_idx, upper_range) + print(number_count) # the result: [2, 0, 2, 0, 0, 0] + """ + if in_dygraph_mode(): + return core.ops.number_count(gate_idx, 'upper_range', upper_range) + else: + op_type = 'number_count' + + helper = LayerHelper(op_type, **locals()) + out = helper.create_variable_for_type_inference(dtype=gate_idx.dtype) + + helper.append_op( + type=op_type, + inputs={'gate_idx': gate_idx}, + outputs={'Out': out}, + attrs={'upper_range': upper_range}) + return out diff --git a/python/paddle/distributed/run/__init__.py b/python/paddle/distributed/run/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f25ddb794cc4d573429ac960e646bd8125c48d16 --- /dev/null +++ b/python/paddle/distributed/run/__init__.py @@ -0,0 +1,86 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .job.container import Container +from .job.pod import Pod +from .job.job import Job +from . 
import plugins + +#__all__ = [Container, Pod, Job] +''' +Paddle distribution training entry ``python -m paddle.distributed.run``. + +Help + +# for arg usage and explanation, try the following command +# python -m paddle.distributed.run -h + +Collective Mode + +Case 1: 1 node + +use all visible devices +# python -m paddle.distributed.run train.py + +use specified devices +# python -m paddle.distributed.run --devices=0,1,2,3 train.py + +Case 2: multi-node, auto detect ip/port + +# python -m paddle.distributed.run --np 2 train.py +# auto print following command +# python -m paddle.distributed.run --master 10.0.0.1:13538 --np 2 demo.py +# then copy and paste above command to other nodes + +Case 3: multi-node, specified master/rendezvous server + +# python -m paddle.distributed.run --np 2 --master 10.0.0.1:2379 train.py +# the master ip must be one of the node and the port must available + +Parameter Server Mode + +Case 1.1: 1 node, 1 ps, 1 trainer + +# python -m paddle.distributed.run --mode ps train.py +# python -m paddle.distributed.run --server_num=1 --trainer_num=1 train.py + +Case 1.2: 1 node, 2 ps, 2 trainer + +# python -m paddle.distributed.run --server_num=2 --trainer_num=2 train.py + +Case 2: 2 node, 2 ps, 2 trainer per node + +# python -m paddle.distributed.run --server_num=2 --trainer_num=2 --np 2 train.py +# auto print following command +# python -m paddle.distributed.run --master 10.0.0.1:13538 --server_num=2 --trainer_num=2 --np 2 train.py +# then copy and paste above command to other nodes + +Case 3: multi-node, specified master/rendezvous server + +# python -m paddle.distributed.run --master 10.0.0.1:13538 --server_num=2 --trainer_num=2 --np 2 train.py +# the master ip must be one of the node and the port must available + +Case 4: specified servers and trainers in each node + +python -m paddle.distributed.run --servers 127.0.0.1:8900,127.0.0.1:8901 --trainers 127.0.0.1:8902,127.0.0.1:8903 train.py + + +Elastic Mode + +# run following command in 3 node to run immediately, or in 2 node to run after elastic_timeout +# python -m paddle.distributed.run --master etcd://10.0.0.1:2379 --np 2:3 train.py + +# once the peer number changes between 2:3, the strategy holds + +''' diff --git a/python/paddle/distributed/run/__main__.py b/python/paddle/distributed/run/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..e32df59a328081e33aa86b42ed9b8e489ac399e8 --- /dev/null +++ b/python/paddle/distributed/run/__main__.py @@ -0,0 +1,28 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .context import Context +from . 
import controllers + +# initialize the context to run +ctx = Context() + +# initialize the selected controller +c = controllers.init(ctx) + +# run the pods +c.run() + +# manager or just wait pod +c.finalize() diff --git a/python/paddle/distributed/run/context/__init__.py b/python/paddle/distributed/run/context/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..86dff0f1f8056e784268a6ef3a3ebabb44aa9c6d --- /dev/null +++ b/python/paddle/distributed/run/context/__init__.py @@ -0,0 +1,219 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from argparse import ArgumentParser, REMAINDER +import os, copy + +from paddle.distributed.run import plugins + +from .node import Node +from .status import Status + +import logging + + +class Context(object): + def __init__(self, enable_plugin=True): + os.environ.pop('http_proxy', None) + os.environ.pop('https_proxy', None) + + self.args = self.parse_args() + self.envs = self.fetch_envs() + self.logger = self.get_logger() + + self.node = Node() + self.status = Status() + + self.set_env_in_args() + + # design for event queue, later + self.events = [] + + if enable_plugin: + self._enable_plugin() + + def get_envs(self): + return self.envs.copy() + + def _enable_plugin(self): + for pl in plugins.enabled_plugins: + pl(self) + + def parse_args(self): + parser = ArgumentParser() + + base_group = parser.add_argument_group("Base Parameters") + + base_group.add_argument( + "--master", + type=str, + default=None, + help="the master/rendezvous server, ip:port") + + base_group.add_argument( + "--rank", type=int, default=-1, help="the peer rank") + + base_group.add_argument( + "--log", type=str, default="INFO", help="log level. Default INFO") + + base_group.add_argument( + "--np", + type=str, + default="1", + help="the number of peers, i.e. pod/node number") + + base_group.add_argument( + "--nproc_per_node", + type=int, + default=None, + help="the number of processes in a pod") + + base_group.add_argument( + "--log_dir", + type=str, + default="log", + help="the path for each process's log. Default ./log") + base_group.add_argument( + "--mode", + type=str, + default="collective", + help="run mode of the job, collective/ps/ps-heter") + + base_group.add_argument( + "--id", + type=str, + default="default", + help="unique id of the job. Default default") + + base_group.add_argument( + "--devices", + type=str, + default=None, + help="accelerate devices. 
as --gpus,npus,xps") + + base_group.add_argument( + "--host", type=str, default=None, help="host ip") + + base_group.add_argument( + "training_script", + type=str, + help="the full path of py script," + "followed by arguments for the " + "training script") + + base_group.add_argument('training_script_args', nargs=REMAINDER) + + ps_group = parser.add_argument_group("Parameter-Server Parameters") + # for parameter server + ps_group.add_argument( + "--servers", + type=str, + default='', + help="servers endpoints full list") + ps_group.add_argument( + "--trainers", + type=str, + default='', + help="trainers endpoints full list") + + ps_group.add_argument( + "--trainer_num", type=int, default=None, help="number of trainers") + ps_group.add_argument( + "--server_num", type=int, default=None, help="number of servers") + ps_group.add_argument( + "--gloo_port", type=int, default=6767, help="gloo http port") + ps_group.add_argument( + "--with_gloo", type=str, default="0", help="use gloo or not") + + # parameter elastic mode + elastic_group = parser.add_argument_group("Elastic Parameters") + elastic_group.add_argument( + "--max_restart", + type=int, + default=3, + help="the times can restart. Default 3") + + elastic_group.add_argument( + "--elastic_level", + type=int, + default=-1, + help="elastic level: -1 disable, 0 failed exit, peers hold, 1 internal restart" + ) + + elastic_group.add_argument( + "--elastic_timeout", + type=int, + default=30, + help="seconds to wait before elastic perform training") + return parser.parse_args() + + def _valide_env(self, key): + if key in ['POD_IP']: + return True + if key.endswith('_VISIBLE_DEVICES'): + return True + if key.startswith('PADDLE_'): + return True + + return False + + def fetch_envs(self): + ge = os.environ.copy() + + black_env_list = ['http_proxy', 'https_proxy'] + for key in black_env_list: + ge.pop(key, None) + + return ge + ''' + # use black list instead white list + return {k: ge[k] for k in ge if self._valide_env(k)} + ''' + + def get_logger(self, level=logging.INFO): + logger = logging.getLogger("PADDLERUN") + logger.setLevel(self.args.log.upper() or level) + formatter = logging.Formatter( + fmt='%(name)s %(levelname)s %(asctime)s %(message)s') + ch = logging.StreamHandler() + ch.setFormatter(formatter) + logger.addHandler(ch) + return logger + + def set_env_in_args(self): + env_args = { + 'POD_IP': 'host', + 'PADDLE_MASTER': 'master', + 'PADDLE_DEVICES': 'devices', + 'PADDLE_NP': 'np', + 'PADDLE_MODE': 'mode', + 'PADDLE_LOG': 'log', + 'PADDLE_NPROC_PER_NODE': 'nproc_per_node', + 'PADDLE_JOB_ID': 'id', + 'PADDLE_RANK': 'rank', + 'PADDLE_LOG_DIR': 'log_dir', + 'PADDLE_MAX_RESTlRT': 'max_restart', + 'PADDLE_ELASTIC_LEVEL': 'elastic_level', + 'PADDLE_ELASTIC_TIMEOUT': 'elastic_timeout', + 'PADDLE_SERVER_NUM': 'server_num', + 'PADDLE_TRAINER_NUM': 'trainer_num', + 'PADDLE_SERVERS_ENDPOINTS': 'servers', + 'PADDLE_TRAINERS_ENDPOINTS': 'trainers', + 'PADDLE_GLOO_PORT': 'gloo_port', + 'PADDLE_WITH_GLOO': 'with_gloo', + } + + for k, v in env_args.items(): + if k in self.envs: + setattr(self.args, v, self.envs[k]) diff --git a/python/paddle/distributed/run/context/device.py b/python/paddle/distributed/run/context/device.py new file mode 100644 index 0000000000000000000000000000000000000000..d8bbd851ccf83a1ebfac60758576384bbe1aa4f4 --- /dev/null +++ b/python/paddle/distributed/run/context/device.py @@ -0,0 +1,88 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + + +class DeviceType: + CPU = 'cpu' + GPU = 'gpu' + XPU = 'xpu' + NPU = 'npu' + + +class Device(object): + def __init__(self, dtype=None, count=1, memory="", labels=""): + self.dtype = dtype + self.count = count + self.memory = memory + self.labels = labels + + def __str__(self): + return ",".join(self.labels) + + @classmethod + def parse_device(self): + dev = Device() + visible_devices = None + if 'CUDA_VISIBLE_DEVICES' in os.environ or 'NVIDIA_VISIBLE_DEVICES' in os.environ: + dev.dtype = DeviceType.GPU + visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") or os.getenv( + "NVIDIA_VISIBLE_DEVICES") + elif 'XPU_VISIBLE_DEVICES' in os.environ: + dev.dtype = DeviceType.XPU + visible_devices = os.getenv("XPU_VISIBLE_DEVICES") + elif 'ASCEND_VISIBLE_DEVICES' in os.environ: + dev.dtype = DeviceType.NPU + visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES") + + if visible_devices and visible_devices != 'all': + dev.labels = visible_devices.split(',') + dev.count = len(dev.labels) + else: + return self.detect_device() + + return dev + + @classmethod + def detect_device(self): + import paddle.fluid as fluid + + dev = Device() + num = 0 + visible_devices = None + if fluid.core.is_compiled_with_cuda(): + dev.dtype = DeviceType.GPU + num = fluid.core.get_cuda_device_count() + visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") or os.getenv( + "NVIDIA_VISIBLE_DEVICES") + elif fluid.core.is_compiled_with_xpu(): + dev.dtype = DeviceType.XPU + num = fluid.core.get_xpu_device_count() + visible_devices = os.getenv("XPU_VISIBLE_DEVICES") + elif fluid.core.is_compiled_with_npu(): + dev.dtype = DeviceType.NPU + num = fluid.core.get_npu_device_count() + visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES") + + if num == 0: + dev.dtype = DeviceType.CPU + elif visible_devices is None or visible_devices == "all" or visible_devices == "": + dev.labels = [str(x) for x in range(0, num)] + dev.count = num + else: + dev.labels = visible_devices.split(',') + dev.count = len(dev.labels) + + return dev diff --git a/python/paddle/distributed/run/context/event.py b/python/paddle/distributed/run/context/event.py new file mode 100644 index 0000000000000000000000000000000000000000..23e8e7a5014002b480b717623dec2d5ee62eb743 --- /dev/null +++ b/python/paddle/distributed/run/context/event.py @@ -0,0 +1,20 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
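The device selection above follows a fixed precedence over the *_VISIBLE_DEVICES variables before falling back to runtime detection. The snippet below is a simplified, standalone sketch of that precedence; the helper name pick_device_labels is illustrative and not part of this patch, and the real Device.parse_device additionally defers to Device.detect_device when a variable is unset or set to 'all'.

# Editor's sketch only: mirrors the *_VISIBLE_DEVICES precedence applied by
# Device.parse_device; pick_device_labels is a hypothetical helper.
import os


def pick_device_labels(env=None):
    env = os.environ if env is None else env
    candidates = [
        ("gpu", env.get("CUDA_VISIBLE_DEVICES") or env.get("NVIDIA_VISIBLE_DEVICES")),
        ("xpu", env.get("XPU_VISIBLE_DEVICES")),
        ("npu", env.get("ASCEND_VISIBLE_DEVICES")),
    ]
    for dtype, visible in candidates:
        if visible and visible != "all":
            labels = visible.split(",")
            return dtype, labels, len(labels)
    return None  # caller would fall back to Device.detect_device()


# e.g. pick_device_labels({"CUDA_VISIBLE_DEVICES": "0,1"}) == ("gpu", ["0", "1"], 2)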
+ + +class Event(object): + def __init__(self, kind="status", message="", fatal=False): + self.kind = kind + self.message = message + self.fatal = fatal diff --git a/python/paddle/distributed/run/context/node.py b/python/paddle/distributed/run/context/node.py new file mode 100644 index 0000000000000000000000000000000000000000..1ece4db0fbbeed379c2cda343022dd371a9e7540 --- /dev/null +++ b/python/paddle/distributed/run/context/node.py @@ -0,0 +1,64 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .device import Device + +import socket +import struct +from contextlib import closing + + +class Node(object): + def __init__(self): + # self.device = Device.detect_device() + self.device = Device.parse_device() + self.ip = self.get_host_ip() + self.free_ports = [] + + def get_host_ip(self): + try: + self.hostname = socket.gethostname() + self.ip = socket.gethostbyname(socket.getfqdn(self.hostname)) + return self.ip + except: + return '127.0.0.1' + + def get_free_ports(self, n=1): + free_ports = [self.get_free_port() for i in range(n)] + self.free_ports += free_ports + return free_ports + + def get_ports_occupied(self): + return self.free_ports + + @classmethod + def get_free_port(self): + with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: + s.setsockopt(socket.SOL_SOCKET, socket.SO_LINGER, + struct.pack('ii', 1, 0)) + s.bind(('', 0)) + return s.getsockname()[1] + + @classmethod + def is_server_ready(self, ip, port): + with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: + #sock.settimeout(0.01) + #sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + if hasattr(socket, 'SO_REUSEPORT'): + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1) + result = sock.connect_ex((ip, int(port))) + if result == 0: + return True + else: + return False diff --git a/python/paddle/distributed/run/context/resource.py b/python/paddle/distributed/run/context/resource.py new file mode 100644 index 0000000000000000000000000000000000000000..faffed704c1f078f9fed131ef1ade98add60b5d9 --- /dev/null +++ b/python/paddle/distributed/run/context/resource.py @@ -0,0 +1,18 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
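Node.get_free_port above relies on the standard bind-to-port-0 trick: the kernel assigns an unused port, the socket is closed, and the port number is handed to a worker process. A minimal sketch of just that trick follows; the patch's version additionally sets SO_LINGER so the socket is torn down immediately instead of lingering in TIME_WAIT.

# Minimal sketch of the bind-to-port-0 idiom behind Node.get_free_port.
import socket
from contextlib import closing


def free_port():
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        s.bind(("", 0))  # port 0 asks the kernel for any unused port
        return s.getsockname()[1]


print(free_port())  # prints some currently free port, e.g. 53127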
+ + +class Resource(object): + def __init__(self): + self.devices = [] diff --git a/python/paddle/distributed/run/context/status.py b/python/paddle/distributed/run/context/status.py new file mode 100644 index 0000000000000000000000000000000000000000..cfbf3623ec22ed56b5ce136d8a6813291be69e8f --- /dev/null +++ b/python/paddle/distributed/run/context/status.py @@ -0,0 +1,58 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class Status(object): + UNINIT = "uninit" + READY = "ready" + RUNNING = "running" + FAILED = "failed" + TERMINATING = "terminating" + RESTARTING = "restarting" + UNKNOWN = "unknown" + COMPLETED = "completed" + DONE = "done" # should exit whatever status + + def __init__(self): + self._current_status = None + + def current(self): + return self._current_status + + def is_running(self): + return self._current_status == self.RUNNING + + def is_restarting(self): + return self._current_status == self.RESTARTING + + def is_done(self): + if self._current_status in [self.DONE, self.COMPLETED, self.FAILED]: + return True + else: + return False + + def run(self): + self._current_status = self.RUNNING + + def fail(self): + self._current_status = self.FAILED + + def complete(self): + self._current_status = self.COMPLETED + + def restart(self): + self._current_status = self.RESTARTING + + def done(self): + self._current_status = self.DONE diff --git a/python/paddle/distributed/run/controllers/__init__.py b/python/paddle/distributed/run/controllers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e5557151ad5489cb4af0c34b3ad47c31774b3326 --- /dev/null +++ b/python/paddle/distributed/run/controllers/__init__.py @@ -0,0 +1,32 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
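The Status object above is a small state machine shared by the launcher components. A usage sketch, assuming the package layout added by this patch is importable:

from paddle.distributed.run.context.status import Status

st = Status()
assert st.current() is None and not st.is_done()

st.run()
assert st.is_running()

st.restart()
assert st.is_restarting()

st.complete()
assert st.is_done()  # DONE, COMPLETED and FAILED all count as done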
+ +__all__ = ["init"] + +from .collective import CollectiveController +from .collective import CollectiveElasticController +from .ps import PSController + +# the order is extremely important +_controllers = [ + CollectiveElasticController, + PSController, + CollectiveController, +] + + +def init(ctx): + for c in _controllers: + if c.enable(ctx): + return c(ctx) diff --git a/python/paddle/distributed/run/controllers/collective.py b/python/paddle/distributed/run/controllers/collective.py new file mode 100644 index 0000000000000000000000000000000000000000..c4feb54428a07265693c0969e6e385a380e22f3d --- /dev/null +++ b/python/paddle/distributed/run/controllers/collective.py @@ -0,0 +1,185 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .controller import Controller + +import json +import os +import six +import time + + +class CollectiveController(Controller): + @classmethod + def enable(cls, ctx): + if ctx: + ctx.logger.debug("{} enabled".format(cls.__name__)) + return True + else: + return False + + def build_pod(self): + self.pod.replicas = self.pod_replicas() + + # rank will be reset when restart + self.pod.rank = self.ctx.args.rank + + port = self.ctx.node.get_free_port() + + # compatible + endpoints = [ + "{}:{}".format(self.ctx.node.ip, p) + for p in self.ctx.node.get_free_ports(self.pod.replicas) + ] + + data = json.dumps({ + 'name': self.pod.name, + 'rank': self.pod.rank, + 'replicas': self.pod.replicas, + 'dtype': self.ctx.node.device.dtype, + 'candidate': '{}:{}'.format(self.ctx.node.ip, port), + 'endpoints': ",".join(endpoints), + }) + + peer_list, rank = self.master.sync_peers( + '/{}/info'.format(self.job.id), self.pod.name, data, + self.job.replicas, self.pod.rank) + self.pod.rank = rank + + if len(peer_list) < 1: + return False + + peer_list = [json.loads(i) for i in peer_list] + + self.ctx.logger.debug("sync peers done {}".format(peer_list)) + self.save_pod_log(peer_list) + + global_size = sum([i['replicas'] for i in peer_list]) + rank_offset = sum([i['replicas'] for i in peer_list[:rank]]) + ''' + The new designed collective need nothing but a master endpoint + ''' + collective_master = peer_list[0]['candidate'] + + job_endpoints = [i['endpoints'] for i in peer_list] + + self.pod.reset() + for i in range(self.pod.replicas): + e = { + "PADDLE_MASTER": collective_master, + "PADDLE_GLOBAL_SIZE": "{}".format(global_size), + "PADDLE_LOCAL_SIZE": "{}".format(self.pod.replicas), + "PADDLE_GLOBAL_RANK": "{}".format(i + rank_offset), + "PADDLE_LOCAL_RANK": "{}".format(i), + ## compatible env + "PADDLE_TRAINER_ENDPOINTS": ",".join(job_endpoints), + "PADDLE_CURRENT_ENDPOINT": endpoints[i], + "PADDLE_TRAINER_ID": "{}".format(i + rank_offset), + "PADDLE_TRAINERS_NUM": "{}".format(global_size), + "PADDLE_RANK_IN_NODE": str(i), + } + self.add_container(envs=e, log_tag=i) + + return True + + +class CollectiveElasticController(CollectiveController): + @classmethod + def enable(cls, ctx): + if ctx.args.master and 
ctx.args.master.startswith("etcd://"):
+            ctx.logger.debug("{} enabled".format(cls.__name__))
+            return True
+        else:
+            return False
+
+    def register(self):
+        if self.job.id == 'default':
+            self.ctx.logger.warning(
+                'Using default job name may cause conflict, add --id in args')
+
+        self.master.register_heartbeat(self.job.id, self.pod.name)
+
+    def watch(self) -> bool:
+        '''
+        watch self and peer status, return true to exit
+        '''
+        while not self.ctx.status.is_done():
+            # self status
+            status = self.pod.watch(timeout=2)
+            self.ctx.logger.debug("Pod status {}, Ctx status {}".format(
+                status, self.ctx.status.current()))
+
+            # completed
+            if status == self.ctx.status.COMPLETED:
+                self.master.set_status(status)
+                self.ctx.status.complete()
+                self.ctx.logger.info("Pod complete {}".format(status))
+                return True
+
+            # self failure
+            elif status == self.ctx.status.FAILED:
+                self.master.set_status(status)
+                self.master.restart_peer()
+                self.ctx.logger.info("Pod failed {}".format(status))
+                self.pod.stop()
+
+                if self.ctx.args.elastic_level <= 0:
+                    return True
+                else:
+                    return False
+
+            # peer failure
+            if self.ctx.status.is_restarting() and self.master.get_status(
+            ) != self.ctx.status.COMPLETED:
+                self.pod.stop()
+                return False
+
+            #peers = self.master.fetch_peer_alive()
+            #print("peers {}".format(peers))
+
+    def run(self):
+
+        timeout = self.ctx.args.elastic_timeout if self.job.elastic else self.ctx.args.elastic_timeout * 10
+        self.register()
+
+        while self.pod.restart <= self.ctx.args.max_restart:
+
+            self.build_job()
+
+            ok, replicas = self.master.wait_peer_ready(
+                self.job.replicas_min, self.job.replicas_max, timeout)
+            if ok:
+                self.job.replicas = replicas
+            else:
+                self.ctx.logger.warning("peer not ready {}".format(self.job))
+                break
+
+            self.ctx.logger.debug("Run {}".format(self.job))
+
+            if not self.build_pod():
+                continue
+
+            self.master.set_status(self.ctx.status.RUNNING)
+            self.ctx.status.run()
+
+            assert len(self.pod.containers) > 0, "No container in the pod"
+            self.ctx.logger.debug("Run {}".format(self.pod))
+            self.ctx.logger.debug("Run {}".format(self.pod.containers[0]))
+
+            self.pod.deploy()
+
+            if self.watch():
+                break
+
+        self.ctx.logger.debug("Job done {}".format(self.job))
diff --git a/python/paddle/distributed/run/controllers/controller.py b/python/paddle/distributed/run/controllers/controller.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d904cf2a2cca5b9abaab06d1545c03c160e3d93
--- /dev/null
+++ b/python/paddle/distributed/run/controllers/controller.py
@@ -0,0 +1,192 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
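The rank bookkeeping in CollectiveController.build_pod above boils down to a prefix sum over the synced peer list: a process's global rank is its local rank plus the number of replicas owned by lower-ranked pods. A worked example with made-up values:

# Worked example of build_pod's rank arithmetic (values are illustrative).
peer_list = [{"replicas": 2}, {"replicas": 2}]  # two pods, two processes each
pod_rank = 1                                    # this pod came second in sync_peers

global_size = sum(p["replicas"] for p in peer_list)             # 4
rank_offset = sum(p["replicas"] for p in peer_list[:pod_rank])  # 2

for local_rank in range(peer_list[pod_rank]["replicas"]):
    env = {
        "PADDLE_GLOBAL_SIZE": str(global_size),
        "PADDLE_LOCAL_SIZE": str(peer_list[pod_rank]["replicas"]),
        "PADDLE_GLOBAL_RANK": str(local_rank + rank_offset),  # "2", then "3"
        "PADDLE_LOCAL_RANK": str(local_rank),                 # "0", then "1"
    }
    print(env)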
+ +import sys +import os +import signal + +from paddle.distributed.run.job import Job +from paddle.distributed.run.job import Pod +from paddle.distributed.run.job import Container + +from .master import Master + +import time + + +class ControleMode: + COLLECTIVE = "collective" + PS = "ps" + + +class ControllerBase(object): + def __init__(self, ctx): + signal.signal(signal.SIGTERM, self.signal_handler) + signal.signal(signal.SIGABRT, self.signal_handler) + signal.signal(signal.SIGINT, self.signal_handler) + + self.ctx = ctx + self.master = Master.factory(self.ctx) + + self.job = Job(np=self.ctx.args.np, + mode=self.ctx.args.mode, + id=self.ctx.args.id) + self.pod = Pod() + + self.join_server = None + + def run(self): + self.build_job() + self.build_pod() + + if len(self.pod.containers) < 1: + self.ctx.logger.error("No container in the pod {}".format(self.pod)) + return + + self.ctx.logger.info("Run {}".format(self.pod)) + self.ctx.logger.debug(self.pod.containers[0]) + + self.pod.deploy() + + self.watch() + + def watch(self) -> bool: + status = self.pod.watch() + + if status == self.ctx.status.COMPLETED: + self.ctx.logger.info("Pod {}".format(status)) + elif status == self.ctx.status.FAILED: + self.ctx.logger.info("Pod {}".format(status)) + self.ctx.logger.error("Container failed !!!\n{}".format( + self.pod.failed_container())) + self.pod.tail() + self.pod.stop() + + def stop(self, sigint=None): + self.ctx.logger.debug("Controller stop") + self.master.stop() + self.pod.stop(sigint) + + def finalize(self): + self.pod.join() + self.master.stop() + + self.ctx.logger.info("Exit code {}".format(self.pod.exit_code)) + sys.exit(self.pod.exit_code) + + def signal_handler(self, sigint, frame): + self.ctx.logger.info("Terminating with signal {}".format(sigint)) + + if hasattr(self, 'sigint'): + time.sleep(5) + sys.exit(sigint) + + self.sigint = sigint + self.ctx.status.done() + self.stop(sigint) + time.sleep(1) + self.ctx.logger.debug("Exit with signal {}".format(sigint)) + sys.exit(sigint) + + +class Controller(ControllerBase): + ''' + Controller API for customization + ''' + + def build_job(self): + ''' + build job fill the job info. + ''' + self.ctx.logger.info(self.job) + + def build_pod(self) -> bool: + ''' + build pod includes creating containers etc. 
+ + Return True if succeed + ''' + raise NotImplementedError + + def _get_entrypoint(self): + entrypoint = [sys.executable, "-u", self.ctx.args.training_script] + entrypoint.extend(self.ctx.args.training_script_args) + return entrypoint + + def _get_out_err_file(self, out=None, err=None): + if out and self.ctx.args.log_dir != "": + out = os.path.join(self.ctx.args.log_dir, out) + if err and self.ctx.args.log_dir != "": + err = os.path.join(self.ctx.args.log_dir, err) + return out, (err or out) + + def new_container(self, + entrypoint=None, + envs={}, + use_ctx_env=True, + out=None, + err=None): + c = Container( + entrypoint=(entrypoint or self._get_entrypoint()), + env=(self.ctx.get_envs() if use_ctx_env else {}), ) + c.outfile, c.errfile = self._get_out_err_file(out, err) + c.update_env(envs) + return c + + def add_container(self, + container=None, + entrypoint=None, + envs={}, + log_tag=None, + is_init=False): + if not is_init and log_tag is not None: + log_file = "{}.{}.{}.log".format(self.job.id, self.pod.name, + log_tag) + else: + log_file = None + + if not container: + container = self.new_container( + entrypoint=entrypoint, envs=envs, out=log_file, err=log_file) + + if is_init: + self.pod.add_init_container(container) + else: + self.pod.add_container(container) + + def pod_replicas(self): + ''' + how many process/container should be run in pod + ''' + + if self.ctx.args.nproc_per_node: + return int(self.ctx.args.nproc_per_node) + else: + return self.ctx.node.device.count + + def save_pod_log(self, info): + ''' + save_pod_log append *info* to the log file of pod.name + ''' + if not self.ctx.args.log_dir: + return + + f = os.path.join(self.ctx.args.log_dir, + '{}.{}.log'.format(self.job.id, self.pod.name)) + try: + os.makedirs(os.path.dirname(f), exist_ok=True) + with open(f, 'a+') as fd: + fd.write(str(info)) + except Exception as e: + self.ctx.logger.error("save log failed because {}".format(e)) diff --git a/python/paddle/distributed/run/controllers/master.py b/python/paddle/distributed/run/controllers/master.py new file mode 100644 index 0000000000000000000000000000000000000000..257ba3bad8da3c331ac303b7a3ee415461fd13b8 --- /dev/null +++ b/python/paddle/distributed/run/controllers/master.py @@ -0,0 +1,289 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
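The Controller base class above is the extension point of the launcher: a controller only has to say when it is eligible (enable) and how to populate the pod (build_pod); container creation, logging and teardown come from the base helpers. The subclass below is purely illustrative (LocalController is not part of this patch), and it would also have to be appended to the _controllers list in controllers/__init__.py before init() could select it.

# Illustrative sketch of a custom controller built on the helpers above.
from paddle.distributed.run.controllers.controller import Controller


class LocalController(Controller):
    @classmethod
    def enable(cls, ctx):
        return True  # always eligible; real controllers inspect ctx.args

    def build_pod(self):
        self.pod.replicas = self.pod_replicas()  # one process per device
        for i in range(self.pod.replicas):
            # log_tag=i writes to "<job.id>.<pod.name>.<i>.log" under --log_dir
            self.add_container(envs={"PADDLE_LOCAL_RANK": str(i)}, log_tag=i)
        return True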
+ +from paddle.distributed.run.utils.kv_client import KVClient +from paddle.distributed.run.utils.kv_server import KVServer + +import time +import sys +import six +import threading +import copy +import random + +ETCD_PROTOCAL = 'etcd://' + + +class Master(object): + ''' + Master is a distributed store designed to exchange info among nodes + ''' + + MAIN = "main" + STANDBY = "standby" + PATICIPANT = "participant" + + def __init__(self, ctx): + self.ctx = ctx + self.server = None + self.initialized = False + self.endpoint = None + + def stop(self): + raise NotImplementedError + + def sync_peers(self, prefix, key, value, size, rank=-1) -> (list, int): + raise NotImplementedError + + @classmethod + def factory(cls, ctx): + if ctx.args.master and ctx.args.master.startswith(ETCD_PROTOCAL): + return ETCDMaster(ctx) + else: + return HTTPMaster(ctx) + + +class HTTPMaster(Master): + def lazy_init(self): + if self.initialized: + return + + self.role = Master.PATICIPANT + + if self.ctx.args.master: + self.endpoint = self.ctx.args.master + ip, port = self.endpoint.split(':') + if ip in ['127.0.0.1', self.ctx.node.ip]: + time.sleep(2 * random.random()) + while not self.ctx.node.is_server_ready(ip, int(port)): + try: + self.server = KVServer(int(port)) + self.role = Master.MAIN + break + except Exception as e: + self.ctx.logger.warning("start master failed {}".format( + e)) + time.sleep(0.1) + continue + else: + port = self.ctx.node.get_free_port() + self.endpoint = "{}:{}".format(self.ctx.node.ip, port) + self.server = KVServer(port) + self.role = Master.MAIN + + print("Copy the following command to other nodes to run.") + cmd = [ + sys.executable.split('/')[-1], "-m", "paddle.distributed.run" + ] + cmd.extend(["--master", self.endpoint]) + cmd.extend(sys.argv[1:]) + print("-" * 80) + print(" ".join(cmd)) + print("-" * 80) + + if self.ctx.args.rank >= 0: + self.ctx.logger.warning( + "--rank set in the command may not be compatible with auto mode") + + if '127.0.0.1' in self.endpoint: + self.endpoint = self.endpoint.replace('127.0.0.1', self.ctx.node.ip) + self.client = KVClient(self.endpoint) + + self.initialized = True + + self._start_server() + + def _start_server(self): + if self.server and not self.server.started: + self.server.start() + self.ctx.logger.debug("KV server start at {}".format(self.endpoint)) + + def _stop_server(self): + if self.server and not self.server.stopped: + self.server.stop() + self.ctx.logger.debug("KV server stopped") + + def stop(self): + self._stop_server() + + def sync_peers(self, prefix, key, value, size, rank=-1) -> (list, int): + if size < 2: + return [value], 0 + + self.lazy_init() + + while not self.ctx.status.is_done(): + if self.client.wait_server_ready(timeout=5): + break + else: + self.ctx.logger.warning("master not ready") + time.sleep(0.1) + + # 'aaaaaa' makes sure the main pod (master server) sorts first and gets rank 0 + ky = 'aaaaaa' if rank < 0 and self.role == Master.MAIN else key + k = "{}/{}/{}".format(prefix, ky, rank) + + while not self.ctx.status.is_done(): + if not self.client.put(k, value): + self.ctx.logger.warning("put value failed") + time.sleep(0.1) + continue + + rjson = self.client.get_prefix(prefix) + self.ctx.logger.debug("sync peers {}".format(rjson)) + if rjson and len(rjson) == size: + if rank < 0: + keys = list(rjson.keys()) + keys.sort() + ret = [rjson[k] for k in keys] + idx = ret.index(value) + return ret, idx + else: + ret = [None] * size + for k, v in rjson.items(): + ret[int(k.split('/')[-1])] = v + return ret, rank + else: + time.sleep(0.5) + return [], 0 + + 
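Before the ETCD-backed variant, a minimal standalone illustration of the ordering convention used by HTTPMaster.sync_peers above (no KV server involved, pod names and payloads are made up): when ranks are not preassigned, the gathered entries are sorted by key, and the main pod publishes under 'aaaaaa' so it always sorts to position 0 and therefore gets rank 0.

```python
# Simulate what the KV store would hold after every pod published its info.
prefix = "/default/info"
# key path -> payload; the main pod swaps its pod name for 'aaaaaa'
entries = {
    "{}/{}/{}".format(prefix, "aaaaaa", -1): "main-pod-payload",
    "{}/{}/{}".format(prefix, "kzmnop", -1): "peer-1-payload",
    "{}/{}/{}".format(prefix, "bqrstu", -1): "peer-2-payload",
}

keys = sorted(entries.keys())           # alphabetical sort over the full key paths
peer_list = [entries[k] for k in keys]  # gathered values; rank == position in list
rank_of = {v: i for i, v in enumerate(peer_list)}

print(peer_list)                     # the main pod's payload comes first
print(rank_of["main-pod-payload"])   # 0
```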
+class ETCDMaster(Master): + def __init__(self, ctx): + super().__init__(ctx) + + if self.ctx.args.master: + # e.g. etcd://localhost:2379; drop the scheme prefix by slicing + # (str.strip removes a set of characters, not a prefix) + self.endpoint = self.ctx.args.master[len(ETCD_PROTOCAL):] + + import etcd3 + + host, port = self.endpoint.split(':') + self.client = etcd3.client(host=host, port=port) + + def sync_peers(self, prefix, key, value, size, rank=-1) -> (list, int): + ''' + sync_peers gathers all values for key under the scope prefix + the result is always sorted either by rank or by the alphabet of pod.name + ''' + path = "{}/{}/{}".format(prefix, key, rank) + + self.client.delete_prefix(prefix) + + self.ctx.logger.debug("sync path {} value {}".format(path, value)) + + while not self.ctx.status.is_done(): + self.client.put(path, six.b(value)) + + result = [i for i in self.client.get_prefix(prefix)] + result = copy.deepcopy(result) + self.ctx.logger.debug("sync peers {}".format(result)) + + if len(result) == size: + if rank < 0: + keys = [six.ensure_str(i[1].key) for i in result] + sorted_keys = [six.ensure_str(i[1].key) for i in result] + sorted_keys.sort() + values = [six.ensure_str(i[0]) for i in result] + ret = [values[keys.index(k)] for k in sorted_keys] + idx = ret.index(value) + return ret, idx + else: + ret = [None] * size + for v, k in result: + ii = int(six.ensure_str(k.key).split('/')[-1]) + if ii < 0: + self.ctx.logger.error( + "rank {} error in sync".format(ii)) + ret[ii] = six.ensure_str(v) + return ret, rank + else: + time.sleep(0.5) + + def register_heartbeat(self, job_id, pod_id, ttl=10): + if hasattr(self, 'heartbeat_prefix'): + self.ctx.logger.warning("Heartbeat already done") + return + + self.job_prefix = '/paddle/{}'.format(job_id) + self.heartbeat_prefix = '{}/heartbeat'.format(self.job_prefix) + + lease = self.client.lease(ttl) + + #self.client.delete_prefix(self.job_prefix) + + beat_path = "{}/{}".format(self.heartbeat_prefix, pod_id) + self.client.put(beat_path, six.b(pod_id), lease=lease) + + def _beat_watch(event): + self.ctx.status.restart() + + beat_watch = self.client.add_watch_prefix_callback( + self.heartbeat_prefix, _beat_watch) + + def _heartbeat(): + while not self.ctx.status.is_done(): + try: + lease.refresh() + if pod_id not in self.fetch_peer_alive(): + self.client.put(beat_path, six.b(pod_id), lease=lease) + self.ctx.logger.debug("Heartbeat register again") + except Exception as e: + self.ctx.logger.error("Heartbeat error {}".format(e)) + time.sleep(ttl / 2) + self.ctx.logger.debug("Heartbeat done") + self.client.cancel_watch(beat_watch) + + self.beat_thread = threading.Thread( + name='heartbeat', target=_heartbeat, daemon=True) + self.beat_thread.start() + + def fetch_peer_alive(self): + peer_alive = [ + six.ensure_str(i[0]) + for i in self.client.get_prefix(self.heartbeat_prefix) + ] + self.ctx.logger.debug("peer alive {}".format(peer_alive)) + return peer_alive + + def wait_peer_ready(self, replicas_min, replicas_max, timeout): + end = time.time() + timeout + while not self.ctx.status.is_done() and time.time() < end: + if len(self.fetch_peer_alive()) == replicas_max: + return (True, replicas_max) + else: + time.sleep(0.5) + + np = len(self.fetch_peer_alive()) + if np >= replicas_min and np <= replicas_max: + return (True, np) + else: + return (False, np) + + def restart_peer(self): + self.client.delete_prefix(self.heartbeat_prefix) + + def set_status(self, status): + assert self.client.put( + self.job_prefix, six.b(status), + lease=self.client.lease(600)), "set status failed {}".format(status) + + def get_status(self): + return 
six.ensure_str(self.client.get(self.job_prefix)[0] or '') + + def stop(self): + if hasattr(self, 'beat_thread'): + self.ctx.status.done() + # TODO(kuizhiqing) thread should exit + #self.beat_thread.join() diff --git a/python/paddle/distributed/run/controllers/ps.py b/python/paddle/distributed/run/controllers/ps.py new file mode 100644 index 0000000000000000000000000000000000000000..cc43c336cf1862fe075e5a4463b1f5b666a5005c --- /dev/null +++ b/python/paddle/distributed/run/controllers/ps.py @@ -0,0 +1,221 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .controller import Controller, ControleMode + +import json +import os, shutil + + +class PSController(Controller): + @classmethod + def enable(cls, ctx): + if ctx.args.mode == ControleMode.PS or ctx.args.server_num or len( + ctx.args.servers) > 0: + ctx.logger.debug("{} enabled".format(cls.__name__)) + ctx.args.mode = ControleMode.PS + return True + else: + return False + + def build_pod(self): + if self.ctx.args.servers and self.ctx.args.trainers: + self._build_pod_with_args() + else: + self._build_pod_with_master() + + def _build_pod_with_args(self): + if '127.0.0.1' in self.ctx.args.servers: + host = '127.0.0.1' + else: + host = self.ctx.node.ip + + server_endpoints = [s for s in self.ctx.args.servers.split(",")] + trainer_endpoints = [s for s in self.ctx.args.trainers.split(",")] + servers = [ + s for s in self.ctx.args.servers.split(",") if s.startswith(host) + ] + trainers = [ + s for s in self.ctx.args.trainers.split(",") if s.startswith(host) + ] + server_num = len(servers) + trainer_num = len(trainers) + + self.pod.replicas = server_num + trainer_num + + self.save_pod_log([server_endpoints, trainer_endpoints]) + + import tempfile + gloo_rendezvous_dir = tempfile.mkdtemp() + if os.path.exists(gloo_rendezvous_dir): + shutil.rmtree(gloo_rendezvous_dir) + + gloo_port = self.ctx.args.gloo_port + gloo_http = "{}:{}".format(server_endpoints[0].split(":")[0], gloo_port) + + _gloo_envs = { + "PADDLE_GLOO_RENDEZVOUS": "3", + "PADDLE_GLOO_FS_PATH": gloo_rendezvous_dir, + "PADDLE_GLOO_HTTP_ENDPOINT": gloo_http, + "PADDLE_WITH_GLOO": self.ctx.args.with_gloo + } + + for i in range(server_num): + e = { + "PADDLE_PSERVERS_IP_PORT_LIST": self.ctx.args.servers, + "PADDLE_TRAINER_ENDPOINTS": self.ctx.args.trainers, + "PADDLE_PORT": servers[i].split(":")[1], + "PADDLE_ROLE": "PSERVER", + "TRAINING_ROLE": "PSERVER", + "PADDLE_TRAINERS_NUM": "{}".format(len(trainer_endpoints)), + "POD_IP": self.ctx.node.ip, + } + e.update(_gloo_envs) + log_tag = "ps.{}".format(i) + self.add_container(envs=e, log_tag=log_tag) + + trainer_rank_offset = 0 + for s in trainer_endpoints: + if s.startswith(host): + break + else: + trainer_rank_offset += 1 + + for i in range(trainer_num): + e = { + "PADDLE_PSERVERS_IP_PORT_LIST": ",".join(server_endpoints), + "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints), + "PADDLE_PORT": trainers[i].split(":")[1], + "PADDLE_ROLE": "TRAINER", + "TRAINING_ROLE": 
"TRAINER", + "PADDLE_TRAINER_ID": "{}".format(i + trainer_rank_offset), + "PADDLE_TRAINERS_NUM": "{}".format(len(trainer_endpoints)), + "POD_IP": self.ctx.node.ip, + } + e.update(_gloo_envs) + log_tag = "trainer.{}".format(i) + self.add_container(envs=e, log_tag=log_tag) + + def _build_pod_with_master(self): + + self.pod.rank = self.ctx.args.rank + + server_num = self.ctx.args.server_num or 1 + servers = [ + "{}:{}".format(self.ctx.node.ip, p) + for p in self.ctx.node.get_free_ports(server_num) + ] + trainer_num = self.ctx.args.trainer_num or 1 + trainers = [ + "{}:{}".format(self.ctx.node.ip, p) + for p in self.ctx.node.get_free_ports(trainer_num) + ] + + data = json.dumps({ + 'name': self.pod.name, + 'rank': self.pod.rank, + 'servers': servers, + 'trainers': trainers, + 'dtype': self.ctx.node.device.dtype, + 'gloo_port': self.ctx.node.get_free_port(), + }) + + peer_list, rank = self.master.sync_peers( + '/{}/info'.format(self.job.id), self.pod.name, data, + self.job.replicas, self.pod.rank) + + self.ctx.logger.debug("sync peers done {}".format(peer_list)) + + peer_list = [json.loads(i) for i in peer_list] + + self.save_pod_log(peer_list) + + server_endpoints = [j for i in peer_list for j in i['servers']] + trainer_endpoints = [j for i in peer_list for j in i['trainers']] + #rank_offset = sum([i['replicas'] for i in peer_list[:rank]]) + + server_rank_offset = sum([len(i['servers']) for i in peer_list[:rank]]) + trainer_rank_offset = sum( + [len(i['trainers']) for i in peer_list[:rank]]) + + self.pod.rank = rank + + self.pod.replicas = server_num + trainer_num + + import tempfile + gloo_rendezvous_dir = tempfile.mkdtemp() + if os.path.exists(gloo_rendezvous_dir): + shutil.rmtree(gloo_rendezvous_dir) + + gloo_port = peer_list[0]['gloo_port'] + gloo_http = "{}:{}".format(server_endpoints[0].split(":")[0], gloo_port) + + _gloo_envs = { + "PADDLE_GLOO_RENDEZVOUS": "3", + "PADDLE_GLOO_FS_PATH": gloo_rendezvous_dir, + "PADDLE_GLOO_HTTP_ENDPOINT": gloo_http, + "PADDLE_WITH_GLOO": self.ctx.args.with_gloo + } + + for i in range(server_num): + e = { + "PADDLE_PSERVERS_IP_PORT_LIST": ",".join(server_endpoints), + "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints), + "PADDLE_PORT": + server_endpoints[i + server_rank_offset].split(":")[1], + "PADDLE_ROLE": "PSERVER", + "TRAINING_ROLE": "PSERVER", + "PADDLE_TRAINERS_NUM": "{}".format(len(trainer_endpoints)), + "POD_IP": self.ctx.node.ip, + } + e.update(_gloo_envs) + log_tag = "ps.{}".format(i) + self.add_container(envs=e, log_tag=log_tag) + + for i in range(trainer_num): + e = { + "PADDLE_PSERVERS_IP_PORT_LIST": ",".join(server_endpoints), + "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints), + "PADDLE_PORT": + trainer_endpoints[i + trainer_rank_offset].split(":")[1], + "PADDLE_ROLE": "TRAINER", + "TRAINING_ROLE": "TRAINER", + "PADDLE_TRAINER_ID": "{}".format(i + trainer_rank_offset), + "PADDLE_TRAINERS_NUM": "{}".format(len(trainer_endpoints)), + "POD_IP": self.ctx.node.ip, + } + e.update(_gloo_envs) + log_tag = "trainer.{}".format(i) + self.add_container(envs=e, log_tag=log_tag) + ''' NEW VERSION + for i in range(server_num): + e = { + "PADDLE_PSERVER_ENDPOINTS": ",".join(server_endpoints), + "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints), + "PADDLE_ROLE": "PSERVER", + "PADDLE_RANK": "{}".format(i + server_rank_offset), + } + log_tag = "ps.{}".format(i) + self.add_container(envs=e, log_tag=log_tag) + + for i in range(trainer_num): + e = { + "PADDLE_PSERVER_ENDPOINTS": ",".join(server_endpoints), + "PADDLE_TRAINER_ENDPOINTS": 
",".join(trainer_endpoints), + "PADDLE_ROLE": "TRAINER_CPU", + "PADDLE_RANK": "{}".format(i + trainer_rank_offset), + } + log_tag = "trainer.{}".format(i) + self.add_container(envs=e, log_tag=log_tag) + ''' diff --git a/python/paddle/distributed/run/job/__init__.py b/python/paddle/distributed/run/job/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..66d2abbce21ebc72cda5373a3d3a242c077beaa8 --- /dev/null +++ b/python/paddle/distributed/run/job/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .pod import Pod +from .job import Job +from .container import Container +from .status import Status + +__all__ = [ + 'Pod', + 'Job', + 'Container', + 'Status', +] diff --git a/python/paddle/distributed/run/job/container.py b/python/paddle/distributed/run/job/container.py new file mode 100644 index 0000000000000000000000000000000000000000..651932d6c88378034d7ab9cb05bac00ee3ea7ddf --- /dev/null +++ b/python/paddle/distributed/run/job/container.py @@ -0,0 +1,179 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from collections import OrderedDict +from paddle.distributed.run.utils.process_context import ProcessContext + +from .status import Status + +import os, copy, sys +import time + + +class Container(object): + ''' + TODO(kuizhiqing) A container can be run by process/thread or just a callable function + ''' + + def __init__(self, entrypoint=[], rank=-1, env={}): + self._entrypoint = entrypoint + self._rank = rank + self._out = None + self._err = None + self._env = env + self._proc = None + + self._retry: int = 3 + self._grace_period = 10 + + self._log_handler = None + + @property + def entrypoint(self): + return self._entrypoint + + @entrypoint.setter + def entrypoint(self, entry): + self._entrypoint = entry + + @property + def rank(self): + return self._rank + + @rank.setter + def rank(self, r): + self._rank = r + + @property + def outfile(self): + return self._out + + @outfile.setter + def outfile(self, out): + self._out = out + + @property + def errfile(self): + return self._err + + @errfile.setter + def errfile(self, err): + self._err = err + + def update_env(self, env={}, **kwargs): + env = {k: v for k, v in env.items() if isinstance(v, str)} + self._env.update(env) + + kwargs = {k: v for k, v in kwargs.items() if isinstance(v, str)} + self._env.update(kwargs) + + def _get_fd(self, pth): + if not pth: + return None + + try: + d = os.path.dirname(pth) + if not os.path.isdir(d): + os.makedirs(d, exist_ok=True) + return open(pth, 'w') + except: + return None + + def start(self, timeout=-1): + end = time.time() + timeout + + if self._proc and self._proc.alive(): + return True + + self._stdout = self._get_fd(self._out) or sys.stdout + if self._out == self._err: + self._stderr = self._stdout + elif self._err: + self._stderr = self._get_fd(self._err) or sys.stderr + + self._proc = ProcessContext( + self._entrypoint, env=self._env, out=self._stdout, err=self._stderr) + self._proc.start() + + while timeout > 0 and time.time() < end: + if self._proc.alive(): + time.sleep(0.1) + continue + if self._proc.exit_code() == 0: + return True + return False + + def terminate(self, force=False): + if self._log_handler: + self._log_handler.close() + self._log_handler = None + + if self._proc and self._proc.alive(): + return self._proc.terminate(force) + + def wait(self, timeout=None): + self._proc.wait(timeout) + + def exit_code(self): + return self._proc.exit_code() if self._proc else -1 + + def status(self): + if not self._proc: + return Status.UNINIT + if self._proc.alive(): + return Status.RUNNING + elif self._proc.exit_code() == 0: + return Status.COMPLETED + else: + return Status.FAILED + + def __str__(self): + return 'Container rank {} status {} cmd {} code {} log {} \nenv {}'.format( + self._rank, + self.status(), + self._entrypoint, + self.exit_code(), + self.errfile, + self._env, ) + + def logs(self, fn=None, offset=0, whence=1, lines=1000): + if not self._log_handler: + self._log_handler = open(self._out) + + if fn is None: + fn = sys.stdout + + self._log_handler.seek(offset, whence) + + try: + idx = 0 + for line in self._log_handler: + fn.write(line) + idx += 1 + if idx > lines: + break + finally: + return self._log_handler.tell() + + def tail(self, length=3000): + if not self._log_handler: + self._log_handler = open(self._out) + + self._log_handler.seek(0, 2) + ed = self._log_handler.tell() + + if ed > length: + self.logs(offset=ed - length, whence=0) + else: + self.logs(offset=0, whence=0) diff --git a/python/paddle/distributed/run/job/job.py b/python/paddle/distributed/run/job/job.py new 
file mode 100644 index 0000000000000000000000000000000000000000..3469ed862576faed3bd7546710927f638b8fe0d5 --- /dev/null +++ b/python/paddle/distributed/run/job/job.py @@ -0,0 +1,80 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class JobMode: + COLLECTIVE = 'collective' + PS = 'ps' + HETER = 'heter' + + +class Job(object): + def __init__(self, id='default', mode=JobMode.COLLECTIVE, np="1"): + self._mode = mode + self._id = id + + self._replicas = 0 + self._replicas_min = self._replicas + self._replicas_max = self._replicas + self._elastic = False + + self.set_replicas(str(np)) + + def __str__(self): + return "Job: {}, mode {}, replicas {}[{}:{}], elastic {}".format( + self.id, self.mode, self._replicas, self._replicas_min, + self._replicas_max, self.elastic) + + @property + def mode(self): + return self._mode + + @property + def id(self): + return self._id + + @property + def elastic(self): + return self._elastic + + @property + def replicas(self): + return self._replicas + + @property + def replicas_min(self): + return self._replicas_min + + @property + def replicas_max(self): + return self._replicas_max + + @replicas.setter + def replicas(self, replicas): + self._replicas = replicas + + def set_replicas(self, np: str): + np = str(np) if np else '1' + + if ':' in np: + nps = np.split(':') + self._replicas_min, self._replicas_max = int(nps[0]), int(nps[1]) + self._replicas = self._replicas_max # default to max + + self._elastic = True + else: + self._replicas = int(np) + self._replicas_min, self._replicas_max = self._replicas, self._replicas + + self._elastic = False diff --git a/python/paddle/distributed/run/job/pod.py b/python/paddle/distributed/run/job/pod.py new file mode 100644 index 0000000000000000000000000000000000000000..f7c31edce1d552befac3a6f54e5e79c326b31c67 --- /dev/null +++ b/python/paddle/distributed/run/job/pod.py @@ -0,0 +1,185 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
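The replica specification accepted by Job.set_replicas above follows the `--np` convention of either a fixed count ("4") or an elastic "min:max" range ("2:8"), defaulting to the maximum in the elastic case. The snippet below restates that parse rule as a small standalone function with sample values; it is a sketch for clarity, not part of the Job class.

```python
def parse_np(np_spec):
    """Return (replicas, replicas_min, replicas_max, elastic) for an --np value."""
    np_spec = str(np_spec) if np_spec else '1'
    if ':' in np_spec:
        lo, hi = (int(v) for v in np_spec.split(':'))
        return hi, lo, hi, True   # elastic range: default replicas to the maximum
    n = int(np_spec)
    return n, n, n, False

print(parse_np("4"))    # (4, 4, 4, False)
print(parse_np("2:8"))  # (8, 2, 8, True)
```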
+ +from collections import OrderedDict +from .container import Container + +from .status import Status + +import random +import time + + +class PodSepc(object): + def __init__(self): + self._name = ''.join( + random.choice('abcdefghijklmnopqrstuvwxyz') for _ in range(6)) + + # by controller + self._init_containers: List[Container] = [] + self._containers: List[Container] = [] + + #self.resource: Resource = None + #self.status: Status = None + + self._rank = -1 + self._init_timeout = 120 # 2 min timeout for each init container + self._restart = -1 + self._replicas = 0 # number of containers + self._exit_code = 0 + + +class Pod(PodSepc): + def __init__(self): + super().__init__() + + def __str__(self): + return "Pod: {}, replicas {}, status {}".format(self.name, + self.replicas, + self.status()) + + def failed_container(self): + for c in self._containers: + if c.status() == Status.FAILED: + return c + return None + + @property + def name(self): + return self._name + + @property + def replicas(self): + return self._replicas + + @replicas.setter + def replicas(self, r): + self._replicas = r + + @property + def rank(self): + return self._rank + + @rank.setter + def rank(self, r): + self._rank = r + + @property + def restart(self): + return self._restart + + @property + def containers(self): + return self._containers + + def add_container(self, c): + c.rank = len(self._containers) + self._containers.append(c) + + @property + def init_containers(self): + return self._init_containers + + def add_init_container(self, c): + c.rank = len(self._init_containers) + self._init_containers.append(c) + + @property + def exit_code(self): + for c in self._containers: + if c.exit_code() != 0: + return c.exit_code() + return 0 + + def deploy(self): + for i in self._init_containers: + i.start(self._init_timeout) + + for c in self._containers: + c.start() + + self._restart += 1 + + def stop(self, sigint=0): + for c in self._containers: + force = True if sigint == 9 else False + c.terminate(force) + + def join(self): + for c in self._containers: + c.wait(None) + + def status(self): + if self.is_failed(): + return Status.FAILED + + if self.is_completed(): + return Status.COMPLETED + + return Status.READY + + def reset(self): + self._init_containers = [] + self._containers = [] + + def is_failed(self): + for c in self._containers: + if c.status() == Status.FAILED: + return True + return False + + def is_completed(self): + for c in self._containers: + if c.status() != Status.COMPLETED: + return False + return True + + def logs(self, idx=None): + if idx is None: + if self.failed_container(): + self.failed_container().logs() + else: + self._containers[0].logs() + else: + self._containers[idx].logs() + + def tail(self, idx=None): + if idx is None: + if self.failed_container(): + self.failed_container().tail() + else: + self._containers[0].tail() + else: + self._containers[idx].tail() + + def watch(self, + all_list=[Status.COMPLETED], + any_list=[Status.FAILED], + interval=1, + timeout=-1): + ''' + watch return if any container status in any_list + or all container status in all_list + ''' + end = time.time() + timeout + while timeout < 0 or time.time() < end: + for c in self._containers: + if c.status() in any_list: + return c.status() + + s = [c.status() for c in self._containers] + if len(set(s)) == 1 and s[0] in all_list: + return s[0] + + time.sleep(interval) diff --git a/python/paddle/distributed/run/job/status.py b/python/paddle/distributed/run/job/status.py new file mode 100644 index 
0000000000000000000000000000000000000000..ae10c5adb6cbfe4713370a2f01c74569bfe98182 --- /dev/null +++ b/python/paddle/distributed/run/job/status.py @@ -0,0 +1,24 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class Status(object): + UNINIT = "uninit" + READY = "ready" + RUNNING = "running" + FAILED = "failed" + TERMINATING = "terminating" + RESTARTING = "restarting" + UNKNOWN = "unknown" + COMPLETED = "completed" diff --git a/python/paddle/distributed/run/plugins/__init__.py b/python/paddle/distributed/run/plugins/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ec91402a7aad359c9860cf737a78cb7c1f1375d1 --- /dev/null +++ b/python/paddle/distributed/run/plugins/__init__.py @@ -0,0 +1,50 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import six + +__all__ = [] + + +def log(ctx): + ctx.logger.info("----------- Configuration ----------------------") + for arg, value in sorted(six.iteritems(vars(ctx.args))): + ctx.logger.info("%s: %s" % (arg, value)) + ctx.logger.info("--------------------------------------------------") + + +def process_args(ctx): + # reset device by args + #argdev = ctx.args.gpus or ctx.args.xpus or ctx.args.npus + argdev = ctx.args.devices + if argdev: + ctx.node.device.labels = argdev.split(',') + ctx.node.device.count = len(ctx.node.device.labels) + ctx.logger.debug('Device reset by args {}'.format(argdev)) + + +def collective_compatible(ctx): + if 'PADDLE_TRAINER_ENDPOINTS' in ctx.envs: + ctx.master = ctx.envs['PADDLE_TRAINER_ENDPOINTS'].split(',')[0] + if 'DISTRIBUTED_TRAINER_ENDPOINTS' in ctx.envs: + ctx.master = ctx.envs['DISTRIBUTED_TRAINER_ENDPOINTS'].split(',')[0] + + +def rewrite_host_ip(ctx): + if ctx.args.host is not None and "." in ctx.args.host: + ctx.logger.warning('Host ip reset to {}'.format(ctx.args.host)) + ctx.node.ip = ctx.args.host + + +enabled_plugins = [collective_compatible, rewrite_host_ip, process_args, log] diff --git a/python/paddle/distributed/run/plugins/ip.py b/python/paddle/distributed/run/plugins/ip.py new file mode 100644 index 0000000000000000000000000000000000000000..0809ed5864da9f3bea29235621e7c29b75823391 --- /dev/null +++ b/python/paddle/distributed/run/plugins/ip.py @@ -0,0 +1,30 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import socket + + +def get_local_ip(ctx): + _, ip = _get_host_name_ip() + ctx.args.host = ip + ctx.envs["POD_IP"] = ip + + +def _get_host_name_ip(): + try: + host_name = socket.gethostname() + host_ip = socket.gethostbyname(host_name) + return host_name, host_ip + except: + return None diff --git a/python/paddle/distributed/run/utils/kv_client.py b/python/paddle/distributed/run/utils/kv_client.py new file mode 100644 index 0000000000000000000000000000000000000000..e19195412268a5c309d65a4a61e005ea512d685b --- /dev/null +++ b/python/paddle/distributed/run/utils/kv_client.py @@ -0,0 +1,94 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import requests +import time + + +class KVClient(object): + def __init__(self, endpoint='localhost:2379'): + self.endpoint = endpoint if endpoint.startswith( + "http://") else "http://{}".format(endpoint) + + def put(self, key, value): + key = key if key.startswith('/') else "/{}".format(key) + u = "{}{}".format(self.endpoint, key) + try: + r = requests.post(u, data=value, timeout=3) + if r.status_code == 200: + return True + else: + return False + except: + return False + + def get(self, key): + key = key if key.startswith('/') else "/{}".format(key) + u = "{}{}".format(self.endpoint, key) + try: + r = requests.get(u, timeout=3) + if r.status_code == 200: + ret = r.json() + return ret.get(key, '') + else: + return "error" + except: + return "" + + def get_prefix(self, key): + key = key if key.startswith('/') else "/{}".format(key) + u = "{}{}".format(self.endpoint, key) + try: + r = requests.get(u, timeout=3) + if r.status_code == 200: + return r.json() + except: + return "" + + def delete(self, key): + key = key if key.startswith('/') else "/{}".format(key) + u = "{}{}".format(self.endpoint, key) + try: + r = requests.delete(u, timeout=3) + if r.status_code == 200: + return True + else: + return False + except: + return False + + def wait_server_ready(self, timeout=3): + end = time.time() + timeout + while time.time() < end: + if self.get("/healthy") == "ok": + return True + + +if __name__ == '__main__': + cli = PKVClient("http://localhost:8090") + data = {"/workers/1": "rank1", "/workers/2": "rank2"} + for k, v in data.items(): + cli.put(k, v) + x = cli.get_prefix("/workers") + print(x) + for k, v in data.items(): + assert x[k] == v + + cli.put("key", "value") + print(cli.get("key")) + assert cli.get("key") == "value" + cli.delete("key") + 
print(cli.get("/key")) + print(cli.get("/healthy")) + assert cli.get("/healthy") == "ok" diff --git a/python/paddle/distributed/run/utils/kv_server.py b/python/paddle/distributed/run/utils/kv_server.py new file mode 100644 index 0000000000000000000000000000000000000000..2d7ae15f13d636f05536dbdd8f35434bb7c3bf97 --- /dev/null +++ b/python/paddle/distributed/run/utils/kv_server.py @@ -0,0 +1,121 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from http.server import HTTPServer +import http.server as SimpleHTTPServer + +from multiprocessing import Process + +import threading +import json + + +class KVHandler(SimpleHTTPServer.SimpleHTTPRequestHandler): + def do_GET(self): + with self.server.kv_lock: + ret = {} + for k, v in self.server.kv.items(): + if k.startswith(self.path): + ret[k] = v.decode(encoding="utf-8") + if ret: + self.output(200, json.dumps(ret).encode("utf-8")) + else: + self.output(404) + + def do_PUT(self): + self.do_POST() + + def do_POST(self): + content_length = int(self.headers['Content-Length'] or 0) + try: + value = self.rfile.read(content_length) + with self.server.kv_lock: + self.server.kv[self.path] = value + self.output(200) + return + except: + self.output(500) + + def do_DELETE(self): + with self.server.kv_lock: + if self.path in self.server.kv: + del self.server.kv[self.path] + self.output(200) + else: + self.output(404) + + def output(self, code, value=''): + self.send_response(code) + self.send_header("Content-Length", len(value)) + self.send_header("Content-Type", "application/json; charset=utf8") + self.end_headers() + if value: + self.wfile.write(value) + + def log_message(self, format, *args): + return + + +class KVServer(HTTPServer, object): + def __init__(self, port): + super(KVServer, self).__init__(('', port), KVHandler) + self.kv_lock = threading.Lock() + self.kv = {'/healthy': b'ok'} + self.port = port + self.stopped = False + self.started = False + + def start(self): + self.listen_thread = threading.Thread(target=self.serve_forever) + self.listen_thread.start() + self.started = True + + def stop(self): + self.shutdown() + self.listen_thread.join() + self.server_close() + self.stopped = True + + +class PKVServer(): + def __init__(self, port): + self._server = KVServer(port) + + def start(self): + self.proc = Process(target=self._server.start) + self.proc.daemon = True + self.proc.start() + + def stop(self): + self._server.stop() + self.proc.join() + + @property + def started(self): + return self._server.started + + @property + def stopped(self): + return self._server.stopped + + +if __name__ == '__main__': + #kv = PKVServer(8090) + kv = KVServer(8090) + kv.start() + import time + + #print("serve at 8090 for 600 s") + + time.sleep(600) diff --git a/python/paddle/distributed/run/utils/process_context.py b/python/paddle/distributed/run/utils/process_context.py new file mode 100644 index 0000000000000000000000000000000000000000..4d6fa8de794ff07874cc788f0abf0a283c066ae7 --- /dev/null +++ 
b/python/paddle/distributed/run/utils/process_context.py @@ -0,0 +1,83 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import subprocess +import os, sys, signal, time + + +class ProcessContext(object): + def __init__(self, + cmd, + env=os.environ, + out=sys.stdout, + err=sys.stderr, + group=True, + preexec_fn=None): + self._cmd = cmd + self._env = env + self._preexec_fn = preexec_fn + self._stdout = out + self._stderr = err + self._group = group if os.name != 'nt' else False + self._proc = None + self._code = None + + def _start(self): + pre_fn = os.setsid if self._group else None + self._proc = subprocess.Popen( + self._cmd, + env=self._env, + stdout=self._stdout, + stderr=self._stderr, + preexec_fn=self._preexec_fn or pre_fn) + + def _close_std(self): + try: + if not self._stdout.isatty(): + self._stdout.close() + + if not self._stderr.isatty(): + self._stderr.close() + except: + pass + + def alive(self): + return self._proc and self._proc.poll() is None + + def exit_code(self): + return self._proc.poll() if self._proc else None + + def start(self): + self._start() + + def terminate(self, force=False, max_retry=3): + for i in range(max_retry): + if self.alive(): + if self._group: + os.killpg(os.getpgid(self._proc.pid), signal.SIGTERM) + else: + self._proc.terminate() + time.sleep(0.2) + else: + break + + if force and self.alive(): + self._proc.kill() + + self._close_std() + + return self.alive() + + def wait(self, timeout=None): + self._proc.wait(timeout) diff --git a/python/paddle/distributed/sharding/group_sharded.py b/python/paddle/distributed/sharding/group_sharded.py index 2fdb20600f673b21e7cabd6ffe35c545b045bb5d..6fd4caa7b4a5c41e73fcf95ac50d0253bb3e7c79 100644 --- a/python/paddle/distributed/sharding/group_sharded.py +++ b/python/paddle/distributed/sharding/group_sharded.py @@ -39,19 +39,20 @@ def group_sharded_parallel(model, segment_size=2**20, sync_comm=False): """ - Use this module to configure and wrap up the parameters of the group shared module. + Use group_sharded_parallel can perform group shared configuration on the model, optimizer and GradScaler. Level has three string options, 'os', 'os_g' and 'p_g_os' corresponds to three different usage scenarios: optimizer state segmentation, optimizer state + gradient segmentation, and parameter + gradient + optimizer state segmentation. + Usually, optimizer state + gradient segmentation is actually a re optimization of optimizer state segmentation, so optimizer state + gradient segmentation can be used to realize optimizer state segmentation. Args: model (Layer): The layer to be wrapped with group_sharded_parallel. optimizer (Optimizer): The optimizer to be wrapped with group_sharded_parallel. level (str): The different level of the group sharded. Such as `os`, `os_g`, `p_g_os`. - scaler (GradScaler, optional): The scaler to be wrapped with group_sharded_parallel. Defaults to None. - group (Group, optional): The group instance. 
Defaults to None.d - offload (bool, optional): Whether to perform optimizer state and gradient transfer CPU. Defaults to False. - sync_buffers (bool, optional): Whether to broadcast model buffers. Defaults to False. - buffer_max_size (int, optional): The max size of the buffer used to integrate gradient in `os_g`. Defaults to 2**23. - segment_size (int, optional): The smallest size of parameter to be sharded in `p_g_os`. Defaults to 2**20. - sync_comm (bool, optional): Whether to use synchronous communication, only in `p_g_os` used. Defaults to False. + scaler (GradScaler, optional): If AMP is used, you need to pass GradScaler. Defaults to None, indicating that GradScaler is not used. + group (Group, optional): The group instance. Defaults to None, indicating that the default environment group is used. + offload (bool, optional): Whether to use the offload function. Defaults to False, which means that the offload function is not used. + sync_buffers (bool, optional): Whether to broadcast model buffers. It is generally used when there are registered model buffers. Defaults to False, indicating that model buffers are not used. + buffer_max_size (int, optional): The max size of the buffer used to integrate gradient in `os_g`. The larger the size, the more GPU memory will be used. Defaults to 2**23, which means that the dimension of the buffer is 2**23. + segment_size (int, optional): The smallest size of parameter to be sharded in `p_g_os`. Defaults to 2**20, indicating that the dimension of the minimum segmented parameter is 2**20. + sync_comm (bool, optional): Whether to use synchronous communication, only in `p_g_os` used. Defaults to False, indicating that asynchronous communication is used. Returns: model: A wrapper for group sharded given model. @@ -101,7 +102,7 @@ def group_sharded_parallel(model, def check_dtype(param): return param.dtype == paddle.float16 - params_fp16 = filter(check_dtype, model.parameters()) + params_fp16 = list(filter(check_dtype, model.parameters())) if scaler is None and len(params_fp16) > 0: raise ValueError("Please enter the correct scaler.") # convert model/optimizer/scaler @@ -146,10 +147,13 @@ def save_group_sharded_model(model, output, optimizer=None): """ Group sharded encapsulated model and optimizer state saving module. + .. note:: + If using save_group_sharded_model saves the model. When loading again, you need to set the model or optimizer state before using group_sharded_parallel. + Args: model (Layer): A wrapper for group sharded given model. output (str): Save directory. - optimizer (Optimizer, optional): Group sharded encapsulated optimizer. Defaults to None. + optimizer (Optimizer, optional): Group sharded encapsulated optimizer. Defaults to None, indicating that the optimizer state is not saved. Examples: .. 
code-block:: python @@ -182,7 +186,7 @@ def save_group_sharded_model(model, output, optimizer=None): optimizer.clear_grad() # save model and optimizer state_dict - save_group_sharded_model(model, optimizer,output=output_dir) + save_group_sharded_model(model, optimizer, output=output_dir) """ logger_.info( "==========Begin to save group sharded model and optimizer==========") diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 997075590e5cf97241188b847c0c5b5036ecee59..fb9e8d8ece100baa3ed7c65a8dc495aa12c254ff 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -226,3 +226,7 @@ if core.is_compiled_with_npu(): atexit.register(core.npu_finalize) # NOTE(Aurelius84): clean up ExecutorCacheInfo in advance manually. atexit.register(core.clear_executor_cache) +# NOTE(Aganlengzi): clean up KernelFactory in advance manually. +atexit.register(core.clear_kernel_factory) +# NOTE(wangran16): clean up DeviceManger in advance manually. +atexit.register(core.clear_device_manager) diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py index 80d2ccb0d5ca6fcb3a802014a860bfb2ff9b3400..9dba5d658dfc9f480c5e668be2c34b2bcb673078 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py @@ -173,6 +173,9 @@ if core.is_compiled_with_xpu(): elif core.is_compiled_with_npu(): _, _, _sys_unsupported_fp16_list = core.op_supported_infos( 'NPU', core.VarDesc.VarType.FP16) +elif core.is_compiled_with_mlu(): + _, _, _sys_unsupported_fp16_list = core.op_supported_infos( + 'MLU', core.VarDesc.VarType.FP16) else: _, _, _sys_unsupported_fp16_list = core.op_supported_infos( 'GPU', core.VarDesc.VarType.FP16) diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py index 97b4116826a2a0e809af4a4129a0e953fd607b07..d614630b3db12d47d7b6790e40edefbd0402e384 100644 --- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py +++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py @@ -979,8 +979,6 @@ class PostTrainingQuantization(object): if op.type in ( self._quantizable_op_type + self._out_scale_op_list): out_var_names = _get_op_output_var_names(op) - assert len(out_var_names) == 1, "Post training " + \ - "quantization only support one output for " + op.type for var_name in out_var_names: analysis_and_save_info(op, var_name) diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index efa000274d01a601d6949c16803020272da918b4..afca617b6dd82b9106dbbb4a28e89090b0ad5278 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -59,6 +59,7 @@ _out_scale_op_list = [ "tanh", "prelu", "swish", + "dropout", "softmax", "batch_norm", "layer_norm", @@ -68,6 +69,8 @@ _out_scale_op_list = [ "transpose2", "concat", "elementwise_mul", + "elementwise_pow", + "elementwise_sub", "scale", "slice", "hard_swish", @@ -81,8 +84,54 @@ _out_scale_op_list = [ "flatten2", "transpose", "pad2d", + "pad3d", "reshape", - "layer_norm", + "split", + "flatten_contiguous_range", + "squeeze", + "squeeze2", + "nearest_interp_v2", + "fill_constant_batch_size_like", + "bilinear_interp", + "bilinear_interp_v2", + 
"arg_max", + "abs", + "assign", + "cast", + "clip", + "box_coder", + "crop", + "cumsum", + "equal", + "expand_v2", + "fill_any_like", + "fill_constant", + "gelu", + "instance_norm", + "lookup_table", + "lookup_table_v2", + "norm", + "p_norm", + "pow", + "reduce_mean", + "stack", + "top_k_v2", + "unsqueeze", + "unsqueeze2", + "logical_and", + "logical_not", + "meshgrid", + "roi_align", + "strided_slice", + "where", + "grid_sampler", + "tile", + "group_norm", + "reduce_sum", + "square", + "softplus", + "gather", + "shuffle_channel", ] # list op real input and output names, to avoid processing input such as AxisTensor. @@ -119,7 +168,7 @@ _op_real_in_out_name = { "relu": [["X"], ["Out"]], "relu6": [["X"], ["Out"]], "leaky_relu": [["X"], ["Out"]], - "prelu": [["X"], ["Out"]], + "prelu": [["X", "Alpha"], ["Out"]], "tanh": [["X"], ["Out"]], "swish": [["X"], ["Out"]], "dropout": [["X"], ["Out"]], @@ -127,16 +176,59 @@ _op_real_in_out_name = { "layer_norm": [["X"], ["Y"]], "sigmoid": [["X"], ["Out"]], "elementwise_mul": [["X", "Y"], ["Out"]], + "elementwise_pow": [["X", "Y"], ["Out"]], "scale": [["X"], ["Out"]], "hard_swish": [["X"], ["Out"]], "hard_sigmoid": [["X"], ["Out"]], "gru": [["Input", "Weight"], ["Hidden"]], "lstm": [["Input", "Weight"], ["Hidden"]], "pad2d": [["X"], ["Out"]], + "pad3d": [["X"], ["Out"]], "flatten": [["X"], ["Out"]], "flatten2": [["X"], ["Out"]], "unsqueeze2": [["X"], ["Out"]], - "flatten_contiguous_range": [['X'], ["Out"]], + "unsqueeze2": [["X"], ["Out"]], + "flatten_contiguous_range": [["X"], ["Out"]], + "split": [["X"], ["Out"]], + "squeeze2": [["X"], ["Out"]], + "nearest_interp_v2": [["X"], ["Out"]], + "bilinear_interp": [["X"], ["Out"]], + "bilinear_interp_v2": [["X"], ["Out"]], + "fill_constant_batch_size_like": [["Input"], ["Out"]], + "arg_max": [["X"], ["Out"]], + "abs": [["X"], ["Out"]], + "assign": [["X"], ["Out"]], + "cast": [["X"], ["Out"]], + "clip": [["X"], ["Out"]], + "box_coder": [["PriorBox"], ["OutputBox"]], + "crop": [["X"], ["Out"]], + "cumsum": [["X"], ["Out"]], + "expand_v2": [["X"], ["Out"]], + "fill_any_like": [["X"], ["Out"]], + "fill_constant": [[], ["Out"]], + "gelu": [["X"], ["Out"]], + "instance_norm": [["X"], ["Out"]], + "lookup_table": [["W", "Ids"], ["Out"]], + "lookup_table_v2": [["W", "Ids"], ["Out"]], + "norm": [["X"], ["Norm"]], + "p_norm": [["X"], ["Out"]], + "pow": [["X"], ["Out"]], + "reduce_mean": [["X"], ["Out"]], + "stack": [["X"], ["Y"]], + "top_k_v2": [["X"], ["Out", "Indices"]], + "logical_and": [["X", "Y"], ["Out"]], + "logical_not": [["X"], ["Out"]], + "meshgrid": [["X"], ["Out"]], + "roi_align": [["X", "ROIs"], ["Out"]], + "strided_slice": [["Input"], ["Out"]], + "where": [["Condition", "X", "Y"], ["Out"]], + "grid_sampler": [["X", "Grid"], ["Output"]], + "tile": [["X"], ["Out"]], + "group_norm": [["X"], ["Y", "Mean", "Variance"]], + "reduce_sum": [["X"], ["Out"]], + "square": [["X"], ["Out"]], + "softplus": [["X"], ["Out"]], + "shuffle_channel": [["X"], ["Out"]], } _conv_ops = ['conv2d', 'depthwise_conv2d', 'conv2d_transpose'] @@ -1797,14 +1889,93 @@ class AddQuantDequantPass(object): quantized ops's inputs. 
""" _supported_quantizable_op_type = [ - "pool2d", "elementwise_add", "concat", "softmax", "argmax", "transpose", - "equal", "gather", "greater_equal", "greater_than", "less_equal", - "less_than", "mean", "not_equal", "reshape", "reshape2", - "bilinear_interp", "nearest_interp", "trilinear_interp", "slice", - "squeeze", "elementwise_sub", "mul", "matmul", "relu", "relu6", - "leaky_relu", "tanh", "swish", "scale", "transpose", "transpose2", - "sigmoid", "pad2d", "flatten", "flatten2", "batch_norm", "layer_norm", - "matmul_v2" + "pool2d", + "elementwise_add", + "concat", + "softmax", + "argmax", + "transpose", + "equal", + "gather", + "greater_equal", + "greater_than", + "less_equal", + "less_than", + "mean", + "not_equal", + "reshape", + "reshape2", + "dropout", + "bilinear_interp", + "nearest_interp", + "trilinear_interp", + "slice", + "squeeze", + "elementwise_sub", + "mul", + "matmul", + "relu", + "relu6", + "leaky_relu", + "tanh", + "swish", + "scale", + "transpose", + "transpose2", + "sigmoid", + "pad2d", + "flatten", + "flatten2", + "batch_norm", + "layer_norm", + "matmul_v2", + "split", + "flatten_contiguous_range", + "squeeze2", + "nearest_interp_v2", + "bilinear_interp", + "bilinear_interp_v2", + "fill_constant_batch_size_like", + "arg_max", + "abs", + "assign", + "cast", + "clip", + "box_coder", + "crop", + "cumsum", + "elementwise_mul", + "elementwise_pow", + "expand_v2", + "fill_any_like", + "fill_constant", + "gelu", + "hard_sigmoid", + "hard_swish", + "instance_norm", + "lookup_table", + "lookup_table_v2", + "norm", + "p_norm", + "pad3d", + "pow", + "prelu", + "reduce_mean", + "unsqueeze", + "unsqueeze2", + "logical_and", + "logical_not", + "meshgrid", + "roi_align", + "strided_slice", + "where", + "grid_sampler", + "tile", + "group_norm", + "reduce_sum", + "square", + "softplus", + "shuffle_channel", ] # To be compatible with PaddleSlim, not remove _activation_type for now diff --git a/python/paddle/fluid/contrib/slim/tests/save_quant_model.py b/python/paddle/fluid/contrib/slim/tests/save_quant_model.py index 3fadf25150f9ef3556a343fdce8acc24d788f5dc..f97c2778c0918ecbfbed546089c17e9d505818cd 100644 --- a/python/paddle/fluid/contrib/slim/tests/save_quant_model.py +++ b/python/paddle/fluid/contrib/slim/tests/save_quant_model.py @@ -52,6 +52,30 @@ def parse_args(): '--debug', action='store_true', help='If used, the graph of Quant model is drawn.') + parser.add_argument( + '--quant_model_filename', + type=str, + default="", + help='The input model`s file name. If empty, search default `__model__` and separate parameter files and use them or in case if not found, attempt loading `model` and `params` files.' + ) + parser.add_argument( + '--quant_params_filename', + type=str, + default="", + help='If quant_model_filename is empty, this field is ignored. The input model`s all parameters file name. If empty load parameters from separate files.' + ) + parser.add_argument( + '--save_model_filename', + type=str, + default="__model__", + help='The name of file to save the inference program itself. If is set None, a default filename __model__ will be used.' + ) + parser.add_argument( + '--save_params_filename', + type=str, + default=None, + help='The name of file to save all related parameters. 
If it is set None, parameters will be saved in separate files' + ) test_args, args = parser.parse_known_args(namespace=unittest) return test_args, sys.argv[:1] + args @@ -61,18 +85,29 @@ def transform_and_save_int8_model(original_path, save_path, ops_to_quantize='', op_ids_to_skip='', - debug=False): + debug=False, + quant_model_filename='', + quant_params_filename='', + save_model_filename='', + save_params_filename=''): place = fluid.CPUPlace() exe = fluid.Executor(place) inference_scope = fluid.executor.global_scope() with fluid.scope_guard(inference_scope): - if os.path.exists(os.path.join(original_path, '__model__')): - [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model(original_path, exe) + if not quant_model_filename: + if os.path.exists(os.path.join(original_path, '__model__')): + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(original_path, + exe) + else: + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model( + original_path, exe, 'model', 'params') else: [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model(original_path, exe, - 'model', 'params') + fetch_targets] = fluid.io.load_inference_model( + original_path, exe, quant_model_filename, + quant_params_filename) ops_to_quantize_set = set() print(ops_to_quantize) @@ -97,8 +132,14 @@ def transform_and_save_int8_model(original_path, graph = transform_to_mkldnn_int8_pass.apply(graph) inference_program = graph.to_program() with fluid.scope_guard(inference_scope): - fluid.io.save_inference_model(save_path, feed_target_names, - fetch_targets, exe, inference_program) + fluid.io.save_inference_model( + save_path, + feed_target_names, + fetch_targets, + exe, + inference_program, + model_filename=save_model_filename, + params_filename=save_params_filename) print( "Success! 
INT8 model obtained from the Quant model can be found at {}\n" .format(save_path)) @@ -109,4 +150,6 @@ if __name__ == '__main__': test_args, remaining_args = parse_args() transform_and_save_int8_model( test_args.quant_model_path, test_args.int8_model_save_path, - test_args.ops_to_quantize, test_args.op_ids_to_skip, test_args.debug) + test_args.ops_to_quantize, test_args.op_ids_to_skip, test_args.debug, + test_args.quant_model_filename, test_args.quant_params_filename, + test_args.save_model_filename, test_args.save_params_filename) diff --git a/python/paddle/fluid/contrib/sparsity/__init__.py b/python/paddle/fluid/contrib/sparsity/__init__.py index 9bf45f4272738c69073d252371b6a6c59aaf15da..ec288a1287119dd436a91843b863ba355bba28fb 100644 --- a/python/paddle/fluid/contrib/sparsity/__init__.py +++ b/python/paddle/fluid/contrib/sparsity/__init__.py @@ -29,10 +29,11 @@ from .asp import decorate from .asp import prune_model from .asp import set_excluded_layers from .asp import reset_excluded_layers +from .supported_layer_list import add_supported_layer __all__ = [ 'calculate_density', 'check_mask_1d', 'get_mask_1d', 'check_mask_2d', 'get_mask_2d_greedy', 'get_mask_2d_best', 'create_mask', 'check_sparsity', 'MaskAlgo', 'CheckMethod', 'decorate', 'prune_model', 'set_excluded_layers', - 'reset_excluded_layers' + 'reset_excluded_layers', 'add_supported_layer' ] diff --git a/python/paddle/fluid/contrib/sparsity/asp.py b/python/paddle/fluid/contrib/sparsity/asp.py index ffa12ac70460084fd49a14d0193be6e913495b9a..30439ad736d26f3086a7f87d591aa68a59b7baa8 100644 --- a/python/paddle/fluid/contrib/sparsity/asp.py +++ b/python/paddle/fluid/contrib/sparsity/asp.py @@ -23,6 +23,8 @@ import paddle from paddle.fluid import global_scope, program_guard, layers from paddle.fluid.initializer import ConstantInitializer from paddle.fluid.contrib import sparsity +from paddle.fluid.contrib.sparsity.supported_layer_list import supported_layers_and_prune_func_map +from paddle.fluid.contrib.sparsity.supported_layer_list import _default_pruning from paddle.fluid import core OpRole = core.op_proto_and_checker_maker.OpRole @@ -292,8 +294,8 @@ class ASPHelper(object): 2. pruning well-trained models into 2:4 sparse pattern on FP16 or 1:2 sparse pattern on FP32 for fine-tuning. """ - MASK_APPENDDED_NAME = '_asp_mask' - SUPPORTED_LAYERS = {'fc': 'w_0', 'linear': 'w_0', 'conv2d': 'w_0'} + MASK_APPENDDED_NAME = 'asp_mask' + PADDLE_WEIGHT_SUFFIX = "w_" __asp_info = {} @@ -334,7 +336,6 @@ class ASPHelper(object): r""" This is the implementation of `sparsity.prune_model`, for details please see explanation in `sparsity.prune_model`. """ - checked_func_name = sparsity.CheckMethod.get_checking_method(mask_algo) if main_program is None: main_program = paddle.static.default_main_program() @@ -345,33 +346,27 @@ class ASPHelper(object): weight_tensor = global_scope().find_var(param.name).get_tensor() weight_nparray = np.array(weight_tensor) - # The double transpose ops here make sure pruning direction consistent with cuSparseLt. - # SPMMA in cuSparseLt: D = (AxB) + C, where matrix A (mxk) is sparse matrix. - # cuSparseLt would prune matrix A along k dimension. - # In sparse training, layer weight matriices is viewed sparse matrix A, so - # the math fomula should be 'Act(WX + b)'. However, default fomula in PaddlePaddle - # is 'Act(XW + b)'. For enabling SPMMA, weights and inputs should be transposed - # for computing, Act( (W^T X^T)^T + b). Therefore, we have to prune alog k dimension - # of W^T, which is m dimension of W. 
Moreove, all mask generating functions in - # sparsity/utils is row-major pruning. That is the reason we have to transpose weight - # matrices beforce invoking create_mask. Then we transpose the result maks to make - # sure its shape to be the same as the input weight. - weight_sparse_mask = sparsity.create_mask( - weight_nparray.T, func_name=mask_algo, n=n, m=m).T - weight_pruned_nparray = np.multiply(weight_nparray, - weight_sparse_mask) + prune_func = ASPHelper._get_prune_func_by_name(param.name) + + weight_pruned_nparray, weight_sparse_mask = \ + prune_func(weight_nparray, m, n, mask_algo, param.name) + weight_pruned_nparray = weight_pruned_nparray.astype( + weight_nparray.dtype) weight_tensor.set(weight_pruned_nparray, place) - assert sparsity.check_sparsity(weight_pruned_nparray.T, n=n, m=m, func_name=checked_func_name), \ - 'Pruning {} weight matrix failure!!!'.format(param.name) + if with_mask: weight_mask_param = global_scope().find_var( ASPHelper._get_mask_name(param.name)) assert weight_mask_param is not None, \ - 'Cannot find {} variable, please call ASPHelper.minimize' \ + 'Cannot find {} variable, please call optimizer.minimize (' \ + 'paddle.sparsity.decorate(optimizer).minimize(loss)' \ ' and initialization (exe.run(startup_program)) first!'.format(ASPHelper._get_mask_name(param.name)) weight_mask_tensor = weight_mask_param.get_tensor() + weight_sparse_mask = weight_sparse_mask.astype( + np.array(weight_mask_tensor).dtype) weight_mask_tensor.set(weight_sparse_mask, place) asp_info.update_masks(param.name, weight_sparse_mask) + return asp_info.masks.copy() @staticmethod @@ -384,7 +379,7 @@ class ASPHelper(object): Returns: string: The mask name of :attr:`param_name`. """ - return param_name + ASPHelper.MASK_APPENDDED_NAME + return param_name + "." + ASPHelper.MASK_APPENDDED_NAME @staticmethod def _get_not_ASP_relevant_vars(main_program): @@ -434,19 +429,46 @@ class ASPHelper(object): # fc_0.w_0 -> True # fc_0.b_0 -> False """ - if ASPHelper.MASK_APPENDDED_NAME in param_name: + param_name_list = param_name.split('.') + + if ASPHelper.MASK_APPENDDED_NAME in param_name_list: return False for layer in cls._get_program_asp_info(main_program).excluded_layers: if layer in param_name: return False - for name in ASPHelper.SUPPORTED_LAYERS: - if name in param_name and \ - ASPHelper.SUPPORTED_LAYERS[name] in param_name: - return True + if param_name in supported_layers_and_prune_func_map: + return True + + param_name_no_weight_suffix = param_name_list[0] + param_type_suffix = param_name_list[1] + layer_name = param_name_no_weight_suffix[:param_name_no_weight_suffix. + rfind('_')] + if ASPHelper.PADDLE_WEIGHT_SUFFIX not in param_type_suffix: + return False + + if param_name_no_weight_suffix in supported_layers_and_prune_func_map or \ + layer_name in supported_layers_and_prune_func_map: + return True + return False + @classmethod + def _get_prune_func_by_name(cls, param_name): + func = supported_layers_and_prune_func_map.get(param_name, None) + param_name_no_weight_suffix = param_name.split('.')[0] + if func is None: + func = supported_layers_and_prune_func_map.get( + param_name_no_weight_suffix, None) + if func is None: + layer_name = param_name_no_weight_suffix[: + param_name_no_weight_suffix. 
+ rfind('_')] + func = supported_layers_and_prune_func_map.get(layer_name, + _default_pruning) + return func + + @classmethod def _minimize(cls, optimizer, @@ -509,8 +531,7 @@ class ASPHelper(object): if ASPHelper._is_supported_layer(main_program, param_and_grad[0].name): mask_param = layers.create_parameter( - name=param_and_grad[0].name + - ASPHelper.MASK_APPENDDED_NAME, + name=ASPHelper._get_mask_name(param_and_grad[0].name), shape=param_and_grad[0].shape, dtype=param_and_grad[0].dtype, default_initializer=ConstantInitializer(value=1.0)) diff --git a/python/paddle/fluid/contrib/sparsity/supported_layer_list.py b/python/paddle/fluid/contrib/sparsity/supported_layer_list.py new file mode 100644 index 0000000000000000000000000000000000000000..105c2ded9eee71f68344d99dba325fee0a155850 --- /dev/null +++ b/python/paddle/fluid/contrib/sparsity/supported_layer_list.py @@ -0,0 +1,86 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +from paddle.fluid.contrib import sparsity +import threading + +__all__ = ['add_supported_layer'] + + +def _default_pruning(weight_nparray, m, n, func_name, param_name): + + checked_func_name = sparsity.CheckMethod.get_checking_method(func_name) + + # The double transpose ops here make sure the pruning direction is consistent with cuSparseLt. + # SPMMA in cuSparseLt: D = (AxB) + C, where matrix A (mxk) is sparse matrix. + # cuSparseLt would prune matrix A along k dimension. + # In sparse training, a layer's weight matrix is viewed as sparse matrix A, so + # the math formula should be 'Act(WX + b)'. However, the default formula in PaddlePaddle + # is 'Act(XW + b)'. For enabling SPMMA, weights and inputs should be transposed + # for computing, Act( (W^T X^T)^T + b). Therefore, we have to prune along the k dimension + # of W^T, which is the m dimension of W. Moreover, all mask generating functions in + # sparsity/utils are row-major pruning. That is the reason we have to transpose weight + # matrices before invoking create_mask. Then we transpose the result mask to make + # sure its shape is the same as the input weight. + weight_sparse_mask = sparsity.create_mask( + weight_nparray.T, func_name=func_name, n=n, m=m).T + weight_pruned_nparray = np.multiply(weight_nparray, weight_sparse_mask) + assert sparsity.check_sparsity(weight_pruned_nparray.T, n=n, m=m, func_name=checked_func_name), \ + 'Pruning {} weight matrix failure!!!'.format(param_name) + return weight_pruned_nparray, weight_sparse_mask + + +# When value of given key in this DICT is None, +# ASP will call default pruning function in pruning stage. +_supported_layers_and_prune_func_map_lock = threading.Lock() +supported_layers_and_prune_func_map = {} + + +def add_supported_layer(layer, pruning_func=None): + r""" + Add supported layers and their corresponding pruning functions. + + Args: + layer (string|Layer): The name or type of layer to be supported.
If layer is `Layer` then + it will be turned into a string internally. ASP would use this name to match parameters' names and call + the corresponding pruning function. + pruning_func (function, optional): a function type which receives five arguments (weight_nparray, + m, n, func_name, param_name), where weight_nparray is the weight as a numpy array and param_name is the name of the weight; + for m, n, and func_name, please see `prune_model` for details. + """ + name = None + if isinstance(layer, str): + name = layer + elif isinstance(layer, paddle.fluid.dygraph.layers.Layer): + name = paddle.fluid.dygraph.layers._convert_camel_to_snake( + type(layer).__name__) + elif issubclass(layer, paddle.fluid.dygraph.layers.Layer): + name = paddle.fluid.dygraph.layers._convert_camel_to_snake( + layer.__name__) + else: + raise TypeError("The type of layer should be string or Layer, but got {}!".format( + type(layer))) + if pruning_func is None: + pruning_func = _default_pruning + _supported_layers_and_prune_func_map_lock.acquire() + supported_layers_and_prune_func_map.update({name: pruning_func}) + _supported_layers_and_prune_func_map_lock.release() + + +add_supported_layer('fc') +add_supported_layer('linear') +add_supported_layer('conv2d') diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index 706ec0d523b938fda0501dfd04f1fc976bf6a26b..5385ac28b90f614fcd6003994b9a7000bc16702a 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -564,6 +564,14 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): self._rcvd_idx += 1 self._batches_outstanding -= 1 else: + # NOTE: when _rcvd_idx catches up with _send_idx, it means + # one of the following: + # 1. all 2 * num_workers batches have been loaded + # and stored in _blocking_queue + # 2. all data has been drained + # we need to let _thread block on _data_queue + # get_data so it does not occupy the CPU, otherwise it may take + # CPU time away from model execution # NOTE: in persistent workers mode, do not check data # drained here, simply let it go to _data_queue # reading to get _ResumeIteration @@ -573,7 +581,6 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): # may also be data in blocking queue if self._batches_outstanding < len(self._places): return None - continue if self._rcvd_idx in self._task_infos and \ len(self._task_infos[self._rcvd_idx]) == 3: diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py index 191661b7bf9d5a5b2877f22ebe6d2ec9124f2f96..4127f1e4449bf82aae294ce952122f1f8f6e775f 100644 --- a/python/paddle/fluid/dygraph/amp/auto_cast.py +++ b/python/paddle/fluid/dygraph/amp/auto_cast.py @@ -271,18 +271,28 @@ def amp_guard(enable=True, "current_tracer is None, maybe it is not in imperative mode.") # check device_type: - # NOTE: Now, amp only support gpu for float16 and bfloat16, xpu for float16. + # NOTE: Now, amp only supports gpu for float16 and bfloat16, xpu for float16, mlu for float16, and npu for float16. # Maybe we will support cpu for bfloat16. if enable and not (tracer._expected_place.is_gpu_place() or - tracer._expected_place.is_xpu_place()): + tracer._expected_place.is_xpu_place() or + tracer._expected_place.is_mlu_place() or + tracer._expected_place.is_npu_place()): warnings.warn( - 'amp_guard can only be enabled on CUDAPlace and XPUPlace, current place is %s, so it makes no effect.' + 'amp_guard can only be enabled on CUDAPlace, XPUPlace, MLUPlace, and NPUPlace, current place is %s, so it makes no effect.'
% tracer._expected_place) enable = False + # For npu: + if tracer._expected_place.is_npu_place() and (dtype == 'bfloat16'): + warnings.warn('NPUPlace only support float16 amp.') + enable = False # For xpu: if tracer._expected_place.is_xpu_place() and (dtype == 'bfloat16'): warnings.warn('XPUPlace only support float16 amp.') enable = False + # For mlu: + if tracer._expected_place.is_mlu_place() and (dtype == 'bfloat16'): + warnings.warn('MLUPlace only support float16 amp.') + enable = False # For gpu float16: Compute Capability should >= 7. # For gpu bfloat16: Compute Capability should >= 8 & CUDA Version should >= 11. if tracer._expected_place.is_gpu_place(): diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py index f7c2d6be574c4e17fa5ce6fa44ca4ecc55a5eb95..c57290861942b8020f6f55792c445d42a0578c90 100644 --- a/python/paddle/fluid/dygraph/amp/loss_scaler.py +++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py @@ -105,9 +105,11 @@ class AmpScaler(object): "current_tracer is None, maybe it is not in imperative mode.") if enable and not (tracer._expected_place.is_gpu_place() or - tracer._expected_place.is_xpu_place()): + tracer._expected_place.is_xpu_place() or + tracer._expected_place.is_mlu_place() or + tracer._expected_place.is_npu_place()): warnings.warn( - 'AmpScaler can only be enabled on CUDAPlace and XPUPlace, current place is %s, so it makes no effect.' + 'AmpScaler can only be enabled on CUDAPlace, XPUPlace, MLUPlace and NPUPlace, current place is %s, so it makes no effect.' % tracer._expected_place) enable = False @@ -286,14 +288,28 @@ class AmpScaler(object): ) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP32 ) ] - if len(param_grads_fp16): - _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale, - param_grads_fp16, - self._temp_found_inf_fp16) - if len(param_grads_fp32): - _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale, - param_grads_fp32, - self._temp_found_inf_fp32) + if core.is_compiled_with_npu(): + float_status = _C_ops.alloc_float_status() + _C_ops.clear_float_status(float_status, float_status) + + if len(param_grads_fp16): + _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale, + float_status, param_grads_fp16, + self._temp_found_inf_fp16) + if len(param_grads_fp32): + _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale, + float_status, param_grads_fp32, + self._temp_found_inf_fp32) + else: + if len(param_grads_fp16): + _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale, + param_grads_fp16, + self._temp_found_inf_fp16) + if len(param_grads_fp32): + _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale, + param_grads_fp32, + self._temp_found_inf_fp32) + if len(param_grads_fp16) and len(param_grads_fp32): self._found_inf = self._temp_found_inf_fp16 or self._temp_found_inf_fp32 elif len(param_grads_fp16): diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index 8149d69d36a27fadcefa8dc6b6ff1dd89792e29e..9439982858530e1e81156be4b32ef2d91dc4a33a 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -565,16 +565,25 @@ def grad(outputs, if isinstance(in_out_list, (list, tuple)): assert len(in_out_list) > 0, "{} cannot be empty".format(name) for each_var in in_out_list: - assert isinstance( - each_var, - core.VarBase), "Elements of {} must be Variable".format( - name) + if core._in_eager_mode(): + assert isinstance( + each_var, core.eager. 
+ Tensor), "Elements of {} must be Tensor".format(name) + else: + assert isinstance( + each_var, + core.VarBase), "Elements of {} must be Variable".format( + name) return in_out_list else: - assert isinstance( - in_out_list, - core.VarBase), "{} must be Variable or list of Variable".format( - name) + if core._in_eager_mode(): + assert isinstance( + in_out_list, core.eager. + Tensor), "{} must be Tensor or list of Tensor".format(name) + else: + assert isinstance( + in_out_list, core.VarBase + ), "{} must be Variable or list of Variable".format(name) return [in_out_list] outputs = check_in_out(outputs, 'outputs') @@ -586,9 +595,14 @@ def grad(outputs, for each_var in grad_outputs: if each_var is not None: - assert isinstance( - each_var, core.VarBase - ), "grad_outputs must be None, a Variable or a list containing None or Variables" + if core._in_eager_mode(): + assert isinstance( + each_var, core.eager.Tensor + ), "grad_outputs must be None, a Variable or a list containing None or Variables" + else: + assert isinstance( + each_var, core.VarBase + ), "grad_outputs must be None, a Variable or a list containing None or Variables" else: grad_outputs = [] @@ -600,14 +614,27 @@ def grad(outputs, no_grad_vars = [] elif isinstance(no_grad_vars, core.VarBase): no_grad_vars = [no_grad_vars] + elif isinstance(no_grad_vars, core.eager.Tensor): + no_grad_vars = [no_grad_vars] elif isinstance(no_grad_vars, (list, tuple, set)): no_grad_vars = list(no_grad_vars) for var in no_grad_vars: - assert isinstance( - var, core.VarBase), "no_grad_vars can only contains Variable" + if core._in_eager_mode(): + assert isinstance( + var, + core.eager.Tensor), "no_grad_vars can only contains Tensor" + else: + assert isinstance( + var, + core.VarBase), "no_grad_vars can only contains Variable" else: - raise AssertionError( - "no_grad_vars must be None, Variable or list/tuple/set of Variables") + if core._in_eager_mode(): + raise AssertionError( + "no_grad_vars must be None, Tensor or list/tuple/set of Tensors") + else: + raise AssertionError( + "no_grad_vars must be None, Variable or list/tuple/set of Variables" + ) assert isinstance(create_graph, bool), "create_graph must be True or False" @@ -622,6 +649,11 @@ def grad(outputs, assert isinstance(only_inputs, bool), "only_inputs must be True or False" assert only_inputs, "only_inputs=False is not supported yet" + if core._in_eager_mode(): + return core.eager.run_partial_grad( + outputs, inputs, grad_outputs, retain_graph, create_graph, + only_inputs, allow_unused, no_grad_vars) + place = core.Place() place.set_place(framework._current_expected_place()) return core.dygraph_partial_grad(inputs, outputs, grad_outputs, diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py index 30012fb8666fcb5256efa889de7440f6d709cccd..900541459f6fcd3f8caaf9d60b0aabba5c6c469e 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py @@ -104,7 +104,7 @@ class FunctionSpec(object): if isinstance(input_var, np.ndarray): input_var = paddle.static.InputSpec.from_numpy(input_var) _set_spec_stop_gradient(input_var, True) - elif isinstance(input_var, core.VarBase): + elif isinstance(input_var, (core.VarBase, core.eager.Tensor)): stop_gradient = input_var.stop_gradient input_var = paddle.static.InputSpec.from_tensor(input_var) _set_spec_stop_gradient(input_var, stop_gradient) diff --git 
a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py index a442a8b92b6f7cf6c5c366e63ace110e9fb94e01..216f955b7510351c2dc6774a34a485b2341e76aa 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py @@ -148,10 +148,7 @@ class PartialProgramLayer: self._origin_main_program = self._verify_program(main_program) self._tmp_scope_vec = self._create_scope_vec() - # A fake_var to handle empty input or output - self.__fake_vars = _create_fake_var() # Set default mode to train - self._double_grads = self._get_double_grads(self._origin_main_program) self.training = True custom_white_list, custom_black_list = None, None @@ -163,6 +160,14 @@ class PartialProgramLayer: custom_white_list=custom_white_list, custom_black_list=custom_black_list) + @LazyInitialized + def __fake_vars(self): + return _create_fake_var() + + @LazyInitialized + def _double_grads(self): + return self._get_double_grads(self._origin_main_program) + @LazyInitialized def _infer_program(self): """ @@ -356,8 +361,10 @@ class PartialProgramLayer: def drop_scope_if_no_grad(self): tracer = framework._dygraph_tracer() + scope = self._tmp_scope_vec.value().get_scope() if isinstance( + self._tmp_scope_vec, (core.VarBase)) else self._tmp_scope_vec[0] if self.training and not tracer._has_grad: - self._tmp_scope_vec.value().get_scope().drop_kids() + scope.drop_kids() @property def program(self): @@ -449,18 +456,14 @@ class PartialProgramLayer: def _create_scope_vec(self): # Hold forward variables tmp_scope_vec = None + inner_scope = core.Scope() if not core._in_eager_mode(): tmp_scope_vec = core.VarBase(core.VarDesc.VarType.FP32, [], "program_out_scope", core.VarDesc.VarType.STEP_SCOPES, True) - # TODO(jiabin): Support this later. - # else: - # tmp_scope_vec = core.eager.Tensor(core.VarDesc.VarType.FP32, [], - # "program_out_scope", - # core.VarDesc.VarType.STEP_SCOPES, True) - - inner_scope = core.Scope() tmp_scope_vec.value().set_scope(inner_scope) + else: + tmp_scope_vec = [inner_scope] return tmp_scope_vec def _restore_out(self, out_vars): @@ -598,12 +601,10 @@ def _create_fake_var(): core.VarDesc.VarType.RAW, False) ] else: - return [] - # TODO(jiabin): Support this later - # return [ - # core.eager.Tensor(core.VarDesc.VarType.FP32, [], "Fake_var", - # core.VarDesc.VarType.RAW, False) - # ] + return [ + core.eager.Tensor(core.VarDesc.VarType.FP32, [], "Fake_var", + core.VarDesc.VarType.RAW, False) + ] def partial_program_from(concrete_program): diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py index e1df2324889b440737740e443228d0fa69b47b51..7733226cc09f2d6e2f9bcb8403ed1be42aa75e0c 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py @@ -297,10 +297,6 @@ class TensorShapeTransformer(gast.NodeTransformer): return False def _update_name_to_var_shape(self, node): - def replace_dot(name): - # replace all '.' into '_' - return name.replace('.', '_') - assert isinstance(node, gast.Assign) target_node = node.targets[0] value_node = node.value @@ -315,7 +311,6 @@ class TensorShapeTransformer(gast.NodeTransformer): if value_node.id in self.name_to_var_shape: # TODO(zhhsplendid): is context a problem for the result node of gast.parse? 
static_shape_var_name = unique_name.generate( - replace_dot(target_id) + STATIC_CONVERT_VAR_SHAPE_SUFFIX) static_shape_var_node = gast.parse( static_shape_var_name).body[0].value @@ -337,7 +332,6 @@ class TensorShapeTransformer(gast.NodeTransformer): if isinstance(value_node, gast.Attribute): if self._is_var_shape(value_node): # eg: x.shape static_shape_var_name = unique_name.generate( - replace_dot(target_id) + STATIC_CONVERT_VAR_SHAPE_SUFFIX) static_shape_var_node = gast.parse( static_shape_var_name).body[0].value @@ -370,7 +364,6 @@ class TensorShapeTransformer(gast.NodeTransformer): if isinstance(value_node, gast.Name): if value_node.id in self.name_to_var_shape: static_shape_var_name = unique_name.generate( - replace_dot(target_id) + STATIC_CONVERT_VAR_SHAPE_SUFFIX) static_shape_var_node = gast.parse( static_shape_var_name).body[0].value @@ -387,7 +380,7 @@ class TensorShapeTransformer(gast.NodeTransformer): self.name_to_var_shape[target_id] = static_shape_var_name elif self._is_var_shape(value_node): # eg: x.shape or x.shape[0] static_shape_var_name = unique_name.generate( - replace_dot(target_id) + STATIC_CONVERT_VAR_SHAPE_SUFFIX) + STATIC_CONVERT_VAR_SHAPE_SUFFIX) static_shape_var_node = gast.parse(static_shape_var_name).body[ 0].value static_shape_value_node = copy.deepcopy(value_node) diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py index f58952d3036c506341955eff2472079bb696bb1f..a36164a277dec0762e7ba49a1d158837f27bc517 100644 --- a/python/paddle/fluid/dygraph/io.py +++ b/python/paddle/fluid/dygraph/io.py @@ -30,6 +30,7 @@ from paddle.fluid.layers import nn from paddle.fluid.layers.utils import _hash_with_id from paddle.fluid.dygraph.base import switch_to_static_graph from paddle.fluid.framework import in_dygraph_mode +from paddle import _C_ops __all__ = ['TranslatedLayer'] @@ -761,6 +762,21 @@ def _construct_params_and_buffers(model_path, return var_dict +def _valid_vars(vars): + if vars: + return vars + if framework._in_eager_mode(): + return [ + core.eager.Tensor(core.VarDesc.VarType.FP32, [], "Fake_var", + core.VarDesc.VarType.RAW, False) + ] + else: + return [ + core.VarBase(core.VarDesc.VarType.FP32, [], "Fake_var", + core.VarDesc.VarType.RAW, False) + ] + + def _run_dygraph(instance, input, program_holder): # 1. prepare inputs, outputs, attrs @@ -826,17 +842,12 @@ def _run_dygraph(instance, input, program_holder): # hold forward variables if framework._in_eager_mode(): - tmp_scope_vec = core.eager.Tensor( - dtype=core.VarDesc.VarType.FP32, - dims=[], - name="program_out_scope", - type=core.VarDesc.VarType.STEP_SCOPES, - persistable=True) + tmp_scope_vec = [program_holder.scope] else: tmp_scope_vec = core.VarBase(core.VarDesc.VarType.FP32, [], "program_out_scope", core.VarDesc.VarType.STEP_SCOPES, True) - tmp_scope_vec.value().set_scope(program_holder.scope) + tmp_scope_vec.value().set_scope(program_holder.scope) double_grad_vars = [] for var_desc in program_holder.double_grad_descs: @@ -852,41 +863,18 @@ def _run_dygraph(instance, input, program_holder): var_desc.shape(), var_desc.name(), var_desc.type(), False) double_grad_vars.append(var) - if len(double_grad_vars) == 0: - if framework._in_eager_mode(): - double_grad_vars = [ - core.eager.Tensor( - value=[1], - name='Fake_var', - place=framework._current_expected_place()) - ] - else: - double_grad_vars = [ - core.VarBase( - value=[1], - name='Fake_var', - place=framework._current_expected_place()) - ] # 2. 
run program by op trace_program = program_holder.infer_program if instance._is_test else program_holder.train_program end_op_index = program_holder.infer_program.block(0).op_size() - framework._dygraph_tracer().trace_op( - type='run_program', - inputs={'X': input_vars, - 'Params': persistable_vars}, - outputs={ - 'Out': output_vars, - 'OutScope': tmp_scope_vec, - 'DOut': double_grad_vars - }, - attrs={ - 'global_block': trace_program.block(0), - 'start_op_index': 0, - 'end_op_index': end_op_index, - 'is_test': instance._is_test, - 'program_id': _hash_with_id(trace_program, instance) - }) + attrs = ('global_block', trace_program.block(0), 'start_op_index', 0, + 'end_op_index', end_op_index, 'is_test', instance._is_test, + 'program_id', _hash_with_id(trace_program, instance)) + _C_ops.run_program( + _valid_vars(input_vars), + _valid_vars(persistable_vars), + _valid_vars(output_vars), tmp_scope_vec, + _valid_vars(double_grad_vars), *attrs) # NOTE: [ why need set param's gradient type here ] # if user set sparse gradient mode, the param's gradient # will be SelectedRows, not LoDTensor. But tracer will just @@ -914,8 +902,10 @@ def _run_dygraph(instance, input, program_holder): def drop_scope_if_no_grad(instance, scope_vec): tracer = framework._dygraph_tracer() + scope = scope_vec.value().get_scope() if isinstance(scope_vec, ( + core.VarBase)) else scope_vec[0] if (not instance._is_test) and (not tracer._has_grad): - scope_vec.value().get_scope().drop_kids() + scope.drop_kids() def _run_static_graph(input, program_holder, trace_program): diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index b1865691b2475c4f855f51244e627965047d7720..1e1ce3ba7e4912d391085c7acbd7aa4bbb6a4da1 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -821,7 +821,7 @@ def save(layer, path, input_spec=None, **configs): for var in flatten(input_spec): if isinstance(var, paddle.static.InputSpec): inner_input_spec.append(var) - elif isinstance(var, (core.VarBase, Variable)): + elif isinstance(var, (core.VarBase, core.eager.Tensor, Variable)): inner_input_spec.append( paddle.static.InputSpec.from_tensor(var)) else: diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 53dbf1a66b27f35a75b44a0b6444cd8282c5278c..6957850d205794363183b4e6ca58a6daf3e11358 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -760,7 +760,8 @@ class Layer(object): raise KeyError("The name of buffer can not be empty.") elif hasattr(self, name) and name not in self._buffers: raise KeyError("attribute '{}' already exists.".format(name)) - elif tensor is not None and not type(tensor) == core.VarBase: + elif tensor is not None and not (type(tensor) == core.VarBase or + type(tensor) == core.eager.Tensor): raise TypeError( "The registered buffer should be a core.VarBase, but received {}.". 
format(type(tensor).__name__)) diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index 652916491eed7e511b610c2d00b0612604ecee8b..86d76f1b20a74c9f8bb51e23d9fc7d450717f173 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -30,7 +30,7 @@ from paddle.fluid.dygraph import to_variable, no_grad from paddle.utils import deprecated from ..layers import collective from paddle.fluid.dygraph import base as imperative_base -from paddle.fluid.framework import ParamBase +from paddle.fluid.framework import ParamBase, _in_eager_mode __all__ = ["prepare_context", "ParallelEnv", "DataParallel"] @@ -397,6 +397,16 @@ def sync_params_buffers(model, 'axis': 0}) +@imperative_base.no_grad +@framework.dygraph_only +def sync_eager_params(model, comm_group=None, src_rank=0): + for _, param in model._obtain_parameters_buffers().items(): + if not isinstance(param, core.eager.Tensor): + raise TypeError("The data type of '%s' must be '%s'" % + (param.name, core.eager.Tensor)) + comm_group.broadcast(param, src_rank).synchronize() + + class DataParallel(layers.Layer): """ Run the dygraph module with data parallelism. @@ -576,6 +586,7 @@ class DataParallel(layers.Layer): self.process_group = process_group self.gradient_as_buffer_view = gradient_as_buffer_view self.static_graph = static_graph + self.var_dtype = core.eager.Tensor if _in_eager_mode() else core.VarBase # NOTE(chenweihang): The ParallelStrategy here is not strictly a strategy. # It just stores some environment variables, which can be constructed by @@ -592,11 +603,20 @@ class DataParallel(layers.Layer): "ParallelContext must be initialized before. You should use init_parallel_env() before" \ "constructing the DataParallel." + if self.process_group is None and _in_eager_mode(): + raise RuntimeError( + "Process group should be built in DataParallel of eager mode." + ) + # sync buffer and params # TODO(liuyuhui) Currently not support xpu. 
xpu is # still broadcasting parameters when calling layer if not paddle.is_compiled_with_xpu(): - sync_params_buffers(self._layers) + if _in_eager_mode(): + sync_eager_params( + self._layers, comm_group=self.process_group) + else: + sync_params_buffers(self._layers) self.comm_buffer_size = int(comm_buffer_size * 1024 * 1024) # NOTE(shenliang03): We can set environment variables to control @@ -620,9 +640,9 @@ class DataParallel(layers.Layer): if param is None or param in params_set: continue params_set.add(param) - if not isinstance(param, core.VarBase): - raise TypeError("The data type of '%s' must be Varbase" % - param.name) + if not isinstance(param, self.var_dtype): + raise TypeError("The data type of '%s' must be '%s'" % + (param.name, self.var_dtype)) if param.trainable: layers_param.append((sublayer, param)) @@ -649,19 +669,32 @@ class DataParallel(layers.Layer): check_layer_sparse(sublayer) for sublayer, _ in layers_param ] - self.group_indices = core.assign_group_by_size( - trainable_parameters, is_sparse_gradient, - [self.last_comm_buffer_size, self.comm_buffer_size]) + if _in_eager_mode(): + self.group_indices = core.eager_assign_group_by_size( + trainable_parameters, is_sparse_gradient, + [self.last_comm_buffer_size, self.comm_buffer_size]) + + self._reducer = core.EagerReducer( + trainable_parameters, + list(reversed(self.group_indices)), is_sparse_gradient, + self.process_group, + [self.last_comm_buffer_size, self.comm_buffer_size], + self.find_unused_parameters) + else: + self.group_indices = core.assign_group_by_size( + trainable_parameters, is_sparse_gradient, + [self.last_comm_buffer_size, self.comm_buffer_size]) - self._reducer = core.Reducer( - trainable_parameters, - list(reversed(self.group_indices)), is_sparse_gradient, - parallel_helper.__parallel_ctx__clz__, - [self.last_comm_buffer_size, self.comm_buffer_size], - self.find_unused_parameters) + self._reducer = core.Reducer( + trainable_parameters, + list(reversed(self.group_indices)), is_sparse_gradient, + parallel_helper.__parallel_ctx__clz__, + [self.last_comm_buffer_size, self.comm_buffer_size], + self.find_unused_parameters) def _find_varbase(self, obj): - if isinstance(obj, core.VarBase): + var_type = core.eager.Tensor if _in_eager_mode() else core.VarBase + if isinstance(obj, var_type): return [obj] if isinstance(obj, (list, tuple)): return itertools.chain(*map(self._find_varbase, obj)) diff --git a/python/paddle/fluid/dygraph/tracer.py b/python/paddle/fluid/dygraph/tracer.py index 563cd433910054522b48b9b0f03a036d0d5abe69..d8b1883fc62a0fb4575a2e525d7d37a9029cf40d 100644 --- a/python/paddle/fluid/dygraph/tracer.py +++ b/python/paddle/fluid/dygraph/tracer.py @@ -35,6 +35,12 @@ final_state_name_mapping = { "x": "X", "out": "Out", }, + "pool2d": { + "final_op_name": "final_state_pool2d", + "x": "X", + "kernel_size": "ksize", + "out": "Out", + }, "abs": { "final_op_name": "final_state_abs", "x": "X", @@ -52,6 +58,12 @@ final_state_name_mapping = { "axis1": "axis1", "axis2": "axis2", "out": "Out", + }, + "one_hot": { + "final_op_name": "final_state_one_hot", + "x": "X", + "num_class": "depth", + "out": "Out", } } @@ -140,7 +152,12 @@ class Tracer(core.Tracer): outputs[retname][j].reconstruct_from_(returns[i][j], False) else: - outputs[retname][0].reconstruct_from_(returns[i], False) + if isinstance(outputs[retname], list): + outputs[retname][0].reconstruct_from_(returns[i], + False) + else: + outputs[retname].reconstruct_from_(returns[i], + False) elif isinstance(returns, list): assert len(outputs.keys()) == 1 key = 
list(outputs.keys())[0] diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 6843c0e4c3fa85f20b408e7536cf1902dafe3f45..2b67a2029727f6b8f917239094a1b906d5cd6a62 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -311,7 +311,7 @@ def monkey_patch_varbase(): """ if core._in_eager_mode(): - if not self.grad._is_initialized(): + if self.grad is None: return None # TODO(wanghuancoder) support SELECTED_ROWS return self.grad.numpy() diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index fd7226c48661fdb2cd4dcf7227d0f8383c6c9439..000f08b0a3e282d815c758b5a153ba53ff84c8e0 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -6299,7 +6299,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): if dim_size == -1: assert unk_dim_idx == -1, ( "Only one dimension value of 'shape' in reshape can " - "be -1. But received shape[%d] is also -1." % dim_idx) + "be -1. But received shape[%d] is also -1.\n" + "\n\t# N = x.shape()[2]\t\t# N is an int. " + "(NOT recommend under @to_static)\n\tN = paddle.shape(x)[2]\t\t" + "# N is a Tensor. (Recommend)\n\tz = paddle.reshape([N, -1, 4])" + "\t# z.shape is [-1, -1, 4]\n\n" + " If your target shape in Reshape represents dynamic shape, " + "please turn it into a Tensor under @to_static. See above example for details." + % dim_idx) unk_dim_idx = dim_idx elif dim_size == 0: assert dim_idx < len(x.shape), ( diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc index c89990be34ca06c6277033bb6b6c0844e7d9a327..acaf7cb74280bed23b20feab2a96aa85a9bb5cea 100644 --- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc +++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc @@ -153,6 +153,7 @@ PD_BUILD_GRAD_OP(custom_relu_no_x_in_backward) .SetInferShapeFn(PD_INFER_SHAPE(ReluBackwardWithoutXInferShape)); void relu_cpu_forward_out(const paddle::Tensor& x, paddle::Tensor* out) { + out->reshape(x.shape()); PD_DISPATCH_FLOATING_TYPES( x.type(), "relu_cpu_forward", ([&] { relu_cpu_forward_kernel( @@ -164,6 +165,7 @@ void relu_cpu_backward_out(const paddle::Tensor& x, const paddle::Tensor& out, const paddle::Tensor& grad_out, paddle::Tensor* grad_x) { + grad_x->reshape(x.shape()); PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_backward", ([&] { relu_cpu_backward_kernel( grad_out.data(), diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu index 33c5ede299bd47c87490473920bb80b18cd75bf5..4bb773cdaec21712f262bcb217710f6909efd20a 100644 --- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu +++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu @@ -94,6 +94,7 @@ void relu_cuda_forward_out(const paddle::Tensor& x, paddle::Tensor* out) { int numel = x.size(); int block = 512; int grid = (numel + block - 1) / block; + out->reshape(x.shape()); PD_DISPATCH_FLOATING_AND_HALF_TYPES( x.type(), "relu_cuda_forward_kernel", ([&] { relu_cuda_forward_kernel<<>>( @@ -108,6 +109,7 @@ void relu_cuda_backward_out(const paddle::Tensor& x, int numel = out.size(); int block = 512; int grid = (numel + block - 1) / block; + grad_x->reshape(x.shape()); PD_DISPATCH_FLOATING_AND_HALF_TYPES( out.type(), "relu_cuda_backward_kernel", ([&] { relu_cuda_backward_kernel<<>>( diff --git 
a/python/paddle/fluid/tests/custom_op/test_custom_attrs_jit.py b/python/paddle/fluid/tests/custom_op/test_custom_attrs_jit.py index 1c9c6eedbaeb8c1c3f06d42d82a8ec5cc28750f6..785bfc74229817c022f7f9e80481dde156d4e178 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_attrs_jit.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_attrs_jit.py @@ -20,6 +20,7 @@ import paddle from paddle.utils.cpp_extension import load, get_build_directory from utils import paddle_includes, extra_cc_args, extra_nvcc_args from paddle.utils.cpp_extension.extension_utils import run_cmd +from paddle.fluid.framework import _test_eager_guard, _in_eager_mode # Because Windows don't use docker, the shared lib already exists in the # cache dir, it will not be compiled again unless the shared lib is removed. @@ -53,7 +54,7 @@ class TestJitCustomAttrs(unittest.TestCase): self.int64_vec_attr = [10000000000, 10000000000, 10000000000] self.str_vec_attr = ["StrAttr", "StrAttr", "StrAttr"] - def test_attr_value(self): + def func_attr_value(self): x = paddle.ones([2, 2], dtype='float32') x.stop_gradient = False out = custom_attrs.attr_test( @@ -65,7 +66,12 @@ class TestJitCustomAttrs(unittest.TestCase): self.assertTrue(np.array_equal(x.numpy(), out.numpy())) - def test_const_attr_value(self): + def test_attr_value(self): + with _test_eager_guard(): + self.func_attr_value() + self.func_attr_value() + + def func_const_attr_value(self): x = paddle.ones([2, 2], dtype='float32') x.stop_gradient = False out = custom_attrs.const_attr_test( @@ -77,6 +83,11 @@ class TestJitCustomAttrs(unittest.TestCase): self.assertTrue(np.array_equal(x.numpy(), out.numpy())) + def test_const_attr_value(self): + with _test_eager_guard(): + self.func_const_attr_value() + self.func_const_attr_value() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/custom_op/test_custom_concat.py b/python/paddle/fluid/tests/custom_op/test_custom_concat.py index 9049b604c910c80a41afa2c509e2ec3fdb4ffbfc..62e61c5bc7f5f235c03d146bd77fee41948a2a05 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_concat.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_concat.py @@ -21,6 +21,7 @@ import paddle.static as static from paddle.utils.cpp_extension import load, get_build_directory from paddle.utils.cpp_extension.extension_utils import run_cmd from utils import paddle_includes, extra_cc_args, extra_nvcc_args +from paddle.fluid.framework import _test_eager_guard, _in_eager_mode # Because Windows don't use docker, the shared lib already exists in the # cache dir, it will not be compiled again unless the shared lib is removed. 
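A note on the test refactoring applied in this and the following custom-op test files: each original test body moves into a func_* helper, and the public test_* method runs that helper twice, once inside _test_eager_guard() and once in legacy dygraph. A minimal, self-contained sketch of the pattern follows; the class name TestEagerGuardPattern and the helper body are illustrative, only _test_eager_guard itself comes from this patch.

import unittest

import numpy as np
import paddle
from paddle.fluid.framework import _test_eager_guard


class TestEagerGuardPattern(unittest.TestCase):
    def func_check_add(self):
        # The original test body goes here unchanged; it should not assume
        # which of the two dygraph modes is currently active.
        x = paddle.to_tensor(np.ones([2, 2], dtype='float32'))
        y = x + x
        self.assertTrue(
            np.array_equal(y.numpy(), np.full([2, 2], 2.0, dtype='float32')))

    def test_check_add(self):
        # Run once under the eager guard, then once in legacy dygraph.
        with _test_eager_guard():
            self.func_check_add()
        self.func_check_add()


if __name__ == '__main__':
    unittest.main()

Keeping the body in a single helper avoids duplicating assertions and makes the second call easy to delete once legacy dygraph support is retired.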
@@ -116,7 +117,7 @@ class TestCustomConcatDynamicAxisJit(unittest.TestCase): "custom op {}: {},\n paddle api {}: {}".format(name, out, name, pd_out)) - def test_dynamic(self): + def func_dynamic(self): for dtype in self.dtypes: for axis in self.axises: out, grad_inputs = concat_dynamic(custom_ops.custom_concat, @@ -128,6 +129,11 @@ class TestCustomConcatDynamicAxisJit(unittest.TestCase): for x_grad, pd_x_grad in zip(grad_inputs, pd_grad_inputs): self.check_output(x_grad, pd_x_grad, "x_grad") + def test_dynamic(self): + with _test_eager_guard(): + self.func_dynamic() + self.func_dynamic() + def test_static(self): for dtype in self.dtypes: for axis in self.axises: @@ -140,7 +146,7 @@ class TestCustomConcatDynamicAxisJit(unittest.TestCase): self.check_output(x1_grad, pd_x1_grad, "x1_grad") self.check_output(x2_grad, pd_x2_grad, "x2_grad") - def test_dynamic_with_attr(self): + def func_dynamic_with_attr(self): for dtype in self.dtypes: for axis in self.axises: out, grad_inputs = concat_dynamic( @@ -153,6 +159,11 @@ class TestCustomConcatDynamicAxisJit(unittest.TestCase): for x_grad, pd_x_grad in zip(grad_inputs, pd_grad_inputs): self.check_output(x_grad, pd_x_grad, "x_grad") + def test_dynamic_with_attr(self): + with _test_eager_guard(): + self.func_dynamic_with_attr() + self.func_dynamic_with_attr() + def test_static_with_attr(self): for dtype in self.dtypes: for axis in self.axises: diff --git a/python/paddle/fluid/tests/custom_op/test_custom_conj.py b/python/paddle/fluid/tests/custom_op/test_custom_conj.py index 25c88ee6c6b01daf62553f5f7857bfe06fce25ca..5f3c107a9b22ad2014bd5e2488c0f48a6866fad8 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_conj.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_conj.py @@ -21,6 +21,7 @@ import paddle.static as static from paddle.utils.cpp_extension import load, get_build_directory from paddle.utils.cpp_extension.extension_utils import run_cmd from utils import paddle_includes, extra_cc_args, extra_nvcc_args +from paddle.fluid.framework import _test_eager_guard # Because Windows don't use docker, the shared lib already exists in the # cache dir, it will not be compiled again unless the shared lib is removed. @@ -116,11 +117,16 @@ class TestCustomConjJit(unittest.TestCase): self.check_output(out, pd_out, "out") self.check_output(x_grad, pd_x_grad, "x's grad") - def test_dynamic(self): + def func_dynamic(self): for dtype in self.dtypes: np_input = np.random.random(self.shape).astype(dtype) self.run_dynamic(dtype, np_input) + def test_dynamic(self): + with _test_eager_guard(): + self.func_dynamic() + self.func_dynamic() + def test_static(self): for dtype in self.dtypes: np_input = np.random.random(self.shape).astype(dtype) diff --git a/python/paddle/fluid/tests/custom_op/test_custom_linear.py b/python/paddle/fluid/tests/custom_op/test_custom_linear.py index 0ba70eaa3e06cec619b2a9175db4aa1c8bf75a8b..811eedf1edaf39c961f9fd292054c2cce5154db9 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_linear.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_linear.py @@ -22,6 +22,7 @@ import paddle.nn.functional as F from paddle.utils.cpp_extension import load, get_build_directory from paddle.utils.cpp_extension.extension_utils import run_cmd from utils import paddle_includes, extra_cc_args, extra_nvcc_args +from paddle.fluid.framework import _test_eager_guard, _in_eager_mode # Because Windows don't use docker, the shared lib already exists in the # cache dir, it will not be compiled again unless the shared lib is removed. 
@@ -94,7 +95,7 @@ class TestCustomLinearJit(unittest.TestCase): self.np_bias) self.check_output(pten_out, pd_out, "pten_out") - def test_dynamic(self): + def func_dynamic(self): for dtype in self.dtypes: pten_out = linear_dynamic(custom_ops.pten_linear, dtype, self.np_x, self.np_weight, self.np_bias) @@ -102,6 +103,11 @@ class TestCustomLinearJit(unittest.TestCase): self.np_bias) self.check_output(pten_out, pd_out, "pten_out") + def test_dynamic(self): + with _test_eager_guard(): + self.func_dynamic() + self.func_dynamic() + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/custom_op/test_custom_raw_op_kernel_op.py b/python/paddle/fluid/tests/custom_op/test_custom_raw_op_kernel_op.py index 207ea87974130698b8bf491bce3cd753045a97e9..4da99b1ea10418c6cb6baddb51596b307c6ba28d 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_raw_op_kernel_op.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_raw_op_kernel_op.py @@ -68,12 +68,6 @@ class TestCustomRawReluOp(unittest.TestCase): self.assertTrue(custom_raw_relu_op is not None) return custom_raw_relu_op(x) - def test_dygraph(self): - x = paddle.to_tensor(np.random.uniform(low=-1.0, high=1.0, size=[2, 3])) - y1 = self.custom_raw_relu(x) - y2 = paddle.nn.ReLU()(x) - self.assertTrue(np.array_equal(y1.numpy(), y2.numpy())) - def test_static(self): paddle.enable_static() shape = [2, 3] diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py index dddb14eb78e8a1dc6a0820ead2ebfa915b8a09c2..81793f1391d0422393c1f6c1e719f708112d3b6b 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py @@ -22,6 +22,7 @@ from paddle.utils.cpp_extension import load, get_build_directory from paddle.utils.cpp_extension.extension_utils import run_cmd from utils import paddle_includes, extra_cc_args, extra_nvcc_args, IS_MAC +from paddle.fluid.framework import _test_eager_guard, _in_eager_mode # Because Windows don't use docker, the shared lib already exists in the # cache dir, it will not be compiled again unless the shared lib is removed. 
@@ -98,7 +99,7 @@ class TestDygraphModel(unittest.TestCase): self.x_spec = paddle.static.InputSpec( shape=[None, self.in_dim], dtype='float32', name='x') - def test_train_eval(self): + def func_train_eval(self): for device in self.devices: # set device paddle.set_device(device) @@ -106,26 +107,34 @@ class TestDygraphModel(unittest.TestCase): # for train origin_relu_train_out = self.train_model(use_custom_op=False) custom_relu_train_out = self.train_model(use_custom_op=True) - custom_relu_dy2stat_train_out = self.train_model( - use_custom_op=True, dy2stat=True) # for to_static + # open this when dy2stat is ready for eager + if not _in_eager_mode(): + custom_relu_dy2stat_train_out = self.train_model( + use_custom_op=True, dy2stat=True) # for to_static + self.assertTrue( + np.array_equal(origin_relu_train_out, + custom_relu_dy2stat_train_out)) self.assertTrue( np.array_equal(origin_relu_train_out, custom_relu_train_out)) - self.assertTrue( - np.array_equal(origin_relu_train_out, - custom_relu_dy2stat_train_out)) # for eval origin_relu_eval_out = self.eval_model(use_custom_op=False) custom_relu_eval_out = self.eval_model(use_custom_op=True) - custom_relu_dy2stat_eval_out = self.eval_model( - use_custom_op=True, dy2stat=True) # for to_static + if not _in_eager_mode(): + custom_relu_dy2stat_eval_out = self.eval_model( + use_custom_op=True, dy2stat=True) # for to_static + self.assertTrue( + np.array_equal(origin_relu_eval_out, + custom_relu_dy2stat_eval_out)) self.assertTrue( np.array_equal(origin_relu_eval_out, custom_relu_eval_out)) - self.assertTrue( - np.array_equal(origin_relu_eval_out, - custom_relu_dy2stat_eval_out)) + + def test_train_eval(self): + with _test_eager_guard(): + self.func_train_eval() + self.func_train_eval() def train_model(self, use_custom_op=False, dy2stat=False): # reset random seed diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py index 407eb342ba99ba09d0f6faa8686e12c2a1100cdb..a747d10823ec5572e73e7ec8ae3e5da528e3e88d 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py @@ -20,7 +20,7 @@ from paddle.utils.cpp_extension import load, get_build_directory from paddle.utils.cpp_extension.extension_utils import run_cmd from utils import paddle_includes, extra_cc_args, extra_nvcc_args, IS_WINDOWS, IS_MAC from test_custom_relu_op_setup import custom_relu_dynamic, custom_relu_static - +from paddle.fluid.framework import _test_eager_guard, _in_eager_mode # Because Windows don't use docker, the shared lib already exists in the # cache dir, it will not be compiled again unless the shared lib is removed. 
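TestDygraphModel above combines the same double-run recipe with an _in_eager_mode() gate, because dynamic-to-static conversion is not yet wired up for eager tensors. The sketch below condenses that control flow; it is not a literal excerpt, and run_dygraph and run_to_static are illustrative stand-ins for the test's own train_model calls.

import unittest

import numpy as np
import paddle
from paddle.fluid.framework import _test_eager_guard, _in_eager_mode


class TestDy2statGate(unittest.TestCase):
    def run_dygraph(self):
        return (paddle.to_tensor(np.ones([2], dtype='float32')) * 2).numpy()

    def run_to_static(self):
        # Stand-in for a @to_static path; a real test would call the
        # converted program here.
        return np.full([2], 2.0, dtype='float32')

    def func_train_eval(self):
        out = self.run_dygraph()
        if not _in_eager_mode():
            # The to_static comparison only runs in the legacy-dygraph pass
            # until dy2stat supports eager tensors.
            self.assertTrue(np.array_equal(out, self.run_to_static()))

    def test_train_eval(self):
        with _test_eager_guard():
            self.func_train_eval()
        self.func_train_eval()


if __name__ == '__main__':
    unittest.main()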
file = '{}\\custom_relu_module_jit\\custom_relu_module_jit.pyd'.format( @@ -75,7 +75,7 @@ class TestJITLoad(unittest.TestCase): "custom op out: {},\n paddle api out: {}".format( out, pd_out)) - def test_dynamic(self): + def func_dynamic(self): for device in self.devices: for dtype in self.dtypes: if device == 'cpu' and dtype == 'float16': @@ -95,8 +95,14 @@ class TestJITLoad(unittest.TestCase): "custom op x grad: {},\n paddle api x grad: {}".format( x_grad, pd_x_grad)) - def test_exception(self): + def test_dynamic(self): + with _test_eager_guard(): + self.func_dynamic() + self.func_dynamic() + + def func_exception(self): caught_exception = False + # if not _in_eager_mode(): try: x = np.random.uniform(-1, 1, [4, 8]).astype('int32') custom_relu_dynamic(custom_module.custom_relu, 'cpu', 'int32', x) @@ -114,11 +120,11 @@ class TestJITLoad(unittest.TestCase): "python/paddle/fluid/tests/custom_op/custom_relu_op.cc" in str(e)) self.assertTrue(caught_exception) - caught_exception = False # MAC-CI don't support GPU if IS_MAC: return + # if not _in_eager_mode(): try: x = np.random.uniform(-1, 1, [4, 8]).astype('int32') custom_relu_dynamic(custom_module.custom_relu, 'gpu', 'int32', x) @@ -132,6 +138,11 @@ class TestJITLoad(unittest.TestCase): str(e)) self.assertTrue(caught_exception) + def test_exception(self): + with _test_eager_guard(): + self.func_exception() + self.func_exception() + def test_load_multiple_module(self): custom_module = load( name='custom_conj_jit', diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py index 0af0aa16466ea82eeb4a9558bdbcb3de69489bb4..7c61e11a18ecd2ebbcd87fae37a8ba0a39ad56d1 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py @@ -21,6 +21,7 @@ import paddle.static as static import subprocess import numpy as np from paddle.utils.cpp_extension.extension_utils import run_cmd +from paddle.fluid.framework import _test_eager_guard def custom_relu_dynamic(func, device, dtype, np_x, use_func=True): @@ -216,7 +217,7 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase): "custom op out: {},\n paddle api out: {}".format( out, pd_out)) - def test_dynamic(self): + def func_dynamic(self): for device in self.devices: for dtype in self.dtypes: if device == 'cpu' and dtype == 'float16': @@ -236,6 +237,11 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase): "custom op x grad: {},\n paddle api x grad: {}".format( x_grad, pd_x_grad)) + def test_dynamic(self): + with _test_eager_guard(): + self.func_dynamic() + self.func_dynamic() + def test_static_save_and_load_inference_model(self): paddle.enable_static() np_data = np.random.random((1, 1, 28, 28)).astype("float32") diff --git a/python/paddle/fluid/tests/custom_op/test_custom_simple_slice.py b/python/paddle/fluid/tests/custom_op/test_custom_simple_slice.py index c60bac4060b643c01be87d82c1fcde4a8ae4be7e..f68a37b1a2f3b87f4126953ae50464d5ad8e6fe3 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_simple_slice.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_simple_slice.py @@ -20,6 +20,7 @@ import paddle from paddle.utils.cpp_extension import load, get_build_directory from paddle.utils.cpp_extension.extension_utils import run_cmd from utils import paddle_includes, extra_cc_args, extra_nvcc_args +from paddle.fluid.framework import _test_eager_guard, _in_eager_mode # Because Windows don't use docker, the shared lib already exists 
in the # cache dir, it will not be compiled again unless the shared lib is removed. @@ -39,7 +40,7 @@ custom_ops = load( class TestCustomSimpleSliceJit(unittest.TestCase): - def test_slice_output(self): + def func_slice_output(self): np_x = np.random.random((5, 2)).astype("float32") x = paddle.to_tensor(np_x) custom_op_out = custom_ops.custom_simple_slice(x, 2, 3) @@ -48,6 +49,11 @@ class TestCustomSimpleSliceJit(unittest.TestCase): np.array_equal(custom_op_out, np_out), "custom op: {},\n numpy: {}".format(np_out, custom_op_out.numpy())) + def test_slice_output(self): + with _test_eager_guard(): + self.func_slice_output() + self.func_slice_output() + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py b/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py index 12e9f50a5e4092a067c533bcdb6bcb03011d35fa..0d2cb941eafaa188c7be458b08f0f5ef35ab6238 100644 --- a/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py +++ b/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py @@ -19,7 +19,7 @@ import numpy as np from paddle.utils.cpp_extension import load, get_build_directory from utils import paddle_includes, extra_cc_args from paddle.utils.cpp_extension.extension_utils import run_cmd - +from paddle.fluid.framework import _test_eager_guard # Because Windows don't use docker, the shared lib already exists in the # cache dir, it will not be compiled again unless the shared lib is removed. file = '{}\\dispatch_op\\dispatch_op.pyd'.format(get_build_directory()) @@ -39,7 +39,7 @@ class TestJitDispatch(unittest.TestCase): def setUp(self): paddle.set_device('cpu') - def run_dispatch_test(self, func, dtype): + def run_dispatch_test_impl(self, func, dtype): np_x = np.ones([2, 2]).astype(dtype) x = paddle.to_tensor(np_x) out = func(x) @@ -50,6 +50,11 @@ class TestJitDispatch(unittest.TestCase): np.array_equal(np_x, np_out), "custom op x: {},\n custom op out: {}".format(np_x, np_out)) + def run_dispatch_test(self, func, dtype): + with _test_eager_guard(): + self.run_dispatch_test_impl(func, dtype) + self.run_dispatch_test_impl(func, dtype) + def test_dispatch_integer(self): dtypes = ["int32", "int64", "int8", "uint8", "int16"] for dtype in dtypes: diff --git a/python/paddle/fluid/tests/custom_op/test_multi_out_jit.py b/python/paddle/fluid/tests/custom_op/test_multi_out_jit.py index 97b37498c4d3d0a6b2c336c240aaf116b11e0407..4fc9270b0f44cc5778775bdab4c2b7cab95c8c3a 100644 --- a/python/paddle/fluid/tests/custom_op/test_multi_out_jit.py +++ b/python/paddle/fluid/tests/custom_op/test_multi_out_jit.py @@ -22,7 +22,7 @@ from paddle.utils.cpp_extension import load from paddle.utils.cpp_extension import load, get_build_directory from paddle.utils.cpp_extension.extension_utils import run_cmd from utils import paddle_includes, extra_cc_args - +from paddle.fluid.framework import _test_eager_guard # Because Windows don't use docker, the shared lib already exists in the # cache dir, it will not be compiled again unless the shared lib is removed. 
file = '{}\\multi_out_jit\\multi_out_jit.pyd'.format(get_build_directory()) @@ -84,7 +84,7 @@ class TestMultiOutputDtypes(unittest.TestCase): self.check_multi_outputs(res) paddle.disable_static() - def test_dynamic(self): + def func_dynamic(self): for device in self.devices: for dtype in self.dtypes: paddle.set_device(device) @@ -95,6 +95,11 @@ class TestMultiOutputDtypes(unittest.TestCase): self.assertTrue(len(outs) == 3) self.check_multi_outputs(outs, True) + def test_dynamic(self): + with _test_eager_guard(): + self.func_dynamic() + self.func_dynamic() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index e75b8d1f60bf7dbbfb500a464a3b591a0d1f7ed3..44e6f8e8f2a6d11371f21fff5a9dccefcd72ebed 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -557,6 +557,7 @@ if (APPLE OR WIN32) list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_exception) list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_iterable_dataset) list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_dataset) + list(REMOVE_ITEM TEST_OPS test_paddle_multiprocessing) endif() if (NOT WITH_GLOO) @@ -948,6 +949,7 @@ if (WITH_DISTRIBUTE AND NOT APPLE) endif() # setting timeout value as 15S +set_tests_properties(test_run PROPERTIES TIMEOUT 200) set_tests_properties(test_sync_batch_norm_op PROPERTIES TIMEOUT 120) set_tests_properties(test_cross_op PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_lod_tensor_to_selected_rows PROPERTIES TIMEOUT 200) @@ -958,6 +960,7 @@ set_tests_properties(test_bicubic_interp_op PROPERTIES TIMEOUT 120) set_tests_properties(test_deformable_conv_op PROPERTIES TIMEOUT 120) set_tests_properties(test_nearest_interp_op PROPERTIES TIMEOUT 120) set_tests_properties(test_profiler PROPERTIES TIMEOUT 120) +set_tests_properties(test_inplace_eager_fluid PROPERTIES TIMEOUT 120) set_tests_properties(test_inplace_softmax_with_cross_entropy PROPERTIES TIMEOUT 120) set_tests_properties(test_cross_entropy2_op PROPERTIES TIMEOUT 120) set_tests_properties(test_fetch_unmerged PROPERTIES TIMEOUT 120) @@ -1116,9 +1119,9 @@ set_tests_properties(test_cumprod_op PROPERTIES TIMEOUT 120) set_tests_properties(test_split_program PROPERTIES TIMEOUT 120) if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_dataparallel PROPERTIES TIMEOUT 120) - set_tests_properties(test_parallel_dygraph_unused_variables PROPERTIES TIMEOUT 120) + set_tests_properties(test_parallel_dygraph_unused_variables PROPERTIES TIMEOUT 150) set_tests_properties(test_parallel_dygraph_control_flow PROPERTIES TIMEOUT 200) - set_tests_properties(test_parallel_dygraph_no_sync PROPERTIES TIMEOUT 120) + set_tests_properties(test_parallel_dygraph_no_sync PROPERTIES TIMEOUT 150) set_tests_properties(test_parallel_dygraph_no_sync_gradient_check PROPERTIES TIMEOUT 30) set_tests_properties(test_parallel_dygraph_pipeline_parallel PROPERTIES TIMEOUT 200) set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT 200) @@ -1174,6 +1177,7 @@ if((WITH_ROCM OR WITH_GPU) AND NOT WIN32) test_collective_global_scatter PROPERTIES LABELS "RUN_TYPE=DIST") endif() + set_tests_properties(test_paddle_multiprocessing PROPERTIES TIMEOUT 120) set_tests_properties(test_reducescatter_api PROPERTIES TIMEOUT 120) set_tests_properties(test_broadcast PROPERTIES TIMEOUT 120) set_tests_properties(test_reducescatter PROPERTIES TIMEOUT 120) diff --git 
a/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py b/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py new file mode 100644 index 0000000000000000000000000000000000000000..a2b499a9e01c36eefcb9b6cb91956abc5ee0a99b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py @@ -0,0 +1,179 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.contrib import sparsity +from paddle.fluid.contrib.sparsity.supported_layer_list import supported_layers_and_prune_func_map +from paddle.fluid.dygraph.layers import Layer, _convert_camel_to_snake + + +class MyOwnLayer(Layer): + def __init__(self): + super(MyOwnLayer, self).__init__() + + def forward(self, x): + return x + + +static_tensor = None +static_tensor_mask = None + + +def my_own_pruning(tensor, m, n, mask_algo, param_name): + global static_tensor + global static_tensor_mask + if static_tensor is None: + static_tensor = np.random.rand(*tensor.shape).astype(np.float32) + if static_tensor_mask is None: + static_tensor_mask = np.random.rand(*tensor.shape).astype(np.float32) + return static_tensor, static_tensor_mask + + +class TestASPAddSupportedLayer(unittest.TestCase): + def test_add_supported_layer_via_name(self): + sparsity.add_supported_layer("test_supported_1") + sparsity.add_supported_layer("test_supported_2", my_own_pruning) + sparsity.add_supported_layer(MyOwnLayer) + my_own_layer_name = _convert_camel_to_snake(MyOwnLayer.__name__) + + self.assertTrue( + "test_supported_1" in supported_layers_and_prune_func_map) + self.assertTrue( + "test_supported_2" in supported_layers_and_prune_func_map) + self.assertTrue( + "test_supported_2" in supported_layers_and_prune_func_map) + self.assertTrue(supported_layers_and_prune_func_map["test_supported_2"] + == my_own_pruning) + self.assertTrue( + my_own_layer_name in supported_layers_and_prune_func_map) + + +class TestASPStaticCustomerizedPruneFunc(unittest.TestCase): + def setUp(self): + paddle.enable_static() + + self.main_program = fluid.Program() + self.startup_program = fluid.Program() + + self.customer_prefix = "customer_layer" + + def build_model(): + img = fluid.data( + name='img', shape=[None, 3, 32, 32], dtype='float32') + label = fluid.data(name='label', shape=[None, 1], dtype='int64') + hidden = fluid.layers.conv2d( + input=img, num_filters=4, filter_size=3, padding=2, act="relu") + hidden = fluid.layers.fc(input=hidden, + size=32, + act='relu', + name=self.customer_prefix) + hidden = fluid.layers.fc(input=hidden, + size=32, + act='relu', + name=self.customer_prefix) + hidden = fluid.layers.fc(input=hidden, size=32, act='relu') + prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') + return img, label, prediction 
+ + with fluid.program_guard(self.main_program, self.startup_program): + self.img, self.label, self.predict = build_model() + self.supported_layer_count_ref = 5 + + self.place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + self.place = paddle.CUDAPlace(0) + self.exe = fluid.Executor(self.place) + + sparsity.add_supported_layer(self.customer_prefix, my_own_pruning) + + def test_inference_pruning(self): + self.exe.run(self.startup_program) + + sparsity.prune_model( + self.main_program, mask_algo="mask_1d", with_mask=False) + + supported_layer_count = 0 + for param in self.main_program.global_block().all_parameters(): + mat = np.array(fluid.global_scope().find_var(param.name).get_tensor( + )) + if sparsity.asp.ASPHelper._is_supported_layer(self.main_program, + param.name): + supported_layer_count += 1 + if (self.customer_prefix in param.name): + self.assertLessEqual( + np.sum(mat.flatten() - static_tensor.flatten()), 1e-4) + else: + self.assertTrue( + sparsity.check_sparsity( + mat.T, + func_name=sparsity.CheckMethod.CHECK_1D, + n=2, + m=4)) + self.assertEqual(supported_layer_count, self.supported_layer_count_ref) + + def test_training_pruning(self): + with fluid.program_guard(self.main_program, self.startup_program): + loss = fluid.layers.mean( + fluid.layers.cross_entropy( + input=self.predict, label=self.label)) + optimizer = sparsity.decorate( + fluid.optimizer.SGD(learning_rate=0.01)) + optimizer.minimize(loss, self.startup_program) + + self.exe.run(self.startup_program) + + sparsity.prune_model( + self.main_program, mask_algo="mask_1d", with_mask=True) + + supported_layer_count = 0 + for param in self.main_program.global_block().all_parameters(): + mat = np.array(fluid.global_scope().find_var(param.name).get_tensor( + )) + if sparsity.asp.ASPHelper._is_supported_layer(self.main_program, + param.name): + mat_mask = np.array(fluid.global_scope().find_var( + sparsity.asp.ASPHelper._get_mask_name(param.name)) + .get_tensor()) + supported_layer_count += 1 + if (self.customer_prefix in param.name): + self.assertLessEqual( + np.sum(mat.flatten() - static_tensor.flatten()), 1e-4) + self.assertLessEqual( + np.sum(mat_mask.flatten() - static_tensor_mask.flatten( + )), 1e-4) + else: + self.assertTrue( + sparsity.check_sparsity( + mat.T, + func_name=sparsity.CheckMethod.CHECK_1D, + n=2, + m=4)) + self.assertTrue( + sparsity.check_sparsity( + mat_mask.T, + func_name=sparsity.CheckMethod.CHECK_1D, + n=2, + m=4)) + self.assertEqual(supported_layer_count, self.supported_layer_count_ref) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index 80bc206ae7b7952aea55cb93bd42346dc019633b..a730d21afa57980538841a3ad7fe874fd2343d4a 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -9,4 +9,12 @@ if(WITH_DISTRIBUTE AND WITH_GPU) set_tests_properties(test_relaunch_with_gpt_planner PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 240) py_test_modules(test_engine_api MODULES test_engine_api ENVS ${dist_ENVS}) set_tests_properties(test_engine_api PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 80) + py_test_modules(test_while_op_completion MODULES test_while_op_completion ENVS ${dist_ENVS}) + py_test_modules(test_converter MODULES test_converter ENVS ${dist_ENVS}) + set_tests_properties(test_converter PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) + + 
py_test_modules(test_tunable_variable MODULES test_tunable_variable ENVS ${dist_ENVS}) + py_test_modules(test_tunable_space MODULES test_tunable_space ENVS ${dist_ENVS}) + py_test_modules(test_recorder MODULES test_recorder ENVS ${dist_ENVS}) + py_test_modules(test_trial MODULES test_trial ENVS ${dist_ENVS}) endif() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/converter.py b/python/paddle/fluid/tests/unittests/auto_parallel/converter.py new file mode 100644 index 0000000000000000000000000000000000000000..e34f267b4237bf5ebe19adda1c90f1c147294333 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/converter.py @@ -0,0 +1,83 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np + +import paddle +from paddle.distributed.auto_parallel.converter import Converter + + +def test_convert(): + rank_id = paddle.distributed.get_rank() + complete_tensor = np.arange(64).reshape([8, 8]) + tensor_row = np.split(complete_tensor, 2, axis=0) + tensor_col = np.split(complete_tensor, 2, axis=1) + tensor_name = "tensor_0" + complet_strategy = { + tensor_name: { + "process_shape": [2], + "process_group": [0, 1], + "dims_mapping": [-1, -1] + } + } + row_strategy = { + tensor_name: { + "process_shape": [2], + "process_group": [0, 1], + "dims_mapping": [0, -1] + } + } + col_strategy = { + tensor_name: { + "process_shape": [2], + "process_group": [0, 1], + "dims_mapping": [-1, 0] + } + } + + # test merge + tensor_dict = {tensor_name: tensor_row} + converter = Converter(tensor_dict, row_strategy, complet_strategy) + convert_tensor_dict = converter.convert() + assert np.equal(convert_tensor_dict[tensor_name], complete_tensor).all() + + # test slice + tensor_dict = {tensor_name: [complete_tensor]} + converter = Converter(tensor_dict, complet_strategy, col_strategy) + convert_tensor_dict = converter.convert() + assert np.equal(convert_tensor_dict[tensor_name], tensor_col[rank_id]).all() + + # test merge and slice + tensor_dict = {tensor_name: tensor_col} + converter = Converter(tensor_dict, col_strategy, row_strategy) + convert_tensor_dict = converter.convert() + assert np.equal(convert_tensor_dict[tensor_name], tensor_row[rank_id]).all() + + # test merge and slice with prefix match + new_name = "tensor_1" + row_strategy = { + new_name: { + "process_shape": [2], + "process_group": [0, 1], + "dims_mapping": [0, -1] + } + } + converter = Converter(tensor_dict, col_strategy, row_strategy) + convert_tensor_dict = converter.convert(strict=False) + assert np.equal(convert_tensor_dict[new_name], tensor_row[rank_id]).all() + + +if __name__ == "__main__": + test_convert() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_converter.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..fbadbb7d8c1cfe8bb92e0287694d48a0a546f206 --- /dev/null +++ 
b/python/paddle/fluid/tests/unittests/auto_parallel/test_converter.py @@ -0,0 +1,69 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import os +import sys +import shutil +import subprocess +from paddle.distributed.fleet.launch_utils import run_with_coverage +from paddle.distributed.auto_parallel.converter import Converter + + +class TestConverter(unittest.TestCase): + def test_converter(self): + file_dir = os.path.dirname(os.path.abspath(__file__)) + launch_model_path = os.path.join(file_dir, "converter.py") + + if os.environ.get("WITH_COVERAGE", "OFF") == "ON": + coverage_args = ["-m", "coverage", "run", "--branch", "-p"] + else: + coverage_args = [] + + cmd = [sys.executable, "-u"] + coverage_args + [ + "-m", "launch", "--gpus", "0,1", launch_model_path + ] + + process = subprocess.Popen(cmd) + process.wait() + self.assertEqual(process.returncode, 0) + + # Remove unnecessary files + log_path = os.path.join(file_dir, "log") + if os.path.exists(log_path): + shutil.rmtree(log_path) + + def test_input_invalid(self): + with self.assertRaises(ValueError): + Converter({}, [], []) + with self.assertRaises(TypeError): + Converter([0, 1], [], []) + with self.assertRaises(ValueError): + Converter({"tmp_0": [0]}, {}, []) + with self.assertRaises(TypeError): + Converter({"tmp_0": [0]}, [0], []) + + strategy_1 = { + 'tmp_0': { + "process_shape": [1], + "process_group": [0], + "dims_mapping": [-1] + } + } + with self.assertRaises(TypeError): + Converter({"tmp_0": [0]}, strategy_1, []) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py index a7d51a7e176d475763f7368c509dd926e81d0b0f..d150da761aad3de3ab09f257d3b638cf37c27996 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_recorder.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_recorder.py new file mode 100644 index 0000000000000000000000000000000000000000..ab704a6a25714ef2fb935d6e3e776105aa4142cc --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_recorder.py @@ -0,0 +1,152 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np + +from paddle.distributed.auto_parallel.tuner import recorder as rd + + +class TestRecorder(unittest.TestCase): + def test_register(self): + recorder = rd.MetricsRecorder() + recorder.register("metric") + self.assertEqual(set(recorder.records.keys()), {"metric"}) + self.assertEqual(recorder.records["metric"].direction, "min") + + def test_exists(self): + recorder = rd.MetricsRecorder() + recorder.register("metric", direction="max") + self.assertTrue(recorder.exists("metric")) + + def test_update(self): + recorder = rd.MetricsRecorder() + recorder.update("metric", 4, 1000) + self.assertEqual(recorder.records["metric"].direction, "min") + self.assertEqual( + recorder.get_records("metric"), [rd.MetricRecord(4, 1000)]) + + def test_get_records(self): + recorder = rd.MetricsRecorder() + recorder.update("metric", 1, step=0) + recorder.update("metric", 2, step=1) + recorder.update("metric", 3, step=2) + recorder.update("metric", 4, step=3) + self.assertEqual( + recorder.get_records("metric"), [ + rd.MetricRecord(1, 0), + rd.MetricRecord(2, 1), + rd.MetricRecord(3, 2), + rd.MetricRecord(4, 3), + ]) + + def test_set_records(self): + recorder = rd.MetricsRecorder() + recorder.set_records( + "metric", + [ + rd.MetricRecord(1, 0), + rd.MetricRecord(2, 1), + rd.MetricRecord(3, 2), + rd.MetricRecord(4, 3), + ], ) + self.assertEqual( + recorder.get_records("metric"), [ + rd.MetricRecord(1, 0), + rd.MetricRecord(2, 1), + rd.MetricRecord(3, 2), + rd.MetricRecord(4, 3), + ]) + + def test_get_best_value(self): + recorder = rd.MetricsRecorder() + recorder.register("metric_min", "min") + recorder.register("metric_max", "max") + + recorder.set_records( + "metric_min", + [ + rd.MetricRecord(1, 0), + rd.MetricRecord(2, 1), + rd.MetricRecord(3, 2), + rd.MetricRecord(4, 3), + ], ) + self.assertEqual(recorder.get_best_value("metric_min"), 1) + + recorder.set_records( + "metric_max", + [ + rd.MetricRecord(1, 0), + rd.MetricRecord(2, 1), + rd.MetricRecord(3, 2), + rd.MetricRecord(4, 3), + ], ) + self.assertEqual(recorder.get_best_value("metric_max"), 4) + + def test_get_best_step(self): + recorder = rd.MetricsRecorder() + + recorder.register("metric_min", "min") + recorder.set_records( + "metric_min", + [ + rd.MetricRecord(1, 0), + rd.MetricRecord(2, 1), + rd.MetricRecord(3, 2), + rd.MetricRecord(4, 3), + ], ) + self.assertEqual(recorder.get_best_step("metric_min"), 0) + + recorder.register("metric_max", "max") + recorder.set_records( + "metric_max", + [ + rd.MetricRecord(1, 0), + rd.MetricRecord(2, 1), + rd.MetricRecord(3, 2), + rd.MetricRecord(4, 3), + ], ) + self.assertEqual(recorder.get_best_step("metric_max"), 3) + + def test_get_statistics(self): + recorder = rd.MetricsRecorder() + records = [rd.MetricRecord(np.random.random(), i) for i in range(14)] + recorder.set_records("metric", records) + stats = recorder.get_statistics("metric") + records = [r.value for r in records] + self.assertEqual(stats["min"], np.min(records)) + self.assertEqual(stats["max"], np.max(records)) + self.assertEqual(stats["mean"], np.mean(records)) + self.assertEqual(stats["median"], 
np.median(records)) + self.assertEqual(stats["var"], np.var(records)) + self.assertEqual(stats["std"], np.std(records)) + + def test_serialization(self): + recorder = rd.MetricsRecorder() + recorder.register("metric") + recorder.set_records( + "metric", + [ + rd.MetricRecord(1, 0), + rd.MetricRecord(2, 1), + rd.MetricRecord(3, 2), + rd.MetricRecord(4, 3), + ], ) + print(recorder.get_state()) + new_recorder = rd.MetricsRecorder.from_state(recorder.get_state()) + self.assertEqual(new_recorder.records.keys(), recorder.records.keys()) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_trial.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_trial.py new file mode 100644 index 0000000000000000000000000000000000000000..fc52d1c394effc223a609ae5db73ea89a25c298b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_trial.py @@ -0,0 +1,53 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from paddle.distributed.auto_parallel.tuner import tunable_space as ts +from paddle.distributed.auto_parallel.tuner import trial as tr + + +class TestTiral(unittest.TestCase): + def test_trial(self): + space = ts.TunableSpace() + space.choice("choice", [0, 1, 2, 3], default=2) + trial = tr.Trial(space, trial_id="trial-1") + trial.recorder.register("latency", direction="min") + trial.recorder.update("latency", 0.1, step=0) + trial.recorder.update("latency", 0.2, step=1) + trial.best_step = 0 + + self.assertEqual(trial.id, "trial-1") + self.assertEqual(trial.space.get_value("choice"), 2) + self.assertEqual(trial.best_step, 0) + self.assertEqual(trial.status, "RUNNING") + + def test_serialization(self): + space = ts.TunableSpace() + space.int_range("int_range", start=1, stop=4, default=2) + trial = tr.Trial(space, trial_id="trial-2", status="COMPLETED") + trial.recorder.register("latency", direction="min") + trial.recorder.update("latency", 0.1, step=0) + trial.recorder.update("latency", 0.2, step=1) + trial.best_step = 0 + + new_trial = tr.Trial.from_state(trial.get_state()) + self.assertEqual(new_trial.id, "trial-2") + self.assertEqual(new_trial.space.get_value("int_range"), 2) + self.assertEqual(new_trial.best_step, 0) + self.assertEqual(new_trial.status, "COMPLETED") + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_space.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_space.py new file mode 100644 index 0000000000000000000000000000000000000000..cb7104f9ef641c00af91f461495eae0caa3c7cd1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_space.py @@ -0,0 +1,138 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from paddle.distributed.auto_parallel.tuner import tunable_space as ts + + +class TestTunableSpace(unittest.TestCase): + def test_fixed(self): + space = ts.TunableSpace() + fixed = space.fixed("fixed", default=4) + self.assertEqual(space.values["fixed"], 4) + self.assertEqual(len(space.variables), 1) + self.assertEqual(space.variables["fixed"].name, "fixed") + + space.values["fixed"] = 2 + self.assertEqual(space.get_value("fixed"), 2) + self.assertEqual(space.values, {"fixed": 2}) + self.assertEqual(len(space.variables), 1) + self.assertEqual(space.variables["fixed"].name, "fixed") + + def test_boolean(self): + space = ts.TunableSpace() + boolean = space.boolean("boolean") + self.assertEqual(space.values["boolean"], False) + self.assertEqual(len(space.variables), 1) + self.assertEqual(space.variables["boolean"].name, "boolean") + + space.values["boolean"] = True + self.assertEqual(space.get_value("boolean"), True) + self.assertEqual(space.values, {"boolean": True}) + self.assertEqual(len(space.variables), 1) + self.assertEqual(space.variables["boolean"].name, "boolean") + + def test_choice(self): + space = ts.TunableSpace() + choice = space.choice("choice", [1, 2, 3, 4], default=4) + self.assertEqual(space.values["choice"], 4) + self.assertEqual(len(space.variables), 1) + self.assertEqual(space.variables["choice"].name, "choice") + + space.values["choice"] = 2 + self.assertEqual(space.get_value("choice"), 2) + self.assertEqual(space.values, {"choice": 2}) + self.assertEqual(len(space.variables), 1) + self.assertEqual(space.variables["choice"].name, "choice") + + def test_int_range(self): + space = ts.TunableSpace() + int_range = space.int_range("int_range", start=1, stop=4, default=2) + self.assertEqual(space.values["int_range"], 2) + self.assertEqual(len(space.variables), 1) + self.assertEqual(space.variables["int_range"].name, "int_range") + + space.values["int_range"] = 3 + self.assertEqual(space.get_value("int_range"), 3) + self.assertEqual(space.values, {"int_range": 3}) + self.assertEqual(len(space.variables), 1) + self.assertEqual(space.variables["int_range"].name, "int_range") + + def test_float_range(self): + space = ts.TunableSpace() + float_range = space.float_range( + "float_range", start=0.4, stop=4.4, default=2.0) + self.assertEqual(space.values["float_range"], 2.0) + self.assertEqual(len(space.variables), 1) + self.assertEqual(space.variables["float_range"].name, "float_range") + + space.values["float_range"] = 3.0 + self.assertEqual(space.get_value("float_range"), 3.0) + self.assertEqual(space.values, {"float_range": 3.0}) + self.assertEqual(len(space.variables), 1) + self.assertEqual(space.variables["float_range"].name, "float_range") + + def test_varibles(self): + space = ts.TunableSpace() + choice = space.choice("choice", [1, 2, 3, 4], default=4) + self.assertEqual(space.values["choice"], 4) + self.assertEqual(len(space.variables), 1) + self.assertEqual(space.variables["choice"].name, "choice") + + int_range = space.int_range("int_range", start=1, stop=4, default=2) + self.assertEqual(space.values["int_range"], 2) + 
self.assertEqual(len(space.variables), 2) + self.assertEqual(space.variables["int_range"].name, "int_range") + + def test_not_populated_variable(self): + space = ts.TunableSpace() + choice = space.choice("choice", [1, 2, 3, 4], default=2) + self.assertEqual(choice, 2) + + def test_populated_variable(self): + space = ts.TunableSpace() + space.values["choice"] = 2 + choice = space.choice("choice", [1, 2, 3, 4], default=4) + self.assertEqual(choice, 2) + + space["choice"] = 3 + self.assertNotEqual(space.values["choice"], 2) + self.assertEqual(space.values["choice"], 3) + + def test_state(self): + space = ts.TunableSpace() + choice = space.choice("choice", [1, 2, 3, 4], default=4) + int_range = space.int_range("int_range", start=1, stop=4, default=2) + + new_space = space.from_state(space.get_state()) + self.assertEqual(new_space.get_value("choice"), 4) + self.assertEqual(new_space.get_value("int_range"), 2) + self.assertEqual(len(new_space.variables), 2) + self.assertEqual(len(new_space.values), 2) + + self.assertEqual(new_space.variables["choice"].name, "choice") + self.assertEqual(new_space.variables["choice"].default, 4) + self.assertEqual(new_space.variables["choice"].values, [1, 2, 3, 4]) + + self.assertEqual(new_space.variables["int_range"].name, "int_range") + self.assertEqual(new_space.variables["int_range"].default, 2) + self.assertEqual(new_space.variables["int_range"].start, 1) + self.assertEqual(new_space.variables["int_range"].stop, 4) + self.assertEqual(new_space.variables["int_range"].step, 1) + self.assertEqual(new_space.variables["int_range"].endpoint, False) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_variable.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_variable.py new file mode 100644 index 0000000000000000000000000000000000000000..c36fca7a9d09a6fb15664226ac0be441fbf49c3e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_variable.py @@ -0,0 +1,99 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +from paddle.distributed.auto_parallel.tuner import tunable_variable as tv + + +class TestTunableVariable(unittest.TestCase): + def test_fixed(self): + fixed = tv.Fixed("fixed", True) + fixed = tv.Fixed.from_state(fixed.get_state()) + self.assertEqual(fixed.default, True) + self.assertEqual(fixed.random(), True) + + fixed = tv.Fixed("fixed", 1) + fixed = tv.Fixed.from_state(fixed.get_state()) + self.assertEqual(fixed.default, 1) + self.assertEqual(fixed.random(), 1) + + def test_boolean(self): + boolean = tv.Boolean("bool") + boolean = tv.Boolean.from_state(boolean.get_state()) + self.assertEqual(boolean.default, False) + self.assertIn(boolean.random(), [True, False]) + self.assertIn(boolean.random(1234), [True, False]) + + boolean = tv.Boolean("bool", True) + boolean = tv.Boolean.from_state(boolean.get_state()) + self.assertEqual(boolean.default, True) + self.assertIn(boolean.random(), [True, False]) + self.assertIn(boolean.random(1234), [True, False]) + + def test_choice(self): + choice = tv.Choice("choice", [1, 2, 3, 4]) + choice = tv.Choice.from_state(choice.get_state()) + self.assertEqual(choice.default, 1) + self.assertIn(choice.random(), [1, 2, 3, 4]) + self.assertIn(choice.random(1234), [1, 2, 3, 4]) + + choice = tv.Choice("choice", [1, 2, 3, 4], default=2) + choice = tv.Choice.from_state(choice.get_state()) + self.assertEqual(choice.default, 2) + self.assertIn(choice.random(), [1, 2, 3, 4]) + self.assertIn(choice.random(1234), [1, 2, 3, 4]) + + def test_int_range(self): + int_range = tv.IntRange("int_range", start=1, stop=4, default=2) + int_range = tv.IntRange.from_state(int_range.get_state()) + self.assertEqual(int_range.default, 2) + self.assertIn(int_range.random(), [1, 2, 3, 4]) + self.assertIn(int_range.random(1234), [1, 2, 3, 4]) + self.assertNotEqual(int_range.default, 4) + + int_range = tv.IntRange( + "int_range", start=1, stop=8, step=2, default=3, endpoint=True) + int_range = tv.IntRange.from_state(int_range.get_state()) + self.assertEqual(int_range.default, 3) + self.assertIn(int_range.random(), [1, 3, 5, 7]) + self.assertIn(int_range.random(1234), [1, 3, 5, 7]) + self.assertNotEqual(int_range.default, 2) + + def test_float_range(self): + float_range = tv.FloatRange( + "float_range", start=0.4, stop=4.4, default=2.0) + float_range = tv.FloatRange.from_state(float_range.get_state()) + self.assertEqual(float_range.default, 2.0) + self.assertGreater(float_range.random(), 0.4) + self.assertLess(float_range.random(1234), 4.4) + self.assertNotAlmostEqual(float_range.random(), 1) + self.assertNotAlmostEqual(float_range.random(), 4.4) + + float_range = tv.FloatRange( + "float_range", + start=0.4, + stop=8.4, + step=2.0, + default=3.0, + endpoint=True) + float_range = tv.FloatRange.from_state(float_range.get_state()) + self.assertEqual(float_range.default, 3.0) + self.assertGreater(float_range.random(), 0.4) + self.assertLessEqual(float_range.random(1234), 8.4) + self.assertNotAlmostEqual(float_range.random(), 2) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py new file mode 100644 index 0000000000000000000000000000000000000000..1179fd9a9f0887f5133349118eb5b4c8fbab733d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py @@ -0,0 +1,209 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +import numpy as np +import paddle.nn as nn +import paddle.utils as utils +import paddle.static as static +import paddle.nn.functional as F +import paddle.distributed.auto_parallel as auto + +from paddle.distributed import fleet +from paddle.distributed.auto_parallel.completion import Completer +from paddle.distributed.auto_parallel.partitioner import Partitioner +from paddle.distributed.auto_parallel.utils import make_data_unshard +from paddle.distributed.auto_parallel.dist_attribute import OperatorDistributedAttribute, TensorDistributedAttribute +from paddle.distributed.auto_parallel.dist_context import DistributedContext, get_default_distributed_context +from paddle.distributed.auto_parallel.operators import find_best_compatible_distributed_operator_impl +from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr + +paddle.enable_static() + +batch_size = 4 +epoch_num = 10 +hidden_size = 1024 +sequence_len = 512 +_g_process_mesh = [[0, 1], [2, 3]] + + +def get_random_inputs_and_labels(input_shape, label_shape): + input = np.random.random(size=input_shape).astype('float32') + label = np.random.random(size=label_shape).astype('float32') + return input, label + + +def batch_generator_creator(): + def __reader__(): + for _ in range(batch_size): + batch_input, batch_label = get_random_inputs_and_labels( + [batch_size, sequence_len, hidden_size], + [batch_size, sequence_len, 1]) + yield batch_input, batch_label + + return __reader__ + + +class MLPLayer(nn.Layer): + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + dropout_ratio=0.1, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + param_initializer = nn.initializer.Normal( + mean=0.0, std=initializer_range) + + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + self.linear0 = nn.Linear( + d_model, + dim_feedforward, + weight_attr=paddle.ParamAttr(initializer=param_initializer), + bias_attr=None) + self.linear1 = nn.Linear( + dim_feedforward, + d_model, + weight_attr=paddle.ParamAttr(initializer=param_initializer), + bias_attr=None) + + def forward(self, input): + out = self.norm(input) + auto.shard_tensor( + self.linear0.weight, + dist_attr={ + "process_mesh": _g_process_mesh[0], + "dims_mapping": [-1, 0] + }) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + auto.shard_tensor( + self.linear1.weight, + dist_attr={ + "process_mesh": _g_process_mesh[1], + "dims_mapping": [0, -1] + }) + out = self.linear1(out) + + return out + + +def loop_cond(i, loop_len, input_array): + return i < loop_len + + +def loop_body(i, loop_len, input_array): + pre_input = paddle.tensor.array_read(array=input_array, i=i) + mlp_while0 = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + + mlp_while1 = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + 
dropout_ratio=0.1, + initializer_range=0.02) + + output = mlp_while0(pre_input) + cur_pred = mlp_while1(output) + # update the loop condition + i = paddle.increment(x=i, value=1) + paddle.tensor.array_write(cur_pred, array=input_array, i=i) + return i, loop_len, input_array + + +def get_program(): + dist_strategy = fleet.DistributedStrategy() + dist_strategy.semi_auto = True + # fleet.init(is_collective=True, strategy=dist_strategy) + + train_program = static.Program() + start_program = static.Program() + with static.program_guard(train_program, start_program): + + # loop counter + i = paddle.full(shape=[1], fill_value=0, dtype='int64') + # number of loop iterations + loop_len = paddle.full(shape=[1], fill_value=epoch_num, dtype='int64') + + # input + input = static.data( + name="input", + shape=[batch_size, sequence_len, hidden_size], + dtype='float32') + label = static.data( + name="label", shape=[batch_size, sequence_len, 1], dtype='float32') + data_holder = [input, label] + # dataloader + dataloader = paddle.io.DataLoader.from_generator( + feed_list=data_holder, capacity=4 * batch_size, iterable=False) + dataloader.set_batch_generator( + batch_generator_creator(), places=paddle.static.cuda_places()) + # data dist_attr + auto.shard_tensor( + input, + dist_attr={ + "process_mesh": _g_process_mesh[0], + "dims_mapping": [-1, -1, -1] + }) + auto.shard_tensor( + label, + dist_attr={ + "process_mesh": _g_process_mesh[0], + "dims_mapping": [-1, -1, -1] + }) + + mlp_start = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + pred = mlp_start(input) + + input_array = paddle.tensor.array_write(pred, i) + i, loop_len, input_array = static.nn.while_loop( + cond=loop_cond, + body=loop_body, + loop_vars=[i, loop_len, input_array]) + end_pred = paddle.tensor.array_read(array=input_array, i=i) + + mlp_end = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + pred = mlp_end(end_pred) + + error_cost = paddle.nn.functional.square_error_cost(pred, label) + loss = paddle.mean(error_cost) + + return train_program, start_program, dataloader, i, loss + + +class TestMLP(unittest.TestCase): + def test_completer(self): + train_program, start_program, dataloader, i, loss = get_program() + dist_context = DistributedContext() + completer = Completer(dist_context) + complete_train_program = completer.complete_forward_annotation( + train_program) + # print_program_with_dist_attr(complete_train_program, dist_context) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py index cac64c7391351b23c4b9f9275c4b20bf85f571fd..2b8307461b8f57ea73503cf6ad4e8a90cdba652c 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py @@ -27,6 +27,7 @@ from paddle.fluid.dygraph.nn import Conv2D, Linear, Pool2D from paddle.fluid.optimizer import AdamOptimizer from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator +from paddle.fluid.framework import _test_eager_guard from predictor_utils import PredictorTools @@ -155,6 +156,13 @@ class TestMNISTWithToStatic(TestMNIST): np.allclose(dygraph_loss, static_loss), msg='dygraph is {}\n static_res is \n{}'.format(dygraph_loss, static_loss)) + with _test_eager_guard(): + 
dygraph_loss = self.train_dygraph() + static_loss = self.train_static() + self.assertTrue( + np.allclose(dygraph_loss, static_loss), + msg='dygraph is {}\n static_res is \n{}'.format(dygraph_loss, + static_loss)) def test_mnist_declarative_cpu_vs_mkldnn(self): dygraph_loss_cpu = self.train_dygraph() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py index 06d69daa75d1c755d3c9f2b111e31297c4905d8f..d05be03bbfb193ae25ee039aef1608afdef4f585 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py @@ -223,6 +223,12 @@ def dyfunc_len_paddle_shape(): print(x) +def dyfunc_dict_assign_shape(): + x = paddle.to_tensor([1, 2]) + a = {} + a['shape'] = x.shape[0] + + # 1. Basic tests without control flow class TestTensorShapeBasic(unittest.TestCase): def setUp(self): @@ -592,6 +598,8 @@ class TestPaddleShape(unittest.TestCase): def test_paddle_shape(self): func = paddle.jit.to_static(dyfunc_len_paddle_shape) self.assertEqual('paddle.shape(x)' in func.code, True) + func = paddle.jit.to_static(dyfunc_dict_assign_shape) + self.assertEqual("__static_convert_var_shape_suffix" in func.code, True) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt index 959700ad743b40420200b56055354279386a9a7c..79a2430a161703348824d8e4e687bf85569c408a 100644 --- a/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt @@ -4,5 +4,11 @@ if(WITH_IPU) foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) + # set all UTs timeout to 200s + set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 200) endforeach(TEST_OP) + + set_tests_properties(test_conv_op_ipu PROPERTIES TIMEOUT 300) + set_tests_properties(test_elemetwise_x_op_ipu PROPERTIES TIMEOUT 300) + set_tests_properties(test_reduce_x_op_ipu PROPERTIES TIMEOUT 600) endif() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op.py b/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op.py deleted file mode 100644 index 58a88c113fc0b6b82c1c58d50a1b0824cb530632..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op.py +++ /dev/null @@ -1,126 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.nn.functional as F -import paddle.optimizer -import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() - - -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") -class TestRelu(IPUOpTest): - def setUp(self): - self.set_atol() - self.set_training() - self.init_op() - - def init_op(self): - self.op = paddle.fluid.layers.relu - - def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] - - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED - - with fluid.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) - out = self.op(x, **self.attrs) - - fetch_list = [out.name] - - if run_ipu: - place = paddle.IPUPlace() - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if run_ipu: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IpuCompiler( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] - - def run_test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) - - self.assertTrue(res0.shape == res1.shape) - - def test_case0(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'), - } - self.attrs = {} - self.set_feed_attr() - self.run_test_base() - - -class TestTanh(TestRelu): - def init_op(self): - self.op = F.tanh - - -class TestLog(TestRelu): - def init_op(self): - self.op = paddle.fluid.layers.log - - -class TestSigmoid(TestRelu): - def init_op(self): - self.op = F.sigmoid - - -class TestSqrt(TestRelu): - def init_op(self): - self.op = paddle.fluid.layers.sqrt - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py index 1dab958c1ecbc806df94c651cf4a2d6cd82f3ddb..c640cd441f1b2589bcc3ffa466b865d1fb34c582 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py @@ -115,7 +115,7 @@ class TestBase(IPUOpTest): class TestCase1(TestBase): def set_atol(self): - self.atol = 1e-7 + self.atol = 1e-6 self.rtol = 1e-6 self.atol_fp16 = 1e-3 self.rtol_fp16 = 1e-3 @@ -129,7 +129,7 @@ class TestCase1(TestBase): class TestCase2(TestBase): def set_atol(self): - self.atol = 1e-7 + self.atol = 1e-6 self.rtol = 1e-6 self.atol_fp16 = 1e-3 self.rtol_fp16 = 1e-3 diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_fp16_support.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_fp16_support.py deleted file mode 100644 index 
aa6c05dc59a87f844c19912be484a4b007f0adfc..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_fp16_support.py +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer -import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() - - -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") -class TestBase(IPUOpTest): - def setUp(self): - self.set_atol() - self.set_feed() - self.set_feed_attr() - self.set_attrs() - - def set_feed(self): - np_data = np.random.uniform(low=-1, high=1, size=[1, 3, 100, 100]) - self.feed_ipu = {"x": np_data.astype('float16')} - self.feed_cpu = {"x": np_data.astype('float32')} - - def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed_cpu.values()] - self.feed_list = list(self.feed_cpu.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed_cpu.values() - ] - - def set_attrs(self): - self.attrs = {} - - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED - - with fluid.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) - conv1 = paddle.static.nn.conv2d( - x, num_filters=3, filter_size=3, bias_attr=False) - conv2 = paddle.static.nn.conv2d( - x, num_filters=3, filter_size=3, bias_attr=False) - add1 = conv1 + conv2 - conv3 = paddle.static.nn.conv2d( - add1, num_filters=8, filter_size=8, bias_attr=False) - out = paddle.fluid.layers.relu(conv3, **self.attrs) - fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - feed = self.feed_ipu if run_ipu else self.feed_cpu - if run_ipu: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=False) - ipu_strategy.SetHalfConfig(enable_fp16=True) - program = compiler.IPUCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - feed_list = self.feed_list - program = main_prog - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) - - self.assertTrue(res0.shape == res1.shape) - mae = np.mean(np.abs(res0.flatten() - res1.flatten())) - print("mae is ", mae) - self.assertTrue(mae < 0.001) - - -if __name__ == "__main__": - unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_elementwise_add_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_elementwise_add_fuse_pass.py index 66c547de2c28068a9605c506416baabdf229265d..2e84607e2f5c24bde4b3d291cc1b7f33f89c5751 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_elementwise_add_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_elementwise_add_fuse_pass.py @@ -25,17 +25,120 @@ from hypothesis import given, settings, seed, example, assume import hypothesis.strategies as st -class TestConvElementwiseAddMkldnnFusePass(PassAutoScanTest): +# the two inputs of elementwise_add are tensor +class TestConvElementwiseAddMkldnnFusePass1(PassAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: attrs = [ program_config.ops[i].attrs for i in range(len(program_config.ops)) ] - # If the problem has been fixed, the judgment - # needs to be deleted!!! - if attrs[1]['data_format'] == "NHWC": + if attrs[1]['data_format'] == "NHWC" and attrs[3]['axis'] == 0: + return False + if attrs[1]['data_format'] == "NCHW" and attrs[3]['axis'] == -1: return False + return True + + def sample_program_config(self, draw): + data_format = draw(st.sampled_from(["NCHW", "NHWC"])) + dilations = draw(st.sampled_from([[1, 1], [2, 2], [1, 2]])) + padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"])) + groups = draw(st.sampled_from([1, 2, 4])) + paddings = draw(st.sampled_from([[0, 3], [1, 1], [1, 2, 3, 4]])) + strides = draw(st.sampled_from([[1, 1], [2, 2], [1, 2]])) + axis = draw(st.sampled_from([-1, 0])) + batch_size = draw(st.integers(min_value=1, max_value=4)) + + def generate_input(): + if data_format == "NCHW": + return np.random.random( + [batch_size, 48, 64, 64]).astype(np.float32) + else: + return np.random.random( + [batch_size, 64, 64, 48]).astype(np.float32) + + def generate_weight(): + return np.random.random( + [48, int(48 / groups), 3, 3]).astype(np.float32) + + relu_op = OpConfig( + type="relu", + inputs={"X": ["input_data"]}, + outputs={"Out": ["relu_out"]}, + attrs={}) + + conv2d_op1 = OpConfig( + type="conv2d", + inputs={"Input": ["relu_out"], + "Filter": ["conv_weight1"]}, + outputs={"Output": ["conv_output1"]}, + attrs={ + "data_format": data_format, + "dilations": dilations, + "padding_algorithm": padding_algorithm, + "groups": groups, + "paddings": paddings, + "strides": strides + }) + + conv2d_op2 = OpConfig( + type="conv2d", + inputs={"Input": ["input_data"], + "Filter": ["conv_weight2"]}, + outputs={"Output": ["conv_output2"]}, + attrs={ + "data_format": data_format, + "dilations": dilations, + "padding_algorithm": padding_algorithm, + "groups": groups, + "paddings": paddings, + "strides": strides + }) + + elt_op = OpConfig( + type="elementwise_add", + inputs={"X": ["conv_output1"], + "Y": ["conv_output2"]}, + outputs={"Out": ["elementwise_output"]}, + attrs={'axis': axis}) + model_net = [relu_op, conv2d_op1, conv2d_op2, elt_op] + + program_config = ProgramConfig( + ops=model_net, + weights={ + "conv_weight1": TensorConfig(data_gen=partial(generate_weight)), + "conv_weight2": TensorConfig(data_gen=partial(generate_weight)) + }, + inputs={ + "input_data": TensorConfig(data_gen=partial(generate_input)) + }, + outputs=["elementwise_output"]) + + return program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_mkldnn=True) + yield config, ["relu", "conv2d", "conv2d"], (1e-5, 
1e-5) + + def test(self): + self.run_and_statis( + quant=False, passes=["conv_elementwise_add_mkldnn_fuse_pass"]) + + +''' +class TestConvElementwiseAddMkldnnFusePass(PassAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + if "elementwise_weight" in program_config.weights: + if program_config.weights["elementwise_weight"].shape[0] == program_config.inputs["input_data1"].shape[1]: + if attrs[2]['axis'] != 1: + return False + if program_config.weights["elementwise_weight"].shape[0] == program_config.inputs["input_data1"].shape[3]: + if attrs[2]['axis'] != -1: + return False return True def sample_program_config(self, draw): @@ -101,7 +204,7 @@ class TestConvElementwiseAddMkldnnFusePass(PassAutoScanTest): "strides": strides }) - if axis == -1 or axis == 0: + if axis == 0: elt_op = OpConfig( type="elementwise_add", inputs={"X": ["input_data1"], @@ -118,14 +221,12 @@ class TestConvElementwiseAddMkldnnFusePass(PassAutoScanTest): model_net = [relu_op, conv2d_op, elt_op] - if axis == 1: + if axis == 0: program_config = ProgramConfig( ops=model_net, weights={ "conv_weight": - TensorConfig(data_gen=partial(generate_weight1)), - "elementwise_weight": - TensorConfig(data_gen=partial(generate_weight2)) + TensorConfig(data_gen=partial(generate_weight1)) }, inputs={ "input_data1": @@ -137,7 +238,9 @@ class TestConvElementwiseAddMkldnnFusePass(PassAutoScanTest): ops=model_net, weights={ "conv_weight": - TensorConfig(data_gen=partial(generate_weight1)) + TensorConfig(data_gen=partial(generate_weight1)), + "elementwise_weight": + TensorConfig(data_gen=partial(generate_weight2)) }, inputs={ "input_data1": @@ -154,7 +257,7 @@ class TestConvElementwiseAddMkldnnFusePass(PassAutoScanTest): def test(self): self.run_and_statis( quant=False, passes=["conv_elementwise_add_mkldnn_fuse_pass"]) - +''' if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py index 33df428388882f2e536ecebb15a1f5dae6a6afc5..81bb182802ede6a2b78ffc44345cdcf382d344af 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py @@ -19,6 +19,7 @@ import paddle.inference as paddle_infer from functools import partial from typing import Optional, List, Callable, Dict, Any, Set import unittest +import paddle import hypothesis from hypothesis import given, settings, seed, example, assume @@ -104,4 +105,5 @@ class TestConvGeluMkldnnFusePass(PassAutoScanTest): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..893bd3833430c1059d5251ae6039c274112cbddb --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py @@ -0,0 +1,328 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle +import paddle.fluid as fluid +from paddle.fluid.core import PassVersionChecker + + +class ElementwiseActivationMkldnnFusePassTest(InferencePassTest): + act_alpha = None + act_beta = None + pass_name = 'elt_act_mkldnn_fuse_pass' + + def setUp(self): + self.set_params() + with fluid.program_guard(self.main_program, self.startup_program): + data_A = fluid.data( + name="data_A", shape=[-1, 3, 100, 100], dtype="float32") + data_B = fluid.data( + name="data_B", shape=[-1, 3, 100, 100], dtype="float32") + elt_out = self.operand(data_A, data_B) + if self.act is not None: + if self.act_beta is not None: + elt_out = self.act(elt_out, self.act_alpha, self.act_beta) + elif self.act_alpha is not None: + elt_out = self.act(elt_out, self.act_alpha) + else: + elt_out = self.act(elt_out) + + self.feeds = { + "data_A": np.random.random((1, 3, 100, 100)).astype("float32"), + "data_B": np.random.random((1, 3, 100, 100)).astype("float32") + } + self.fetch_list = [elt_out] + self.enable_mkldnn = True + + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act = None + + def test_check_output(self): + use_gpu = False + self.check_output_with_option(use_gpu) + + def test_pass_compatible(self): + self.assertTrue(PassVersionChecker.IsCompatible(self.pass_name)) + + +class ElementwiseActivationMkldnnFusePassTest_Add_Relu( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act = fluid.layers.relu + + +class ElementwiseActivationMkldnnFusePassTest_Add_Tanh( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act = fluid.layers.tanh + + +class ElementwiseActivationMkldnnFusePassTest_Add_LeakyRelu( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act_alpha = 0.2 + self.act = fluid.layers.leaky_relu + + +class ElementwiseActivationMkldnnFusePassTest_Add_Swish( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act_alpha = 4 + self.act = fluid.layers.swish + + +class ElementwiseActivationMkldnnFusePassTest_Add_HardSwish( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act = fluid.layers.hard_swish + + +class ElementwiseActivationMkldnnFusePassTest_Add_SQRT( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act = fluid.layers.sqrt + + +class ElementwiseActivationMkldnnFusePassTest_Add_ABS( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act = fluid.layers.abs + + +class ElementwiseActivationMkldnnFusePassTest_Add_Clip( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act = 
fluid.layers.clip + self.act_alpha = 0.0 + self.act_beta = 10.0 + + +class ElementwiseActivationMkldnnFusePassTest_Add_Gelu( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act = fluid.layers.gelu + + +class ElementwiseActivationMkldnnFusePassTest_Add_Gelu_Tanh( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act = fluid.layers.gelu + self.act_alpha = True + + +class ElementwiseActivationMkldnnFusePassTest_Add_Relu6( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act = fluid.layers.relu6 + self.act_alpha = 5.0 + + +class ElementwiseActivationMkldnnFusePassTest_Add_Sigmoid( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act = fluid.layers.sigmoid + + +class ElementwiseActivationMkldnnFusePassTest_Sub_Relu( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_sub + self.act = fluid.layers.relu + + +class ElementwiseActivationMkldnnFusePassTest_Sub_Tanh( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_sub + self.act = fluid.layers.tanh + + +class ElementwiseActivationMkldnnFusePassTest_Sub_LeakyRelu( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_sub + self.act_alpha = 0.2 + self.act = fluid.layers.leaky_relu + + +class ElementwiseActivationMkldnnFusePassTest_Sub_Swish( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_sub + self.act = fluid.layers.swish + + +class ElementwiseActivationMkldnnFusePassTest_Sub_HardSwish( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_sub + self.act = fluid.layers.hard_swish + + +class ElementwiseActivationMkldnnFusePassTest_Sub_ABS( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_sub + self.act = fluid.layers.abs + + +class ElementwiseActivationMkldnnFusePassTest_Sub_Clip( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_sub + self.act = fluid.layers.clip + self.act_alpha = 0.0 + self.act_beta = 10.0 + + +class ElementwiseActivationMkldnnFusePassTest_Sub_Gelu( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_sub + self.act = fluid.layers.gelu + + +class ElementwiseActivationMkldnnFusePassTest_Sub_Gelu_Tanh( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_sub + self.act = fluid.layers.gelu + self.act_alpha = True + + +class ElementwiseActivationMkldnnFusePassTest_Sub_Relu6( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_sub + self.act = fluid.layers.relu6 + self.act_alpha = 5.0 + + +class ElementwiseActivationMkldnnFusePassTest_Sub_Sigmoid( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_sub + self.act = fluid.layers.sigmoid + + +class ElementwiseActivationMkldnnFusePassTest_Mul_Relu( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act = 
fluid.layers.relu + + +class ElementwiseActivationMkldnnFusePassTest_Mul_Tanh( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act = fluid.layers.tanh + + +class ElementwiseActivationMkldnnFusePassTest_Mul_LeakyRelu( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act_alpha = 0.2 + self.act = fluid.layers.leaky_relu + + +class ElementwiseActivationMkldnnFusePassTest_Mul_Swish( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act = fluid.layers.swish + + +class ElementwiseActivationMkldnnFusePassTest_Mul_HardSwish( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act = fluid.layers.hard_swish + + +class ElementwiseActivationMkldnnFusePassTest_Mul_SQRT( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act = fluid.layers.sqrt + + +class ElementwiseActivationMkldnnFusePassTest_Mul_ABS( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act = fluid.layers.abs + + +class ElementwiseActivationMkldnnFusePassTest_Mul_Clip( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act = fluid.layers.clip + self.act_alpha = 0.0 + self.act_beta = 10.0 + + +class ElementwiseActivationMkldnnFusePassTest_Mul_Gelu( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act = fluid.layers.gelu + + +class ElementwiseActivationMkldnnFusePassTest_Mul_Gelu_Tanh( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act = fluid.layers.gelu + self.act_alpha = True + + +class ElementwiseActivationMkldnnFusePassTest_Mul_Relu6( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act = fluid.layers.relu6 + self.act_alpha = 5.0 + + +class ElementwiseActivationMkldnnFusePassTest_Mul_Sigmoid( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act = fluid.layers.sigmoid + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass_new.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass_new.py new file mode 100644 index 0000000000000000000000000000000000000000..0f5279b0edadd61467c3bebe55dfb83aea909267 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass_new.py @@ -0,0 +1,82 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from auto_scan_test import PassAutoScanTest +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +import unittest + +import hypothesis +from hypothesis import given, settings, seed, example, assume +import hypothesis.strategies as st + + +class TestElementWiseAddReluFusePass(PassAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_config(self, draw): + batch_size = draw(st.integers(min_value=1, max_value=4)) + + def generate_input(): + return np.random.random( + [batch_size, 3, 100, 100]).astype(np.float32) + + ops_config = [{ + "op_type": "elementwise_add", + "op_inputs": { + "X": ["A"], + "Y": ["B"] + }, + "op_outputs": { + "Out": ["add_output"] + }, + "op_attrs": {} + }, { + "op_type": "relu", + "op_inputs": { + "X": ["add_output"] + }, + "op_outputs": { + "Out": ["relu_output"] + }, + "op_attrs": {} + }] + + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "A": TensorConfig(data_gen=partial(generate_input)), + "B": TensorConfig(data_gen=partial(generate_input)) + }, + outputs=["relu_output"]) + + return program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_mkldnn=True) + yield config, ["elementwise_add"], (1e-5, 1e-5) + + def test(self): + self.run_and_statis( + quant=False, passes=["elt_act_mkldnn_fuse_pass"], min_success_num=4) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_amp_check_finite_and_scale_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_amp_check_finite_and_scale_op_mlu.py new file mode 100644 index 0000000000000000000000000000000000000000..57fa56acd687582fa67c1592a7d5c505ca6cce06 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_amp_check_finite_and_scale_op_mlu.py @@ -0,0 +1,145 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
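The new MLU test below exercises check_finite_and_unscale: its fixtures expect Out = X / Scale and FoundInfinite = 1 whenever any input tensor contains a nan or inf (in which case Out is excluded from the check via no_check_set). For orientation, a minimal NumPy sketch of that expected behaviour, assuming only what the fixtures encode; the helper name is illustrative and not a Paddle API.

import numpy as np


def ref_check_finite_and_unscale(xs, scale):
    # FoundInfinite is raised if any input holds a nan or inf; otherwise
    # every input is unscaled by dividing through scale, mirroring the
    # outputs the test cases below construct.
    found_infinite = any(not np.all(np.isfinite(x)) for x in xs)
    outs = [x / scale for x in xs]
    return outs, np.array([int(found_infinite)])


x0 = np.random.random((129, 129)).astype(np.float32)
scale = np.random.random((1, )).astype(np.float32)
outs, found = ref_check_finite_and_unscale([x0], scale)
assert found[0] == 0 and np.allclose(outs[0], x0 / scale)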
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle + +paddle.enable_static() +SEED = 2022 + + +class TestCheckFiniteAndUnscaleOp(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "check_finite_and_unscale" + self.init_dtype() + self.init_test_case() + + def init_test_case(self): + x = np.random.random((129, 129)).astype(self.dtype) + scale = np.random.random((1)).astype(self.dtype) + + self.inputs = {'X': [('x0', x)], 'Scale': scale} + self.outputs = { + 'FoundInfinite': np.array([0]), + 'Out': [('out0', x / scale)], + } + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestCheckFiniteAndUnscaleOpWithNan(TestCheckFiniteAndUnscaleOp): + def init_test_case(self): + x = np.random.random((129, 129)).astype(self.dtype) + x[128][128] = np.nan + scale = np.random.random((1)).astype(self.dtype) + + self.inputs = {'X': [('x0', x)], 'Scale': scale} + self.outputs = { + 'FoundInfinite': np.array([1]), + 'Out': [('out0', x)], + } + + def test_check_output(self): + # When input contains nan, do not check the output, + # since the output may be nondeterministic and will be discarded. + self.check_output_with_place(self.place, no_check_set=['Out']) + + +class TestCheckFiniteAndUnscaleOpWithInf(TestCheckFiniteAndUnscaleOp): + def init_test_case(self): + x = np.random.random((129, 129)).astype(self.dtype) + x[128][128] = np.inf + scale = np.random.random((1)).astype(self.dtype) + + self.inputs = {'X': [('x0', x)], 'Scale': scale} + self.outputs = { + 'FoundInfinite': np.array([1]), + 'Out': [('out0', x)], + } + + def test_check_output(self): + # When input contains inf, do not check the output, + # since the output may be nondeterministic and will be discarded. + self.check_output_with_place(self.place, no_check_set=['Out']) + + +class TestCheckFiniteAndUnscaleOpMultiInput(TestCheckFiniteAndUnscaleOp): + def init_test_case(self): + x0 = np.random.random((129, 129)).astype(self.dtype) + x1 = np.random.random((129, 129)).astype(self.dtype) + scale = np.random.random((1)).astype(self.dtype) + + self.inputs = {'X': [('x0', x0), ('x1', x1)], 'Scale': scale} + self.outputs = { + 'FoundInfinite': np.array([0]), + 'Out': [('out0', x0 / scale), ('out1', x1 / scale)], + } + + +class TestCheckFiniteAndUnscaleOpMultiInputWithNan(TestCheckFiniteAndUnscaleOp): + def init_test_case(self): + x0 = np.random.random((129, 129)).astype(self.dtype) + x0[128][128] = np.nan + x1 = np.random.random((129, 129)).astype(self.dtype) + scale = np.random.random((1)).astype(self.dtype) + + self.inputs = {'X': [('x0', x0), ('x1', x1)], 'Scale': scale} + self.outputs = { + 'FoundInfinite': np.array([1]), + 'Out': [('out0', x0 / scale), ('out1', x1 / scale)], + } + + def test_check_output(self): + # When input contains inf, do not check the output, + # since the output may be nondeterministic and will be discarded. 
+ self.check_output_with_place(self.place, no_check_set=['Out']) + + +class TestCheckFiniteAndUnscaleOpMultiInputWithInf(TestCheckFiniteAndUnscaleOp): + def init_test_case(self): + x0 = np.random.random((129, 129)).astype(self.dtype) + x0[128][128] = np.nan + x1 = np.random.random((129, 129)).astype(self.dtype) + x1[128][128] = np.inf + scale = np.random.random((1)).astype(self.dtype) + + self.inputs = {'X': [('x0', x0), ('x1', x1)], 'Scale': scale} + self.outputs = { + 'FoundInfinite': np.array([1]), + 'Out': [('out0', x0 / scale), ('out1', x1 / scale)], + } + + def test_check_output(self): + # When input contains inf, do not check the output, + # since the output may be nondeterministic and will be discarded. + self.check_output_with_place(self.place, no_check_set=['Out']) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_merged_momentum_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_merged_momentum_op_mlu.py new file mode 100644 index 0000000000000000000000000000000000000000..f3699da15b5356d2bf25341261be0c237e037ce5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_merged_momentum_op_mlu.py @@ -0,0 +1,373 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
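The new MLU test below builds the same update twice, once as a single merged_momentum op over whole parameter lists and once as a per-parameter momentum op, then asserts the fetched params and velocities agree. For orientation, a minimal NumPy sketch of the plain momentum rule the two paths are compared on (textbook heavy-ball update, no Nesterov or weight decay); this is an assumption stated for illustration, not Paddle's exact kernel.

import numpy as np


def momentum_step(param, grad, velocity, lr, mu=0.9, rescale_grad=1.0):
    # Accumulate the (rescaled) gradient into the velocity, then step the
    # parameter along the velocity.
    grad = grad * rescale_grad
    velocity_out = mu * velocity + grad
    param_out = param - lr * velocity_out
    return param_out, velocity_out


def merged_momentum_step(params, grads, velocitys, lr, mu=0.9):
    # "Merged" means one op call covering every (param, grad, velocity)
    # triple; numerically it is the same per-tensor update as above.
    outs = [momentum_step(p, g, v, lr, mu)
            for p, g, v in zip(params, grads, velocitys)]
    return [o[0] for o in outs], [o[1] for o in outs]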
+ +import sys +sys.path.append('..') +import unittest +import paddle +import numpy as np +from paddle.fluid.layer_helper import LayerHelper +from collections import OrderedDict + + +def run_momentum_op(params, + grads, + velocitys, + master_params, + learning_rate, + place, + multi_precision, + mu=0.9, + rescale_grad=0.01, + use_merged=False): + assert len(params) == len(grads) + assert len(params) == len(velocitys) + if multi_precision: + assert len(params) == len(master_params) + op_type = 'merged_momentum' if use_merged else 'momentum' + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + helper = LayerHelper(op_type, **locals()) + attrs = { + 'mu': mu, + 'multi_precision': multi_precision, + 'rescale_grad': rescale_grad, + } + + param_vars = [ + helper.create_variable( + persistable=True, shape=p.shape, dtype=p.dtype) for p in params + ] + grad_vars = [ + helper.create_variable( + shape=g.shape, dtype=g.dtype) for g in grads + ] + velocity_vars = [ + helper.create_variable( + persistable=True, shape=v.shape, dtype=v.dtype) + for v in velocitys + ] + lr_var = helper.create_variable( + persistable=True, + shape=learning_rate.shape, + dtype=learning_rate.dtype) + + feed_dict = OrderedDict() + + feed_dict.update( + OrderedDict([(p_var.name, p_val) + for p_var, p_val in zip(param_vars, params)])) + feed_dict.update( + OrderedDict([(v_var.name, v_val) + for v_var, v_val in zip(velocity_vars, velocitys)])) + fetch_list = list(feed_dict.keys()) + + feed_dict.update( + OrderedDict([(g_var.name, g_val) + for g_var, g_val in zip(grad_vars, grads)])) + feed_dict.update({lr_var.name: learning_rate}) + + if multi_precision: + master_param_vars = [ + helper.create_variable( + persistable=True, shape=p.shape, dtype=p.dtype) + for p in master_params + ] + feed_dict.update( + OrderedDict([(mp_var.name, mp_val) + for mp_var, mp_val in zip(master_param_vars, + master_params)])) + # CPUPlace does not use MasterParam + if isinstance(place, paddle.CUDAPlace): + fetch_list = fetch_list + [ + mp_var.name for mp_var in master_param_vars + ] + else: + master_param_vars = None + + if not use_merged: + for i, (p, g, + v) in enumerate(zip(param_vars, grad_vars, velocity_vars)): + inputs = { + 'Param': p, + 'Grad': g, + 'Velocity': v, + 'LearningRate': lr_var, + } + outputs = {'ParamOut': p, 'VelocityOut': v} + if multi_precision: + inputs['MasterParam'] = master_param_vars[i] + outputs['MasterParamOut'] = master_param_vars[i] + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) + else: + inputs = { + 'Param': param_vars, + 'Grad': grad_vars, + 'Velocity': velocity_vars, + 'LearningRate': lr_var, + } + outputs = {'ParamOut': param_vars, 'VelocityOut': velocity_vars} + if multi_precision: + inputs['MasterParam'] = master_param_vars + outputs['MasterParamOut'] = master_param_vars + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) + + exe = paddle.static.Executor(place) + with paddle.static.scope_guard(paddle.static.Scope()): + exe.run(startup) + return exe.run(main, feed=feed_dict, fetch_list=fetch_list) + + +def run_momentum_op2(params, + grads, + velocitys, + master_params, + learning_rate, + place, + multi_precision, + mu=0.9, + rescale_grad=0.01, + use_merged=False, + use_nesterov=True): + assert len(params) == len(grads) + assert len(params) == len(velocitys) + if multi_precision: + assert len(params) == len(master_params) + op_type = 'merged_momentum' if use_merged else 'momentum' + main 
= paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + helper = LayerHelper(op_type, **locals()) + + param_vars = [ + helper.create_variable( + persistable=True, shape=p.shape, dtype=p.dtype) for p in params + ] + grad_vars = [ + helper.create_variable( + shape=g.shape, dtype=g.dtype) for g in grads + ] + velocity_vars = [ + helper.create_variable( + persistable=True, shape=v.shape, dtype=v.dtype) + for v in velocitys + ] + lr_var = helper.create_variable( + persistable=True, + shape=learning_rate.shape, + dtype=learning_rate.dtype) + + feed_dict = OrderedDict() + + feed_dict.update( + OrderedDict([(p_var.name, p_val) + for p_var, p_val in zip(param_vars, params)])) + feed_dict.update( + OrderedDict([(v_var.name, v_val) + for v_var, v_val in zip(velocity_vars, velocitys)])) + fetch_list = list(feed_dict.keys()) + + feed_dict.update( + OrderedDict([(g_var.name, g_val) + for g_var, g_val in zip(grad_vars, grads)])) + feed_dict.update({lr_var.name: learning_rate}) + + if multi_precision: + master_param_vars = [ + helper.create_variable( + persistable=True, shape=p.shape, dtype=p.dtype) + for p in master_params + ] + feed_dict.update( + OrderedDict([(mp_var.name, mp_val) + for mp_var, mp_val in zip(master_param_vars, + master_params)])) + # CPUPlace does not use MasterParam + if isinstance(place, paddle.CUDAPlace): + fetch_list = fetch_list + [ + mp_var.name for mp_var in master_param_vars + ] + else: + master_param_vars = None + + if not use_merged: + for i, (p, g, + v) in enumerate(zip(param_vars, grad_vars, velocity_vars)): + inputs = { + 'Param': p, + 'Grad': g, + 'Velocity': v, + 'LearningRate': lr_var, + } + outputs = {'ParamOut': p, 'VelocityOut': v} + if multi_precision: + inputs['MasterParam'] = master_param_vars[i] + outputs['MasterParamOut'] = master_param_vars[i] + attrs = { + 'mu': mu, + 'multi_precision': multi_precision, + 'rescale_grad': rescale_grad, + 'use_nesterov': use_nesterov, + 'regularization_method': 'l2_decay', + 'regularization_coeff': 2.0, + } + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) + else: + inputs = { + 'Param': param_vars, + 'Grad': grad_vars, + 'Velocity': velocity_vars, + 'LearningRate': lr_var, + } + outputs = {'ParamOut': param_vars, 'VelocityOut': velocity_vars} + if multi_precision: + inputs['MasterParam'] = master_param_vars + outputs['MasterParamOut'] = master_param_vars + attrs = { + 'mu': mu, + 'multi_precision': multi_precision, + 'rescale_grad': rescale_grad, + 'use_nesterov': use_nesterov, + 'regularization_method': + ['l2_decay' for i in range(len(param_vars))], + 'regularization_coeff': [2.0 for i in range(len(param_vars))], + } + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) + + exe = paddle.static.Executor(place) + with paddle.static.scope_guard(paddle.static.Scope()): + exe.run(startup) + return exe.run(main, feed=feed_dict, fetch_list=fetch_list) + + +class TestMergedMomentum(unittest.TestCase): + def setUp(self): + paddle.enable_static() + self.shapes = [[3, 4], [2, 7], [5, 6], [7, 8]] + self.seed = 10 + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def gen_rand_data(self, shapes, dtype): + return [np.random.random(s).astype(dtype) for s in shapes] + + def prepare_data(self, shapes, multi_precision, seed, place): + np.random.seed(seed) + mp_dtype = np.float32 + dtype = np.float32 + params = self.gen_rand_data(shapes, dtype) + grads = self.gen_rand_data(shapes, dtype) + velocitys = 
self.gen_rand_data(shapes, mp_dtype) + learning_rate = self.gen_rand_data([[1]], mp_dtype)[0] + if multi_precision: + master_params = [p.astype(mp_dtype) for p in params] + else: + master_params = None + return params, grads, velocitys, master_params, learning_rate + + def check_with_place(self, place, multi_precision): + params, grads, velocitys, master_params, learning_rate = self.prepare_data( + self.shapes, multi_precision, self.seed, place) + + def run_op(use_merged): + # MLU Momentum Op does not support rescale_grad + rescale_grad = 1.0 + return run_momentum_op( + params, + grads, + velocitys, + master_params, + learning_rate, + place, + multi_precision, + rescale_grad=rescale_grad, + use_merged=use_merged) + + outs1 = run_op(True) + outs2 = run_op(False) + self.assertEqual(len(outs1), len(outs2)) + for i, (out1, out2) in enumerate(zip(outs1, outs2)): + self.assertTrue(np.allclose(out1, out2, atol=1e-7)) + + def test_main(self): + self.check_with_place(self.place, multi_precision=False) + + +class TestMergedMomentum2(unittest.TestCase): + def setUp(self): + paddle.enable_static() + self.shapes = [[3, 4], [2, 7], [5, 6], [7, 8]] + self.seed = 10 + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def gen_rand_data(self, shapes, dtype): + return [np.random.random(s).astype(dtype) for s in shapes] + + def prepare_data(self, shapes, multi_precision, seed, place): + np.random.seed(seed) + mp_dtype = np.float32 + dtype = np.float32 # np.float16 + params = self.gen_rand_data(shapes, dtype) + grads = self.gen_rand_data(shapes, dtype) + velocitys = self.gen_rand_data(shapes, mp_dtype) + learning_rate = self.gen_rand_data([[1]], mp_dtype)[0] + if multi_precision: + master_params = [p.astype(mp_dtype) for p in params] + else: + master_params = None + return params, grads, velocitys, master_params, learning_rate + + def check_with_place(self, place, multi_precision): + params, grads, velocitys, master_params, learning_rate = self.prepare_data( + self.shapes, multi_precision, self.seed, place) + + def run_op(use_nesterov, use_merged): + # MLU Momentum Op does not support rescale_grad + rescale_grad = 1.0 + return run_momentum_op2( + params, + grads, + velocitys, + master_params, + learning_rate, + place, + multi_precision, + rescale_grad=rescale_grad, + use_merged=use_merged, + use_nesterov=use_nesterov) + + outs1 = run_op(use_nesterov=True, use_merged=True) + outs2 = run_op(use_nesterov=True, use_merged=False) + self.assertEqual(len(outs1), len(outs2)) + for i, (out1, out2) in enumerate(zip(outs1, outs2)): + self.assertTrue(np.allclose(out1, out2, atol=1e-7)) + + outs3 = run_op(use_nesterov=False, use_merged=True) + outs4 = run_op(use_nesterov=False, use_merged=False) + self.assertEqual(len(outs3), len(outs4)) + for j, (out3, out4) in enumerate(zip(outs3, outs4)): + self.assertTrue(np.allclose(out3, out4, atol=1e-7)) + + def test_main(self): + self.check_with_place(self.place, multi_precision=False) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt index 8e31d58195be8b17243fd5203fd8ced17c11f183..e9d9af5c11366c258d1fdab34b1e9ea345b0bfad 100644 --- a/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt @@ -22,4 +22,5 @@ if (WITH_ASCEND_CL) set_tests_properties(test_conv2d_transpose_op_npu PROPERTIES TIMEOUT 200) set_tests_properties(test_conv2d_op_npu PROPERTIES TIMEOUT 300) 
set_tests_properties(test_matmulv2_op_npu PROPERTIES TIMEOUT 300) + set_tests_properties(test_elementwise_add_op_npu PROPERTIES TIMEOUT 200) endif() diff --git a/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py index 877f9904f3407c8e600995c2cf65cf849d49cdd5..e01b2b691a28aa788836a7f0d66fb2723fc1b364 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py @@ -144,6 +144,7 @@ class TestBatchNormOpTraining(unittest.TestCase): def setUp(self): self.set_npu() + self.init_dtype() self.use_mkldnn = False self.fuse_with_relu = False self.data_formats = ["NCHW", "NHWC"] @@ -153,6 +154,9 @@ class TestBatchNormOpTraining(unittest.TestCase): self.init_kernel_type() self.init_test_case() + def init_dtype(self): + self.dtype = np.float32 + def init_test_case(self): self.use_global_stats = False self.no_grad_set = set() @@ -210,11 +214,16 @@ class TestBatchNormOpTraining(unittest.TestCase): scale_shape = [c] np.random.seed(123) - x = np.random.random_sample(shape).astype(np.float32) + x = np.random.random_sample(shape).astype(self.dtype) scale = np.random.random_sample(scale_shape).astype(np.float32) bias = np.random.random_sample(scale_shape).astype(np.float32) mean, variance = self.set_mean_variance(scale_shape, x, data_layout) - y_grad = np.random.random_sample(shape).astype(np.float32) + + if self.dtype == np.float16: + mean = mean.astype(np.float32) + variance = variance.astype(np.float32) + + y_grad = np.random.random_sample(shape).astype(self.dtype) momentum_var = np.array([momentum]).astype(np.float32) y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad = self.ref_forward_backward( @@ -275,7 +284,7 @@ class TestBatchNormOpTraining(unittest.TestCase): inputs=inputs, outputs=outputs, attrs=attrs) - block.create_var(name='y@GRAD', dtype='float32', shape=y.shape) + block.create_var(name='y@GRAD', dtype=self.dtype, shape=y.shape) # generate backward op_desc grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc( @@ -320,6 +329,11 @@ class TestBatchNormOpTraining(unittest.TestCase): pass +class TestFP16BatchNormOpTraining(TestBatchNormOpTraining): + def init_dtype(self): + self.dtype = np.float16 + + class TestBatchNormOpTrainingCase1(TestBatchNormOpTraining): def init_test_case(self): self.use_global_stats = False diff --git a/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_depthwise_conv_npu.py b/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_depthwise_conv_npu.py index 012a6e59e775f8ab2d27c23c779571022d6c194f..2e15a1eac2b4b891712fb5889a8974a04c5766c0 100755 --- a/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_depthwise_conv_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_depthwise_conv_npu.py @@ -132,36 +132,50 @@ class TestDepthwiseConvNPU(OpTest): self.check_output_with_place(self.place, atol=1e-2) def test_check_grad(self): - if self.dtype == np.float16: - return if self.dilations[0] == 1 and self.dilations[1] == 1: - self.check_grad_with_place( - self.place, {'Input', 'Filter'}, - 'Output', - max_relative_error=0.03, - numeric_place=paddle.CPUPlace()) + if self.dtype == np.float16: + self.check_grad_with_place( + self.place, {'Input', 'Filter'}, + 'Output', + max_relative_error=0.9) + else: + self.check_grad_with_place( + self.place, {'Input', 'Filter'}, + 'Output', + max_relative_error=0.03, + numeric_place=paddle.CPUPlace()) def 
test_check_grad_no_filter(self): if self.dtype == np.float16: - return - self.check_grad_with_place( - self.place, ['Input'], - 'Output', - no_grad_set=set(['Filter']), - max_relative_error=0.03, - numeric_place=paddle.CPUPlace()) - - def test_check_grad_no_input(self): - if self.dtype == np.float16: - return - if self.dilations[0] == 1 and self.dilations[1] == 1: self.check_grad_with_place( - self.place, ['Filter'], + self.place, ['Input'], 'Output', - no_grad_set=set(['Input']), + no_grad_set=set(['Filter']), + max_relative_error=0.9) + else: + self.check_grad_with_place( + self.place, ['Input'], + 'Output', + no_grad_set=set(['Filter']), max_relative_error=0.03, numeric_place=paddle.CPUPlace()) + def test_check_grad_no_input(self): + if self.dilations[0] == 1 and self.dilations[1] == 1: + if self.dtype == np.float16: + self.check_grad_with_place( + self.place, ['Filter'], + 'Output', + no_grad_set=set(['Input']), + max_relative_error=0.9) + else: + self.check_grad_with_place( + self.place, ['Filter'], + 'Output', + no_grad_set=set(['Input']), + max_relative_error=0.03, + numeric_place=paddle.CPUPlace()) + def init_data_format(self): self.data_format = "NCHW" @@ -267,32 +281,46 @@ class TestDepthwiseConvNPU_Padding(OpTest): def test_check_grad(self): if self.dtype == np.float16: - return - self.check_grad_with_place( - self.place, {'Input', 'Filter'}, - 'Output', - max_relative_error=0.03, - numeric_place=paddle.CPUPlace()) + self.check_grad_with_place( + self.place, {'Input', 'Filter'}, + 'Output', + max_relative_error=1.2) + else: + self.check_grad_with_place( + self.place, {'Input', 'Filter'}, + 'Output', + max_relative_error=0.03, + numeric_place=paddle.CPUPlace()) def test_check_grad_no_filter(self): if self.dtype == np.float16: - return - self.check_grad_with_place( - self.place, ['Input'], - 'Output', - max_relative_error=0.03, - no_grad_set=set(['Filter']), - numeric_place=paddle.CPUPlace()) + self.check_grad_with_place( + self.place, ['Input'], + 'Output', + max_relative_error=0.7, + no_grad_set=set(['Filter'])) + else: + self.check_grad_with_place( + self.place, ['Input'], + 'Output', + max_relative_error=0.03, + no_grad_set=set(['Filter']), + numeric_place=paddle.CPUPlace()) def test_check_grad_no_input(self): if self.dtype == np.float16: - return - self.check_grad_with_place( - self.place, ['Filter'], - 'Output', - max_relative_error=0.03, - no_grad_set=set(['Input']), - numeric_place=paddle.CPUPlace()) + self.check_grad_with_place( + self.place, ['Filter'], + 'Output', + max_relative_error=0.8, + no_grad_set=set(['Input'])) + else: + self.check_grad_with_place( + self.place, ['Filter'], + 'Output', + max_relative_error=0.03, + no_grad_set=set(['Input']), + numeric_place=paddle.CPUPlace()) def init_data_format(self): self.data_format = "NCHW" diff --git a/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_npu.py index d0dc86055a1635c8bac644570f17e158cb2adda3..4070d0267d95b6cec2d3a2cb9926f9b389b69c50 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_npu.py @@ -127,8 +127,6 @@ class TestConv2DOp(OpTest): self.check_output_with_place(fluid.NPUPlace(0), atol=1e-2) def test_check_grad(self): - if self.dtype == np.float16: - return self.check_grad_with_place( fluid.NPUPlace(0), {'Input', 'Filter'}, 'Output', @@ -136,8 +134,6 @@ class TestConv2DOp(OpTest): numeric_place=paddle.CPUPlace()) def test_check_grad_no_filter(self): - if 
self.dtype == np.float16: - return self.check_grad_with_place( fluid.NPUPlace(0), ['Input'], 'Output', @@ -146,8 +142,6 @@ class TestConv2DOp(OpTest): numeric_place=paddle.CPUPlace()) def test_check_grad_no_input(self): - if self.dtype == np.float16: - return self.check_grad_with_place( fluid.NPUPlace(0), ['Filter'], 'Output', @@ -276,10 +270,13 @@ class TestConv2DOp_v2(OpTest): def set_npu(self): self.__class__.use_npu = True + def init_dtype(self): + self.dtype = np.float32 + def setUp(self): self.set_npu() self.op_type = "conv2d" - self.dtype = np.float32 + self.init_dtype() self.init_kernel_type() self.init_group() self.init_dilation() @@ -320,31 +317,45 @@ class TestConv2DOp_v2(OpTest): def test_check_grad(self): if self.dtype == np.float16: - return - self.check_grad_with_place( - paddle.NPUPlace(0), {'Input', 'Filter'}, - 'Output', - max_relative_error=0.02, - numeric_place=paddle.CPUPlace()) + self.check_grad_with_place( + paddle.NPUPlace(0), {'Input', 'Filter'}, + 'Output', + max_relative_error=1.1) + else: + self.check_grad_with_place( + paddle.NPUPlace(0), {'Input', 'Filter'}, + 'Output', + max_relative_error=0.02, + numeric_place=paddle.CPUPlace()) def test_check_grad_no_filter(self): if self.dtype == np.float16: - return - self.check_grad_with_place( - paddle.NPUPlace(0), ['Input'], - 'Output', - max_relative_error=0.02, - no_grad_set=set(['Filter']), - numeric_place=paddle.CPUPlace()) + self.check_grad_with_place( + paddle.NPUPlace(0), ['Input'], + 'Output', + max_relative_error=0.99, + no_grad_set=set(['Filter'])) + else: + self.check_grad_with_place( + paddle.NPUPlace(0), ['Input'], + 'Output', + max_relative_error=0.02, + no_grad_set=set(['Filter']), + numeric_place=paddle.CPUPlace()) def test_check_grad_no_input(self): if self.dtype == np.float16: - return - self.check_grad_with_place( - paddle.NPUPlace(0), ['Filter'], - 'Output', - no_grad_set=set(['Input']), - numeric_place=paddle.CPUPlace()) + self.check_grad_with_place( + paddle.NPUPlace(0), ['Filter'], + 'Output', + max_relative_error=0.99, + no_grad_set=set(['Input'])) + else: + self.check_grad_with_place( + paddle.NPUPlace(0), ['Filter'], + 'Output', + no_grad_set=set(['Input']), + numeric_place=paddle.CPUPlace()) def init_test_case(self): self.pad = [0, 0] diff --git a/python/paddle/fluid/tests/unittests/npu/test_cos_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_cos_op_npu.py index 9b29fc812faedde2aa28c9b597c6e8449bbd36b0..a4769442b083eb845daa9f7989c8621a3d475ef8 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_cos_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_cos_op_npu.py @@ -51,8 +51,6 @@ class TestCos(OpTest): self.check_output_with_place(self.place, atol=1e-7) def test_check_grad(self): - if self.dtype == np.float16: - return self.check_grad_with_place(self.place, ['X'], 'Out') diff --git a/python/paddle/fluid/tests/unittests/npu/test_dropout_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_dropout_op_npu.py index bd9022f56a3e77fa92c74637d5947869b201ac54..fea8502f2d7664b2717b42df9923171f880a1db2 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_dropout_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_dropout_op_npu.py @@ -56,8 +56,6 @@ class TestDropoutOp(OpTest): self.check_output_with_place(self.place) def test_check_grad_normal(self): - if self.dtype == np.float16: - return self.check_grad_with_place(self.place, ['X'], 'Out') diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py 
b/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py index 75c70e0a131ac996395427d9d3cdb7f2b7dd8ff7..f24c6c455a0cb306df4ea048641351c5309f5acd 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py @@ -65,36 +65,59 @@ class TestElementwiseAddOp(OpTest): self.check_output_with_place(self.place) def test_check_grad_normal(self): - if self.dtype == np.float16 or self.dtype == np.int64: + if self.dtype == np.int64: return - self.check_grad_with_place( - self.place, - ['X', 'Y'], - 'Out', - max_relative_error=0.006, ) + if self.dtype == np.float16: + self.check_grad_with_place( + self.place, + ['X', 'Y'], + 'Out', + max_relative_error=0.15, ) + else: + self.check_grad_with_place( + self.place, + ['X', 'Y'], + 'Out', + max_relative_error=0.006, ) def test_check_grad_ingore_x(self): - if self.dtype == np.float16 or self.dtype == np.int64: + if self.dtype == np.int64: return - self.check_grad_with_place( - self.place, - ['Y'], - 'Out', - no_grad_set=set("X"), - max_relative_error=0.006, ) + if self.dtype == np.float16: + self.check_grad_with_place( + self.place, + ['Y'], + 'Out', + no_grad_set=set("X"), + max_relative_error=0.92, ) + else: + self.check_grad_with_place( + self.place, + ['Y'], + 'Out', + no_grad_set=set("X"), + max_relative_error=0.006, ) def test_check_grad_ingore_y(self): - if self.dtype == np.float16 or self.dtype == np.int64: + if self.dtype == np.int64: return - self.check_grad_with_place( - self.place, - ['X'], - 'Out', - no_grad_set=set("Y"), - max_relative_error=0.006, ) + if self.dtype == np.float16: + self.check_grad_with_place( + self.place, + ['X'], + 'Out', + no_grad_set=set("Y"), + max_relative_error=0.8, ) + else: + self.check_grad_with_place( + self.place, + ['X'], + 'Out', + no_grad_set=set("Y"), + max_relative_error=0.006, ) class TestFP16ElementwiseAddOp(TestElementwiseAddOp): diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_max_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_max_op_npu.py index 461e15352e3837f90dfa290bf32dddc2ab26b6b8..cbfc07f35447939c9db7e216db0d3a1f530630fe 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_max_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_max_op_npu.py @@ -116,19 +116,13 @@ class TestElementwiseMaxOp(OpTest): self.check_output_with_place(self.place) def test_check_grad_normal(self): - if self.dtype == np.float16: - return self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') def test_check_grad_ingore_x(self): - if self.dtype == np.float16: - return self.check_grad_with_place( self.place, ['Y'], 'Out', no_grad_set=set("X")) def test_check_grad_ingore_y(self): - if self.dtype == np.float16: - return self.check_grad_with_place( self.place, ['X'], 'Out', no_grad_set=set("Y")) @@ -213,15 +207,11 @@ class TestElementwiseMaxOp_broadcast_2(TestElementwiseMaxOp): self.out = np.maximum(self.x, self.y.reshape(1, 1, 100)) def test_check_grad_normal(self): - if self.dtype == np.float16: - return dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis) self.check_grad_with_place( self.place, ['X', 'Y'], 'Out', user_defined_grads=[dx, dy]) def test_check_grad_ingore_x(self): - if self.dtype == np.float16: - return _, dy = ComputeGrad(self.x, self.y, self.out, self.axis) self.check_grad_with_place( self.place, ['Y'], @@ -230,8 +220,6 @@ class TestElementwiseMaxOp_broadcast_2(TestElementwiseMaxOp): user_defined_grads=[dy]) def 
test_check_grad_ingore_y(self): - if self.dtype == np.float16: - return dx, _ = ComputeGrad(self.x, self.y, self.out, self.axis) self.check_grad_with_place( self.place, ['X'], diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py index 51cf5cdaf6d1afb4a6aad64ddac4600b8d800358..e191224df81ee419e58b843dfd90b74c3fd113c1 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py @@ -64,32 +64,41 @@ class TestElementwiseMinOp(OpTest): def test_check_grad_normal(self): if self.dtype == np.float16: - return - - self.check_grad_with_place( - self.place, - ['X', 'Y'], - 'Out', ) + self.check_grad_with_place( + self.place, ['X', 'Y'], 'Out', max_relative_error=0.5) + else: + self.check_grad_with_place( + self.place, + ['X', 'Y'], + 'Out', ) def test_check_grad_ingore_x(self): if self.dtype == np.float16: - return - - self.check_grad_with_place( - self.place, - ['Y'], - 'Out', - no_grad_set=set("X"), ) + self.check_grad_with_place( + self.place, ['Y'], + 'Out', + no_grad_set=set("X"), + max_relative_error=0.9) + else: + self.check_grad_with_place( + self.place, + ['Y'], + 'Out', + no_grad_set=set("X"), ) def test_check_grad_ingore_y(self): if self.dtype == np.float16: - return - - self.check_grad_with_place( - self.place, - ['X'], - 'Out', - no_grad_set=set("Y"), ) + self.check_grad_with_place( + self.place, ['X'], + 'Out', + no_grad_set=set("Y"), + max_relative_error=0.1) + else: + self.check_grad_with_place( + self.place, + ['X'], + 'Out', + no_grad_set=set("Y"), ) class TestElementwiseMinOpFp16(TestElementwiseMinOp): diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_pow_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_pow_op_npu.py index ce645f317d054c264a730c150df42bccbfabbeee..907e149c8b2c3fb6093b279849a0aff48abfdb39 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_pow_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_pow_op_npu.py @@ -114,8 +114,6 @@ class TestElementwisePow(OpTest): self.out = np.power(self.x, self.y) def test_check_grad_normal(self): - if self.dtype == np.float16: - return dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis) self.check_grad_with_place( self.place, ['X', 'Y'], 'Out', user_defined_grads=[dx, dy]) @@ -184,8 +182,6 @@ class TestElementwisePowOp_broadcast_0(TestElementwisePow): self.out = np.power(self.x, self.y) def test_check_grad_normal(self): - if self.dtype == np.float16: - return dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis) self.check_grad_with_place( self.place, ['X', 'Y'], 'Out', user_defined_grads=[dx, dy]) @@ -218,8 +214,6 @@ class TestElementwisePowOp_broadcast_1(TestElementwisePow): self.out = np.power(self.x, self.y.reshape(1, 100, 1)) def test_check_grad_normal(self): - if self.dtype == np.float16: - return dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis) self.check_grad_with_place( self.place, ['X', 'Y'], 'Out', user_defined_grads=[dx, dy]) @@ -252,8 +246,6 @@ class TestElementwisePowOp_broadcast_2(TestElementwisePow): self.out = np.power(self.x, self.y.reshape(100, 1, 1)) def test_check_grad_normal(self): - if self.dtype == np.float16: - return dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis) self.check_grad_with_place( self.place, ['X', 'Y'], 'Out', user_defined_grads=[dx, dy]) diff --git 
a/python/paddle/fluid/tests/unittests/npu/test_exp_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_exp_op_npu.py index ccd5f0649d8dc68bb9cc8bb3e1736ced26c7cf7f..6be2fe0086b128851a79016fbeb2eaf705111199 100755 --- a/python/paddle/fluid/tests/unittests/npu/test_exp_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_exp_op_npu.py @@ -50,8 +50,6 @@ class TestExpNPUOP(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - if self.dtype == np.float16: - return self.check_grad_with_place(self.place, ['X'], 'Out') def init_dtype(self): diff --git a/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py index 89ac9e09aa3488c25000c7801f108e036f33934e..83b65630d801a40aebf59e0f8e464aae5827d84a 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py @@ -34,7 +34,7 @@ class TestExpand(OpTest): self.init_dtype() np.random.seed(SEED) - x = np.random.randn(3, 1, 7).astype(self.dtype) + x = np.random.randn(30, 1, 7).astype(self.dtype) out = np.tile(x, [1, 10, 1]) self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} @@ -50,12 +50,8 @@ class TestExpand(OpTest): def test_check_output(self): self.check_output_with_place(self.place) - # TODO(ascendrc): Add grad test - # def test_check_grad(self): - # if self.dtype == np.float16: - # return - # self.check_grad(['X'], 'Out') - # + def test_check_grad(self): + self.check_grad(['X'], 'Out') class TestExpandV2(TestExpand): @@ -66,7 +62,7 @@ class TestExpandV2(TestExpand): self.init_dtype() np.random.seed(SEED) - x = np.random.randn(3, 1, 7).astype(self.dtype) + x = np.random.randn(30, 1, 7).astype(self.dtype) out = np.tile(x, [1, 10, 1]) expand_times = np.array([1, 10, 1]).astype(np.int32) @@ -145,7 +141,7 @@ class TestExpand_expand_times_all_one(TestExpand): self.init_dtype() np.random.seed(SEED) - x = np.random.randn(3, 1, 7).astype(self.dtype) + x = np.random.randn(30, 1, 7).astype(self.dtype) out = np.tile(x, [1, 1, 1]) self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} diff --git a/python/paddle/fluid/tests/unittests/npu/test_hard_sigmoid_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_hard_sigmoid_op_npu.py index d7aafccc88cf8d9ffde9c0b4923239abe14c3cc9..f1d89cb8d561b2cb0b10e94d0d1f084cf8733ea1 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_hard_sigmoid_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_hard_sigmoid_op_npu.py @@ -59,9 +59,6 @@ class TestNPUHardSigmoid(OpTest): self.check_output_with_place(self.place, atol=1e-5) def test_check_grad(self): - if self.dtype == np.float16: - return - self.check_grad_with_place(self.place, ['X'], 'Out') def set_npu(self): diff --git a/python/paddle/fluid/tests/unittests/npu/test_hard_swish_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_hard_swish_op_npu.py index 32042ba83a9f7723a03f3865319dafc13e1ae649..9495cdb8a55aa9e4e62ad66117cc9b41308d5d76 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_hard_swish_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_hard_swish_op_npu.py @@ -66,8 +66,6 @@ class TestHardSwishNPU(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - if self.dtype == np.float16: - return # There is a problem that precision of grad result using float32 # can't satisfy the default precision requirement # when compared with numeric_grads, but the results on diff --git 
a/python/paddle/fluid/tests/unittests/npu/test_huber_loss_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_huber_loss_op_npu.py index 1c9f499d22db42bf89a40b64e8f05a131785956e..a9c195bb8cd29f2c278ee974601eca1ad7e0358d 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_huber_loss_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_huber_loss_op_npu.py @@ -81,13 +81,9 @@ class TestHuberLossOp(OpTest): self.check_output_with_place(self.place) def test_check_grad_normal(self): - if self.dtype == np.float16: - return self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') def test_check_grad_ingore_x(self): - if self.dtype == np.float16: - return self.check_grad_with_place( self.place, ['Y'], 'Out', @@ -95,8 +91,6 @@ class TestHuberLossOp(OpTest): no_grad_set=set("residual")) def test_check_grad_ingore_y(self): - if self.dtype == np.float16: - return self.check_grad_with_place( self.place, ['X'], 'Out', diff --git a/python/paddle/fluid/tests/unittests/npu/test_label_smooth_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_label_smooth_op_npu.py index 6e5b4c012053f7e5e8cee28c7d54be3152ecb4cd..d02ddae461ba5c4182c03c70f7b7e39b639baa9d 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_label_smooth_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_label_smooth_op_npu.py @@ -78,8 +78,10 @@ class TestLabelSmoothOp(OpTest): def test_check_grad(self): if self.dtype == np.float16: - return - self.check_grad_with_place(self.place, ['X'], 'Out') + self.check_grad_with_place( + self.place, ['X'], 'Out', max_relative_error=0.5) + else: + self.check_grad_with_place(self.place, ['X'], 'Out') class TestLabelSmoothOpWithPriorDist(TestLabelSmoothOp): diff --git a/python/paddle/fluid/tests/unittests/npu/test_leaky_relu_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_leaky_relu_op_npu.py index 590a961269989548ee03ed550bcb6ef3faa527f0..a0472f9611eb01c8230efae3555025967398f2f0 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_leaky_relu_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_leaky_relu_op_npu.py @@ -63,8 +63,10 @@ class TestLeadyRelu(OpTest): def test_check_grad(self): if self.dtype == np.float16: - return - self.check_grad_with_place(self.place, ['X'], 'Out') + self.check_grad_with_place( + self.place, ['X'], 'Out', max_relative_error=0.006) + else: + self.check_grad_with_place(self.place, ['X'], 'Out') class TestLeadyReluFP16(TestLeadyRelu): diff --git a/python/paddle/fluid/tests/unittests/npu/test_log_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_log_op_npu.py index 9534431e99a7a2e0218fe08dfd95a770b9924915..5da3cb0ce56503da8edda2506077f7de273375ef 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_log_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_log_op_npu.py @@ -50,12 +50,8 @@ class TestLog(OpTest): def test_check_output(self): self.check_output_with_place(self.place) - # TODO(ascendrc): Add grad test - # def test_check_grad(self): - # if self.dtype == np.float16: - # return - # self.check_grad(['X'], 'Out') - # + def test_check_grad(self): + self.check_grad(['X'], 'Out') class TestLogFp16(OpTest): diff --git a/python/paddle/fluid/tests/unittests/npu/test_log_softmax_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_log_softmax_op_npu.py index f6baefec7f29e769a23a8777b0a5796289c6606d..10ec8621ffa58d9a2ada40f2ff6537a685094cc5 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_log_softmax_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_log_softmax_op_npu.py @@ -63,9 
+63,13 @@ class TestLogSoftmaxNPUOp(OpTest): def test_check_grad(self): if self.dtype == np.float16: - return - self.check_grad_with_place( - self.place, ['X'], ['Out'], user_defined_grads=[self.x_grad]) + self.check_grad_with_place( + self.place, ['X'], ['Out'], + user_defined_grads=[self.x_grad], + max_relative_error=0.02) + else: + self.check_grad_with_place( + self.place, ['X'], ['Out'], user_defined_grads=[self.x_grad]) def test_class(op_type, typename): diff --git a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py index fefff0974ae40d2b9ac9d1a5f81410283cef0761..8ec9eb1cf3572703c656408da21a1f2f3d79123e 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py @@ -77,8 +77,10 @@ class TestLookupTableV2(OpTest): def test_check_grad(self): if self.dtype == np.float16: - return - self.check_grad_with_place(self.place, ['W'], 'Out') + self.check_grad_with_place( + self.place, ['W'], 'Out', max_relative_error=0.01) + else: + self.check_grad_with_place(self.place, ['W'], 'Out') class TestLookupTableV2FP16(TestLookupTableV2): diff --git a/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_v2_op_npu.py index f3df1fca30749e55599c7f19d336ddb9ff41edbd..ec51dcf3f8e3e107574dc02ee69693664b74ff36 100755 --- a/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_v2_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_v2_op_npu.py @@ -39,10 +39,11 @@ class TestNearestInterpOp(OpTest): self.set_npu() self.out_size = None self.actual_shape = None + self.init_dtype() self.data_layout = 'NCHW' self.init_test_case() self.op_type = "nearest_interp_v2" - input_np = np.random.random(self.input_shape).astype("float32") + input_np = np.random.random(self.input_shape).astype(self.dtype) if self.data_layout == "NCHW": in_h = self.input_shape[2] @@ -95,8 +96,21 @@ class TestNearestInterpOp(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - self.check_grad_with_place( - self.place, ['X'], 'Out', in_place=True, max_relative_error=0.006) + if self.dtype == np.float16: + self.check_grad_with_place( + self.place, ['X'], + 'Out', + in_place=True, + max_relative_error=0.02) + else: + self.check_grad_with_place( + self.place, ['X'], + 'Out', + in_place=True, + max_relative_error=0.006) + + def init_dtype(self): + self.dtype = np.float32 def init_test_case(self): self.interp_method = 'nearest' @@ -108,6 +122,11 @@ class TestNearestInterpOp(OpTest): self.align_corners = False +class TestNearestNeighborInterpFP16(TestNearestInterpOp): + def init_dtype(self): + self.dtype = np.float16 + + class TestNearestNeighborInterpCase1(TestNearestInterpOp): def init_test_case(self): self.interp_method = 'nearest' diff --git a/python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py index 2c41f09ff51488dd8e6eff48fa0dec0a6917bf50..8e28b3fe413b071d63123203d0f2a842f6d041ba 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py @@ -54,9 +54,6 @@ class TestNPUNormOp(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - if self.dtype == np.float16: - return - self.check_grad_with_place( self.place, ['X'], 'Out', max_relative_error=0.006) 
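The recurring change across the NPU op tests in this patch is to stop returning early from gradient checks when the dtype is float16 and instead run check_grad_with_place with a relaxed max_relative_error, while float32 keeps the tight tolerance and the CPU numeric reference. A minimal sketch of that pattern, modelled on the relu test change further below; the class name and the 0.006 tolerance are illustrative, not taken from any single test.

import unittest
import numpy as np
import paddle
from op_test import OpTest  # same test-suite helper the files above use


class ExampleFP16GradCheck(OpTest):
    def setUp(self):
        self.op_type = "relu"
        self.place = paddle.NPUPlace(0)
        self.__class__.use_npu = True
        self.dtype = np.float16
        x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
        # Keep inputs away from the non-differentiable point at zero.
        x[np.abs(x) < 0.005] = 0.02
        self.inputs = {'X': x}
        self.outputs = {'Out': np.maximum(x, 0)}

    def test_check_grad(self):
        if self.dtype == np.float16:
            # fp16 gradients are noisier on NPU, so the tolerance is relaxed
            # instead of skipping the check outright.
            self.check_grad_with_place(
                self.place, ['X'], 'Out', max_relative_error=0.006)
        else:
            self.check_grad_with_place(self.place, ['X'], 'Out')


if __name__ == "__main__":
    unittest.main()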
diff --git a/python/paddle/fluid/tests/unittests/npu/test_p_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_p_norm_op_npu.py index 3b75cba60b103fce118d2b0aca6eacf50fe9b809..a7ca4edc524be12e00536fa8f08bb6223004943f 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_p_norm_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_p_norm_op_npu.py @@ -51,8 +51,6 @@ class TestPnormOp(OpTest): self.check_output_with_place(paddle.NPUPlace(0)) def test_check_grad(self): - if self.dtype == "float16": - return self.check_grad_with_place( paddle.NPUPlace(0), ['X'], 'Out', user_defined_grads=self.gradient) diff --git a/python/paddle/fluid/tests/unittests/npu/test_pad_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_pad_op_npu.py index 7d6c3b9bdb444667e986fa92f6be5963eaf71f97..d1d2e8b3467be10ff075f357ccf7cb43ef263db7 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_pad_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_pad_op_npu.py @@ -50,9 +50,10 @@ class TestPadOp(OpTest): def test_check_grad_normal(self): if self.dtype == np.float16: - return - - self.check_grad_with_place(self.place, ['X'], 'Out') + self.check_grad_with_place( + self.place, ['X'], 'Out', max_relative_error=0.6) + else: + self.check_grad_with_place(self.place, ['X'], 'Out') def set_npu(self): self.__class__.use_npu = True diff --git a/python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py index 2b8550a88de592d70299111f8d33b4e978f2177a..4822abc3b25ebed695bc6d0a9fe6b564cef3ab63 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py @@ -67,9 +67,6 @@ def create_test_fp16_class(parent): self.use_cudnn = False self.dtype = np.float16 - def test_check_grad(self): - return - cls_name = "{0}_{1}".format(parent.__name__, "Fp16Op") TestFp16Case.__name__ = cls_name globals()[cls_name] = TestFp16Case diff --git a/python/paddle/fluid/tests/unittests/npu/test_reciprocal_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_reciprocal_op_npu.py index e8f5de005d421566451bff7a211961a311da3195..899d4ef43bd860251da77cb42b482343d5643fba 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_reciprocal_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_reciprocal_op_npu.py @@ -40,8 +40,6 @@ class TestNPUReciprocal(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - if self.dtype == np.float16: - return self.check_grad_with_place( self.place, ['X'], 'Out', max_relative_error=0.01) diff --git a/python/paddle/fluid/tests/unittests/npu/test_relu6_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_relu6_op_npu.py index 601a351c015f3258ebd23732dd0f76282e8f7d8e..b1cb5e02a731f8bbbc36097a73b609909fc2320b 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_relu6_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_relu6_op_npu.py @@ -56,8 +56,6 @@ class TestRelu6(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - if self.dtype == np.float16: - return self.check_grad_with_place(self.place, ['X'], 'Out') def init_dtype(self): diff --git a/python/paddle/fluid/tests/unittests/npu/test_relu_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_relu_op_npu.py index a2547808e6f161ae1cdac5ea5944863d7c640d24..c909b14b5141fe1725e10642abec57eb416c1af8 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_relu_op_npu.py +++ 
b/python/paddle/fluid/tests/unittests/npu/test_relu_op_npu.py @@ -34,11 +34,12 @@ class TestRelu(OpTest): self.init_dtype() np.random.seed(SEED) - x = np.random.rand(3, 2).astype(self.dtype) - out = x - self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} - self.attrs = {} + x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype) + # The same reason with TestAbs + x[np.abs(x) < 0.005] = 0.02 + out = np.maximum(x, 0) + self.inputs = {'X': x} self.outputs = {'Out': out} def set_npu(self): @@ -50,32 +51,18 @@ class TestRelu(OpTest): def test_check_output(self): self.check_output_with_place(self.place) + def test_check_grad(self): + if self.dtype == np.float16: + self.check_grad_with_place( + self.place, ['X'], 'Out', max_relative_error=0.006) + else: + self.check_grad_with_place(self.place, ['X'], 'Out') -class TestReluFp16(OpTest): - def setUp(self): - self.set_npu() - self.op_type = "relu" - self.place = paddle.NPUPlace(0) - - self.init_dtype() - np.random.seed(SEED) - x = np.random.rand(3, 2).astype(self.dtype) - out = x - - self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} - self.attrs = {} - self.outputs = {'Out': out} - - def set_npu(self): - self.__class__.use_npu = True - self.__class__.no_need_check_grad = True +class TestReluFp16(TestRelu): def init_dtype(self): self.dtype = np.float16 - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-5) - class TestReluNeg(OpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/npu/test_sigmoid_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_sigmoid_op_npu.py index 4516b25b59d9c080a4ff12de162440da1f196150..489f8bfb116a19cfaf3348f647cd584483788a75 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_sigmoid_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_sigmoid_op_npu.py @@ -44,8 +44,6 @@ class TestNPUSigmoid(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - if self.dtype == np.float16: - return self.check_grad_with_place( self.place, ['X'], 'Out', max_relative_error=0.01) diff --git a/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py index 611691109e187b98d67379a3952fea0e0afd88e9..a5b203b6eea2a6c147194aabe36cbc6c600ae971 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py @@ -58,12 +58,17 @@ class TestSliceOp(OpTest): self.place = paddle.NPUPlace(0) def test_check_output(self): - self.check_output_with_place(self.place) + if self.dtype == np.float16: + self.check_output_with_place(self.place) + else: + self.check_output_with_place(self.place) def test_check_grad_normal(self): if self.dtype == np.float16: - return - self.check_grad_with_place(self.place, ['Input'], 'Out') + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.02) + else: + self.check_grad_with_place(self.place, ['Input'], 'Out') class TestSliceOp2(TestSliceOp): @@ -347,8 +352,10 @@ class TestSliceOpDecsDim(OpTest): def test_check_grad_normal(self): if self.dtype == np.float16: - return - self.check_grad_with_place(self.place, ['Input'], 'Out') + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.5) + else: + self.check_grad_with_place(self.place, ['Input'], 'Out') class TestSliceOpDecsDimFp16(TestSliceOpDecsDim): diff --git a/python/paddle/fluid/tests/unittests/npu/test_softmax_with_cross_entropy_op_npu.py 
b/python/paddle/fluid/tests/unittests/npu/test_softmax_with_cross_entropy_op_npu.py index 8d78ee6a97efdd1df99c9636e8e18a2905d858a5..f0ca7788345765f5fcef3ebf23e5ca25bc97eaea 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_softmax_with_cross_entropy_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_softmax_with_cross_entropy_op_npu.py @@ -87,8 +87,6 @@ class TestSoftmaxWithCrossEntropyOp(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - if self.dtype == np.float16: - return # fp32 has low precision, cpu and npu both need to relax the max_relative_error if using fp32 self.check_grad_with_place( self.place, ['Logits'], diff --git a/python/paddle/fluid/tests/unittests/npu/test_sqrt_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_sqrt_op_npu.py index acb99746d231ded16032bfdc1839b6b0f3120f62..24b34fa625c6339f6b990076fe6d0a874e7ba316 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_sqrt_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_sqrt_op_npu.py @@ -50,12 +50,11 @@ class TestSqrt(OpTest): def test_check_output(self): self.check_output_with_place(self.place) - # TODO(ascendrc): Add grad test - # def test_check_grad(self): - # if self.dtype == np.float16: - # return - # self.check_grad(['X'], 'Out') - # + def test_check_grad(self): + if self.dtype == np.float16: + self.check_grad(['X'], 'Out', max_relative_error=0.009) + else: + self.check_grad(['X'], 'Out', max_relative_error=0.009) class TestSqrtFp16(OpTest): diff --git a/python/paddle/fluid/tests/unittests/npu/test_square_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_square_op_npu.py index caf55b4850f0b18f0fb20ed5692119c2b4ceccc2..170f6b6ca4f934c1bf29433502718b5fc35b25d4 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_square_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_square_op_npu.py @@ -51,8 +51,6 @@ class TestSquare(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - if self.dtype == np.float16: - return self.check_grad_with_place(self.place, ['X'], 'Out') diff --git a/python/paddle/fluid/tests/unittests/npu/test_tanh_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_tanh_op_npu.py index 55be94da2b7e0346d8c6783d244c9d3a2c43273e..375eef12291ec50af416c38292adecf17fa83277 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_tanh_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_tanh_op_npu.py @@ -50,12 +50,11 @@ class TestTanh(OpTest): def test_check_output(self): self.check_output_with_place(self.place) - # TODO(ascendrc): Add grad test - # def test_check_grad(self): - # if self.dtype == np.float16: - # return - # self.check_grad(['X'], 'Out') - # + def test_check_grad(self): + if self.dtype == np.float16: + self.check_grad(['X'], 'Out', max_relative_error=0.009) + else: + self.check_grad(['X'], 'Out', max_relative_error=0.009) class TestTanhFp16(OpTest): diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 457f20ac5b06be0afb6929dc74148eb42b3ce4db..530ea2838a76fcf01a3047be56f46dea0232619e 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -715,10 +715,11 @@ class OpTest(unittest.TestCase): assert related_idx >= 0, "%d-th arguments don't have default value" % idx return defaults[related_idx] - def remove_name(x): - if isinstance(x, list): return [i for i in x if i != 'name'] + def filter_by_name(x): + names = set(['name', 'out', 'output']) + if 
isinstance(x, list): return [i for i in x if i not in names] if isinstance(x, dict): - return {k: v for k, v in x.items() if k != 'name'} + return {k: v for k, v in x.items() if k not in names} assert False, "Only support list or dict." def to_defaults_list(params, defaults): @@ -728,7 +729,7 @@ class OpTest(unittest.TestCase): # Because we don't know the python api name of each arguments. # using parse_arg_and_kwargs, we can get the all api information we need. api_params, api_defaults = [ - remove_name(item) for item in parse_arg_and_kwargs(api) + filter_by_name(item) for item in parse_arg_and_kwargs(api) ] api_defaults = to_defaults_list(api_params, api_defaults) inputs_sig, attrs_sig, outputs_sig = kernel_sig @@ -784,10 +785,10 @@ class OpTest(unittest.TestCase): block = fluid.default_main_program().global_block() op_proto = OpProtoHolder.instance().get_op_proto(self.op_type) # prepare input variable - inputs = self.append_input_output_for_dygraph(op_proto, self.inputs, - True, False, block) + eager_tensor_inputs = self.append_input_output_for_dygraph( + op_proto, self.inputs, True, False, block) # prepare output variable - outputs = self.append_input_output_for_dygraph( + eager_tensor_outputs = self.append_input_output_for_dygraph( op_proto, self.outputs, False, False, block) # prepare attrbutes @@ -798,13 +799,14 @@ class OpTest(unittest.TestCase): attrs_outputs[attrs_name] = self.attrs[attrs_name] kernel_sig = _dygraph_tracer()._get_kernel_signature( - self.op_type, inputs, outputs, attrs_outputs) + self.op_type, eager_tensor_inputs, eager_tensor_outputs, + attrs_outputs) assert hasattr( self, "python_api" ), "Please set the `self.python_api` if you want to compare python api output." - args = prepare_python_api_arguments(self.python_api, inputs, - attrs_outputs, kernel_sig) + args = prepare_python_api_arguments( + self.python_api, eager_tensor_inputs, attrs_outputs, kernel_sig) """ we directly return the cal_python_api value because the value is already tensor. 
""" return cal_python_api(self.python_api, args, kernel_sig) @@ -1286,11 +1288,11 @@ class OpTest(unittest.TestCase): with _test_eager_guard(): eager_dygraph_outs = self._calc_dygraph_output( place, no_check_set=no_check_set) - # we only check end2end api when check_eager=True - if hasattr(self, "python_api"): - api_outs = self._calc_python_api_output(place) - self._check_api_outs_by_dygraph_outs(api_outs, dygraph_outs, - place) + # we only check end2end api when check_eager=True + if hasattr(self, "python_api"): + api_outs = self._calc_python_api_output(place) + self._check_api_outs_by_dygraph_outs(api_outs, dygraph_outs, + place) outs, fetch_list = self._calc_output(place, no_check_set=no_check_set) diff --git a/python/paddle/fluid/tests/unittests/op_test_xpu.py b/python/paddle/fluid/tests/unittests/op_test_xpu.py index 50ea065209422d2c972e480fbbd9a9442b5e5c25..6c964a828eed7eb01bce68b81baab61c66c5cf43 100644 --- a/python/paddle/fluid/tests/unittests/op_test_xpu.py +++ b/python/paddle/fluid/tests/unittests/op_test_xpu.py @@ -123,17 +123,26 @@ class XPUOpTest(OpTest): return super().check_grad_with_place( place, inputs_to_check, output_names, no_grad_set, numeric_grad_delta, in_place, max_relative_error, - user_defined_grads, user_defined_grads, check_dygraph) + user_defined_grads, user_defined_grad_outputs, check_dygraph) a1 = self.get_grad_with_place( - place, inputs_to_check, output_names, no_grad_set=no_grad_set) + place, + inputs_to_check, + output_names, + no_grad_set=no_grad_set, + user_defined_grad_outputs=user_defined_grad_outputs) a2 = self.get_grad_with_place( - place, inputs_to_check, output_names, no_grad_set=no_grad_set) + place, + inputs_to_check, + output_names, + no_grad_set=no_grad_set, + user_defined_grad_outputs=user_defined_grad_outputs) a3 = self.get_grad_with_place( paddle.CPUPlace(), inputs_to_check, output_names, - no_grad_set=no_grad_set) + no_grad_set=no_grad_set, + user_defined_grad_outputs=user_defined_grad_outputs) self._assert_is_close(a1, a2, inputs_to_check, 0.00000001, "Gradient Check On two xpu") self._assert_is_close(a1, a3, inputs_to_check, max_relative_error, @@ -147,7 +156,7 @@ class XPUOpTest(OpTest): numeric_grad_delta=0.005, in_place=False, max_relative_error=0.005, - user_defined_grads=None, + user_defined_grad_outputs=None, check_dygraph=True): self.scope = core.Scope() op_inputs = self.inputs if hasattr(self, "inputs") else dict() @@ -197,6 +206,10 @@ class XPUOpTest(OpTest): if not type(output_names) is list: output_names = [output_names] - analytic_grads = self._get_gradient(inputs_to_check, place, - output_names, no_grad_set) + analytic_grads = self._get_gradient( + inputs_to_check, + place, + output_names, + no_grad_set, + user_defined_grad_outputs=user_defined_grad_outputs) return analytic_grads diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py new file mode 100644 index 0000000000000000000000000000000000000000..91c340c35d478d9576dcc3f1b15d4d2300692c5a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py @@ -0,0 +1,141 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division +from __future__ import print_function + +import unittest +import os +import numpy as np +import random +import socket + +import paddle +import paddle.nn as nn +from paddle.fluid.dygraph.nn import Linear +import paddle.fluid.core as core +from paddle.fluid.framework import _test_eager_guard +import paddle.distributed as dist +from paddle.fluid.dygraph.parallel import ParallelEnv +from paddle.optimizer import SGD +from paddle.fluid.initializer import NumpyArrayInitializer + + +def net_is_used(port, ip='127.0.0.1'): + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + try: + s.connect((ip, port)) + s.shutdown(2) + return True + except Exception as e: + return False + + +def init_process_group(strategy=None): + nranks = ParallelEnv().nranks + rank = ParallelEnv().local_rank + is_master = True if rank == 0 else False + for port in range(20000, 21000): + if not net_is_used(port): + store = paddle.fluid.core.TCPStore("127.0.0.1", port, is_master, + nranks) + group = core.ProcessGroupNCCL(store, rank, nranks) + return group + + +class LinearModel(nn.Layer): + def __init__(self, attr_list): + super(LinearModel, self).__init__() + self._linear1 = paddle.nn.Linear( + 50, 30, weight_attr=attr_list[0], bias_attr=False) + self._linear2 = paddle.nn.Linear( + 30, 10, weight_attr=attr_list[1], bias_attr=False) + self._linear3 = paddle.nn.Linear( + 10, 10, weight_attr=attr_list[2], bias_attr=False) + + def forward(self, x): + output = self._linear1(x) + output = self._linear2(output) + output = self._linear3(output) + return output + + +class TestDistTraning(unittest.TestCase): + def test_multiple_gpus(self): + process_group = init_process_group() + self.generate_reducer("float32", process_group) + self.generate_reducer("float16", process_group) + + def generate_reducer(self, dtype, process_group): + dev_id = ParallelEnv().dev_id + np.random.seed(2022 + dev_id) + paddle.set_default_dtype(dtype) + + w_1 = paddle.ParamAttr(initializer=NumpyArrayInitializer( + np.random.rand(50, 30).astype(dtype))) + w_2 = paddle.ParamAttr(initializer=NumpyArrayInitializer( + np.random.rand(30, 10).astype(dtype))) + w_3 = paddle.ParamAttr(initializer=NumpyArrayInitializer( + np.random.rand(10, 10).astype(dtype))) + + attr_list = [w_1, w_2, w_3] + inp = np.random.rand(10, 50).astype(dtype) + + # original reducer + params_a = self.model_train(attr_list, inp) + + # refactored reducer in eager mode + with _test_eager_guard(): + params_b = self.model_train( + attr_list, inp, process_group=process_group) + + for i in range(len(params_a)): + np.testing.assert_allclose(params_a[i].numpy(), params_b[i].numpy()) + + def model_train(self, attr_list, inp, process_group=None): + model = LinearModel(attr_list) + model = paddle.DataParallel(model, process_group=process_group) + optimizer = SGD(learning_rate=0.0003, parameters=model.parameters()) + + x = paddle.to_tensor(inp) + x.stop_gradient = False + + for step in range(10): + y = model(x) + loss = y.mean() + + loss.backward() + optimizer.step() + optimizer.clear_grad() + + return model.parameters() + + +class 
TestCatchErrors1(unittest.TestCase): + def test_multiple_gpus(self): + linear = paddle.nn.Linear(2, 4) + with _test_eager_guard(): + self.assertRaises(RuntimeError, paddle.DataParallel, linear) + + +class TestCatchErrors2(unittest.TestCase): + def test_multiple_gpus(self): + with _test_eager_guard(): + linear = paddle.nn.Linear(2, 4) + self.assertRaises(RuntimeError, paddle.DataParallel, linear) + + +if __name__ == '__main__': + dist.init_parallel_env() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check_in_eager_mode.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check_in_eager_mode.py new file mode 100644 index 0000000000000000000000000000000000000000..214f41c78a3a5b2c285c7b412241bb59c8ee0a75 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check_in_eager_mode.py @@ -0,0 +1,163 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division +from __future__ import print_function + +import unittest +import os + +import paddle +import numpy as np +import paddle.distributed as dist +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Linear +from paddle.fluid.framework import _test_eager_guard +from paddle.fluid.dygraph.parallel import ParallelEnv +import paddle.fluid.core as core + +paddle.seed(1024) +np.random.seed(2021) + +batch = 5 +in_dim = 10 +out_dim = 20 + + +def init_process_group(strategy=None): + nranks = ParallelEnv().nranks + rank = ParallelEnv().local_rank + is_master = True if rank == 0 else False + store = paddle.fluid.core.TCPStore("127.0.0.1", 6174, is_master, nranks) + group = core.ProcessGroupNCCL(store, rank, nranks) + return group + + +class SimpleNet(fluid.Layer): + def __init__(self, train_id): + super(SimpleNet, self).__init__() + self.w1 = self.create_parameter( + shape=[in_dim, out_dim], dtype="float32") + self.w2 = self.create_parameter( + shape=[in_dim, out_dim], dtype="float32") + self.share_net = Linear(out_dim, 10) + + self.unused_param = self.create_parameter( + shape=[out_dim, in_dim], dtype="float64") + + # just for test sync_params_buffers + # self.register_buffer("queue", paddle.randn([10, 5])) + # self.queue = paddle.nn.functional.normalize(self.queue, axis=0) + # self.register_buffer("queue_ptr", paddle.zeros([1], 'int64')) + + self.trainer_id = train_id + + def forward(self, x): + is_use = (paddle.equal_all( + x, paddle.ones(shape=(batch, in_dim))).numpy()[0] and + self.trainer_id == 1) + + if is_use: + tmp = paddle.matmul(x, self.w1) + else: + tmp = paddle.matmul(x, self.w2) + + return self.share_net(tmp) + + +class TestDistTraning(unittest.TestCase): + def test_multiple_gpus(self): + dist.init_parallel_env() + self.trainer_id = dist.get_rank() + + process_group = init_process_group() + self.pg = process_group + with _test_eager_guard(): + + model_a = SimpleNet(self.trainer_id) + model_b = SimpleNet(self.trainer_id) + + state_dict = model_a.state_dict() + 
model_b.set_state_dict(state_dict) + + model_a = paddle.DataParallel( + model_a, + find_unused_parameters=True, + process_group=process_group) + model_b = paddle.DataParallel( + model_b, + find_unused_parameters=True, + process_group=process_group) + + ones_input = paddle.ones(shape=(batch, in_dim)) + ones_input.stop_gradient = True + + w1_grad_sum = np.zeros((in_dim, out_dim), dtype='float32') + w2_grad_sum = np.zeros((in_dim, out_dim), dtype='float32') + + for step_id in range(5): + print("==============", step_id) + random_input = paddle.rand(shape=(batch, in_dim)) + random_input.stop_gradient = True + + if step_id % 2 == 0: + out_a = model_a(random_input) + out_b = model_b(random_input) + else: + out_a = model_a(ones_input) + out_b = model_b(ones_input) + + out_a.sum().backward() + out_b.sum().backward() + + self.check_gradient(model_a.parameters()) + self.check_gradient(model_b.parameters()) + + # test acc gradient + w1_grad_sum = self.check_acc(model_a._layers.w1.grad, + w1_grad_sum, + model_b._layers.w1.grad) + w2_grad_sum = self.check_acc(model_a._layers.w2.grad, + w2_grad_sum, + model_b._layers.w2.grad) + + model_a.clear_gradients() + + def check_acc(self, grad, grad_sum, acc_grad): + if grad is not None: + grad_sum = grad_sum + grad.numpy() + acc_grad = acc_grad.numpy() if acc_grad is not None else None + np.testing.assert_allclose(grad_sum, acc_grad, rtol=1e-6) + return grad_sum + + def print_trainer_0(self, *args): + if self.trainer_id == 0: + print(*args) + + def broadcast_param(self, param, root): + self.pg.broadcast(param, root) + return param + + def check_gradient(self, params): + other_param = [] + for param in params: + if param.trainable and (param.grad is not None): + grad = param.grad + other_grad = self.broadcast_param(grad, root=1) + if self.trainer_id == 0: + np.testing.assert_allclose(other_grad.numpy(), grad.numpy()) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/process_group_gloo.py b/python/paddle/fluid/tests/unittests/process_group_gloo.py index c62c4615f74707796946137d3b44efc3cc8aeee9..b1f3a71ab3e94c7db53048b95d73795d155bd122 100644 --- a/python/paddle/fluid/tests/unittests/process_group_gloo.py +++ b/python/paddle/fluid/tests/unittests/process_group_gloo.py @@ -47,9 +47,7 @@ class TestProcessGroupFp32(unittest.TestCase): is_master = True if rank == 0 else False store = paddle.fluid.core.TCPStore("127.0.0.1", 6172, is_master, nranks, datetime.timedelta(0)) - gloo_store = paddle.fluid.core.GlooStore(store) - opt = paddle.fluid.core.GlooOptions() - pg = paddle.fluid.core.ProcessGroupGloo(gloo_store, rank, nranks) + pg = paddle.fluid.core.ProcessGroupGloo(store, rank, nranks) # test allreduce sum # rank 0 diff --git a/python/paddle/fluid/tests/unittests/test_custom_grad_input.py b/python/paddle/fluid/tests/unittests/test_custom_grad_input.py index bc280a01890d4a54f76026ccee31666c5f0ff2a8..83a25b71626e1b84ae0f85eeccee5423205dc978 100644 --- a/python/paddle/fluid/tests/unittests/test_custom_grad_input.py +++ b/python/paddle/fluid/tests/unittests/test_custom_grad_input.py @@ -20,6 +20,7 @@ import numpy as np import paddle import paddle.fluid.dygraph as dg from op_test import OpTest +from paddle.fluid.framework import _test_eager_guard class TestTensorBackward(unittest.TestCase): @@ -29,7 +30,7 @@ class TestTensorBackward(unittest.TestCase): if paddle.is_compiled_with_cuda(): self._places.append(paddle.CUDAPlace(0)) - def test_tensor_backward(self): + def func_tensor_backward(self): for dtype in self._dtypes: x = 
np.random.random([2, 100]).astype(dtype) y = np.random.random([100, 2]).astype(dtype) @@ -48,6 +49,11 @@ class TestTensorBackward(unittest.TestCase): self.assertTrue(np.allclose(x_grad, x_tensor.grad.numpy())) + def test_tensor_backward(self): + with _test_eager_guard(): + self.func_tensor_backward() + self.func_tensor_backward() + class TestBackwardAPI(unittest.TestCase): def setUp(self): @@ -56,7 +62,7 @@ class TestBackwardAPI(unittest.TestCase): if paddle.is_compiled_with_cuda(): self._places.append(paddle.CUDAPlace(0)) - def test_backward_api(self): + def func_backward_api(self): for dtype in self._dtypes: x = np.random.random([2, 2]).astype(dtype) y = np.random.random([2, 2]).astype(dtype) @@ -78,7 +84,12 @@ class TestBackwardAPI(unittest.TestCase): self.assertTrue( np.allclose(x_grad * 2, x_tensor.grad.numpy())) - def test_backward_single_tensor(self): + def test_backward_api(self): + with _test_eager_guard(): + self.func_backward_api() + self.func_backward_api() + + def func_backward_single_tensor(self): for dtype in self._dtypes: x = np.random.random([2, 2]).astype(dtype) y = np.random.random([2, 2]).astype(dtype) @@ -97,7 +108,12 @@ class TestBackwardAPI(unittest.TestCase): self.assertTrue(np.allclose(x_grad, x_tensor.grad.numpy())) - def test_backward_none_grad_tensor(self): + def test_backward_single_tensor(self): + with _test_eager_guard(): + self.func_backward_single_tensor() + self.func_backward_single_tensor() + + def func_backward_none_grad_tensor(self): for dtype in self._dtypes: x = np.random.random([2, 2]).astype(dtype) y = np.random.random([2, 2]).astype(dtype) @@ -115,7 +131,12 @@ class TestBackwardAPI(unittest.TestCase): self.assertTrue(np.allclose(x_grad, x_tensor.grad.numpy())) - def test_backward_accumulator_with_init_grad(self): + def test_backward_none_grad_tensor(self): + with _test_eager_guard(): + self.func_backward_none_grad_tensor() + self.func_backward_none_grad_tensor() + + def func_backward_accumulator_with_init_grad(self): for dtype in self._dtypes: x = np.random.random([10, ]).astype(dtype) y_grad = np.random.random([10, ]).astype(dtype) @@ -134,11 +155,14 @@ class TestBackwardAPI(unittest.TestCase): y = x**2 z = x**3 - x_grad = 2 * x_tensor * ( - y_grad_tensor + 3 * y_tensor * y_tensor * z_grad_tensor) + x_grad = 2 * x * (y_grad + 3 * y * y * z_grad) - self.assertTrue( - np.allclose(x_grad.numpy(), x_tensor.grad.numpy())) + self.assertTrue(np.allclose(x_grad, x_tensor.grad.numpy())) + + def test_backward_accumulator_with_init_grad(self): + with _test_eager_guard(): + self.func_backward_accumulator_with_init_grad() + self.func_backward_accumulator_with_init_grad() if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_diag_v2.py b/python/paddle/fluid/tests/unittests/test_diag_v2.py index 0371fa054282bb009889c90c9de4da58894fad8f..9f727608f816c4e818f50f12d4d5cc1fccf04bdb 100644 --- a/python/paddle/fluid/tests/unittests/test_diag_v2.py +++ b/python/paddle/fluid/tests/unittests/test_diag_v2.py @@ -44,6 +44,10 @@ class TestDiagV2Op(OpTest): paddle.enable_static() self.check_output(check_eager=True) + def test_check_grad(self): + paddle.enable_static() + self.check_grad(['X'], 'Out', check_eager=True) + def init_config(self): pass @@ -62,14 +66,14 @@ class TestDiagV2OpCase2(TestDiagV2Op): class TestDiagV2OpCase3(TestDiagV2Op): def init_config(self): - self.x = np.random.randint(-10, 10, size=(10, 10)) + self.x = np.random.randint(-10, 10, size=(10, 10)).astype("float64") self.out = np.diag(self.x, self.offset) class 
TestDiagV2OpCase4(TestDiagV2Op): def init_config(self): self.x = np.random.rand(100) - self.padding_value = 8 + self.padding_value = 2 n = self.x.size self.out = self.padding_value * np.ones((n, n)) + np.diag( self.x, self.offset) - np.diag(self.padding_value * np.ones(n)) diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py index 156fdcb9b0abe1ea2dcca0e15bbcfec87b8ebf7a..98ef339e04535bb943add02b6cf6efe490f0354b 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py @@ -50,9 +50,9 @@ class EagerScaleTestCase(unittest.TestCase): data_eager.retain_grads() out_eager = core.eager.scale(data_eager, 1.0, 0.9, True, True) - self.assertFalse(data_eager.grad._is_initialized()) + self.assertIsNone(data_eager.grad) out_eager.backward(grad_eager, False) - self.assertTrue(data_eager.grad._is_initialized()) + self.assertIsNotNone(data_eager.grad) self.assertTrue(np.array_equal(data_eager.grad.numpy(), input_data)) def test_retain_grad_and_run_backward_raises(self): @@ -72,7 +72,7 @@ class EagerScaleTestCase(unittest.TestCase): data_eager.retain_grads() out_eager = core.eager.scale(data_eager, 1.0, 0.9, True, True) - self.assertFalse(data_eager.grad._is_initialized()) + self.assertIsNone(data_eager.grad) with self.assertRaisesRegexp( AssertionError, "The type of grad_tensor must be paddle.Tensor"): @@ -632,13 +632,13 @@ class EagerVariablePropertiesAndMethodsTestCase(unittest.TestCase): tensor2.persistable = True tensor2.stop_gradient = False if core.is_compiled_with_cuda(): - tensor3 = tensor2._copy_to(True, core.CUDAPlace(0)) + tensor3 = tensor2._copy_to(core.CUDAPlace(0), True) self.assertTrue(np.array_equal(tensor3.numpy(), arr2)) self.assertTrue(tensor3.persistable, True) self.assertTrue(tensor3.stop_gradient, True) self.assertTrue(tensor3.place.is_gpu_place()) else: - tensor3 = tensor2._copy_to(True, core.CPUPlace()) + tensor3 = tensor2._copy_to(core.CPUPlace(), True) self.assertTrue(np.array_equal(tensor3.numpy(), arr2)) self.assertTrue(tensor3.persistable, True) self.assertTrue(tensor3.stop_gradient, True) diff --git a/python/paddle/fluid/tests/unittests/test_einsum.py b/python/paddle/fluid/tests/unittests/test_einsum.py index 13e763bee630517d1caaa5432f86882c37aab449..43b5ce96a390150db7e29588e4107271b240b23f 100644 --- a/python/paddle/fluid/tests/unittests/test_einsum.py +++ b/python/paddle/fluid/tests/unittests/test_einsum.py @@ -26,14 +26,14 @@ class TestErrors(unittest.TestCase): def test_diagonalize_errors(self): a = np.arange(4 * 3 * 4 * 4).reshape(4, 3, 4, 4).astype('float') a = paddle.to_tensor(a) - with self.assertRaisesRegex(AssertionError, ( - 'Diagonal and trace not implemented yet.')): + with self.assertRaisesRegex(AssertionError, + ('Duplicate labels are not supported.')): paddle.einsum('...ii->...i', a) - with self.assertRaisesRegex(AssertionError, ( - 'Diagonal and trace not implemented yet.')): + with self.assertRaisesRegex(AssertionError, + ('Duplicate labels are not supported.')): paddle.einsum('i...i', a) - with self.assertRaisesRegex(AssertionError, ( - 'Diagonal and trace not implemented yet.')): + with self.assertRaisesRegex(AssertionError, + ('Duplicate labels are not supported.')): paddle.einsum('i...i->i...', a) def test_param_errors(self): @@ -396,6 +396,51 @@ class TestNumpyTests(unittest.TestCase): self.check_output('a...b,b...c,c...a', a, a, a) self.check_output('...ab,...ba,...ab,...ab', a, a, a, a) + def 
test_static_graph(self): + paddle.enable_static() + fluid = paddle.fluid + if fluid.core.is_compiled_with_cuda(): + self.place = fluid.CUDAPlace(0) + else: + self.place = fluid.CPUPlace() + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + a = paddle.static.data( + name='a', shape=[3, None, None, None], dtype='float') + b = paddle.static.data( + name='b', shape=[2, None, None, None], dtype='float') + c = paddle.static.data( + name='c', shape=[None, None, 2, None], dtype='float') + d = paddle.static.data( + name='d', shape=[None, None, 5], dtype='float') + e = paddle.static.data( + name='e', shape=[None, 2, None], dtype='float') + + outs = [] + outs.append(paddle.einsum("ibnd,jbnd->bnij", a, b)) + outs.append(paddle.einsum('...ik, ...j', c, d)) + outs.append(paddle.einsum('...kj, ...ik', d, e)) + outs.append(paddle.einsum('ijk..., ikj', c, e)) + outs.append(paddle.einsum('ijk..., ikj->...ij', c, e)) + exe = fluid.Executor(self.place) + exe.run(startup) + a = np.arange(72).reshape(3, 2, 3, 4).astype('float') + b = np.arange(48).reshape(2, 2, 3, 4).astype('float') + c = np.arange(48).reshape(2, 3, 2, 4).astype('float') + d = np.arange(30).reshape(2, 3, 5).astype('float') + e = np.arange(12).reshape(2, 2, 3).astype('float') + feeds = {'a': a, 'b': b, 'c': c, 'd': d, 'e': e} + actual = exe.run(main, feed=feeds, fetch_list=[outs]) + expect = [] + expect.append(np.einsum("ibnd,jbnd->bnij", a, b)) + expect.append(np.einsum('...ik, ...j', c, d)) + expect.append(np.einsum('...kj, ...ik', d, e)) + expect.append(np.einsum('ijk..., ikj', c, e)) + expect.append(np.einsum('ijk..., ikj->...ij', c, e)) + for a, e in zip(actual, expect): + self.check_output_equal(a, e) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py index d1d391a3949ead28697c0756803e873c41914079..318e826058f2c111f825b113c8ee4676ff87d630 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py @@ -17,7 +17,7 @@ import unittest import numpy as np import paddle import paddle.fluid.core as core -from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 +from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py index 00967cb503fe5fd677839a869798964bb5fb0b71..b35b2840ed30a2650e6e19a4cfbc381f50fd5024 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py @@ -23,7 +23,7 @@ import paddle.fluid.core as core from paddle.fluid import Program, compiler, program_guard from paddle.fluid.op import Operator -from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 +from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 class ElementwiseMulOp(OpTest): diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py index d2bffbe074f2a9bf63975831560597067508aaf5..0ae005430e03b046d609c393fcc0641a0d3db49e 100755 --- 
a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py @@ -213,9 +213,9 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer): set(parameters), set([ 'fc_2.b_0', 'num_good_steps_0', 'fc_2.w_0', 'loss_scaling_0', - 'num_bad_steps_0', 'fc_2.w_0_velocity_0', 'fc_2.w_0_asp_mask', - 'learning_rate_0', 'fc_1.b_0', 'fc_1.w_0_asp_mask', - 'fc_0.w_0_asp_mask', 'fc_1.b_0_velocity_0', + 'num_bad_steps_0', 'fc_2.w_0_velocity_0', 'fc_2.w_0.asp_mask', + 'learning_rate_0', 'fc_1.b_0', 'fc_1.w_0.asp_mask', + 'fc_0.w_0.asp_mask', 'fc_1.b_0_velocity_0', 'fc_2.b_0_velocity_0' ])) self.assertEqual(ops, [ diff --git a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py index cd4ba5b054264afca65d4c4d8359eb1854fbb658..7436e9eb7b12623296d7a714e742cc4212c4ca91 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,6 +19,9 @@ from paddle.vision.models import resnet50, resnet101 import unittest from unittest import TestCase import numpy as np +import paddle.compat as cpt +from paddle.fluid.framework import _test_eager_guard +import paddle.fluid.core as core def _dygraph_guard_(func): @@ -40,6 +43,80 @@ def random_var(size, low=-1, high=1, dtype='float32'): return fluid.dygraph.to_variable(x_np) +class TestEagerGrad(TestCase): + def func_simple_example_eager_grad(self): + np.random.seed(2021) + paddle.set_device('cpu') + np_x = np.random.random((3, 3)) + np_y = np.random.random((3, 1)) + x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False) + y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False) + out = paddle.matmul(x, y) + dx = fluid.dygraph.grad(out, x) + + dout = np.ones_like(np_y) + expected_dx = np.matmul(dout, np.transpose(np_y)) + + # stop_gradient = !create_graph, create_graph default false + self.assertEqual(dx[0].stop_gradient, True) + self.assertTrue(np.allclose(dx[0].numpy(), expected_dx[0])) + + def test_simple_example_eager_grad(self): + with _test_eager_guard(): + self.func_simple_example_eager_grad() + self.func_simple_example_eager_grad() + + def func_simple_example_eager_grad_allow_unused(self): + np.random.seed(2021) + paddle.set_device('cpu') + np_x = np.random.random((3, 3)) + np_y = np.random.random((3, 1)) + np_z = np.random.random((3, 1)) + x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False) + y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False) + z = paddle.to_tensor(np_z, dtype="float64", stop_gradient=False) + out_z = paddle.nn.functional.sigmoid(z) + out = paddle.matmul(x, y) + + dx = fluid.dygraph.grad(out, [x, z], allow_unused=True) + dout = np.ones_like(np_y) + expected_dx = np.matmul(dout, np.transpose(np_y)) + self.assertTrue(np.allclose(dx[0].numpy(), expected_dx[0])) + # stop_gradient = !create_graph, create_graph default false + self.assertEqual(dx[0].stop_gradient, True) + # x is unused input in the graph + self.assertEqual(dx[1], None) + + def test_simple_example_eager_grad_allow_unused(self): + with _test_eager_guard(): + 
self.func_simple_example_eager_grad_allow_unused() + self.func_simple_example_eager_grad_allow_unused() + + def func_simple_example_eager_grad_not_allow_unused(self): + np.random.seed(2021) + paddle.set_device('cpu') + np_x = np.random.random((3, 3)) + np_y = np.random.random((3, 1)) + np_z = np.random.random((3, 1)) + x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False) + y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False) + z = paddle.to_tensor(np_z, dtype="float64", stop_gradient=False) + out_z = paddle.nn.functional.sigmoid(z) + out = paddle.matmul(x, y) + + try: + # allow_unused is false in default + dx = fluid.dygraph.grad(out, [x, z]) + except ValueError as e: + error_msg = cpt.get_exception_message(e) + assert error_msg.find("allow_unused") > 0 + + def test_simple_example_eager_grad_not_allow_unused(self): + with _test_eager_guard(): + self.func_simple_example_eager_grad_not_allow_unused() + self.func_simple_example_eager_grad_not_allow_unused() + + class TestDygraphDoubleGrad(TestCase): def setUp(self): self.sort_sum_gradient = False @@ -64,7 +141,7 @@ class TestDygraphDoubleGrad(TestCase): allow_unused=allow_unused) @dygraph_guard - def test_exception(self): + def func_exception(self): with self.assertRaises(AssertionError): self.grad(None, None) @@ -93,8 +170,13 @@ class TestDygraphDoubleGrad(TestCase): with self.assertRaises(AssertionError): self.grad([random_var(shape)], [random_var(shape)], no_grad_vars=1) + def test_exception(self): + with _test_eager_guard(): + self.func_exception() + self.func_exception() + @dygraph_guard - def test_simple_example(self): + def func_simple_example(self): x = random_var(self.shape) x.stop_gradient = False y = x + 1 @@ -123,8 +205,44 @@ class TestDygraphDoubleGrad(TestCase): self.assertNotEqual(grad_with_none_and_not_none.stop_gradient, create_graph) + def test_simple_example(self): + with _test_eager_guard(): + self.func_simple_example() + self.func_simple_example() + @dygraph_guard - def test_none_one_initial_gradient(self): + def func_example_no_grad_vars(self): + x = random_var(self.shape) + x_np = x.numpy() + numel = x_np.size + x.stop_gradient = False + + y1 = fluid.layers.relu(x) + y2 = fluid.layers.relu(x) + z = y1 + y2 + w = z * z + + w_mean = fluid.layers.reduce_mean(w) + del y1, z, w + + dx_actual, = self.grad( + [w_mean], [x], create_graph=True, no_grad_vars=[y2]) + + self.assertFalse(y2.stop_gradient) + self.assertFalse(dx_actual.stop_gradient) + + dx_expected = (1.0 / float(numel) * (np.maximum(x_np, 0) + y2.numpy()) * + (x_np > 0) * 2).astype('float32') + + self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) + + def test_example_no_grad_vars(self): + with _test_eager_guard(): + self.func_example_no_grad_vars() + self.func_example_no_grad_vars() + + @dygraph_guard + def func_none_one_initial_gradient(self): numel = 1 for s in self.shape: numel *= s @@ -190,8 +308,13 @@ class TestDygraphDoubleGrad(TestCase): np.array_equal(grad_z.numpy(), original_random_grad_z)) + def test_none_one_initial_gradient(self): + with _test_eager_guard(): + self.func_none_one_initial_gradient() + self.func_none_one_initial_gradient() + @dygraph_guard - def test_example_with_gradient_accumulation_and_create_graph(self): + def func_example_with_gradient_accumulation_and_create_graph(self): x = random_var(self.shape) x_np = x.numpy() numel = x_np.size @@ -214,25 +337,33 @@ class TestDygraphDoubleGrad(TestCase): (x_np > 0) * 2).astype('float32') self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) - loss = 
fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) - loss.backward(retain_graph=True) - - x_grad_actual = x.gradient() - x_grad_expected = (2.0 / float(numel) * - (x_np + dx_expected * - (x_np > 0) * 2 / float(numel))).astype('float32') - self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) - - for i in range(5): + if core._in_eager_mode(): + pass + else: + loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) loss.backward(retain_graph=True) + x_grad_actual = x.gradient() - x_grad_expected = (i + 2) * (2.0 / float(numel) * ( + x_grad_expected = (2.0 / float(numel) * ( x_np + dx_expected * (x_np > 0) * 2 / float(numel))).astype('float32') self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + for i in range(5): + loss.backward(retain_graph=True) + x_grad_actual = x.gradient() + x_grad_expected = (i + 2) * (2.0 / float(numel) * ( + x_np + dx_expected * + (x_np > 0) * 2 / float(numel))).astype('float32') + self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + + def test_example_with_gradient_accumulation_and_create_graph(self): + with _test_eager_guard(): + self.func_example_with_gradient_accumulation_and_create_graph() + self.func_example_with_gradient_accumulation_and_create_graph() + @dygraph_guard - def test_example_with_gradient_accumulation_and_no_grad_vars(self): + def func_example_with_gradient_accumulation_and_no_grad_vars(self): x = random_var(self.shape) x_np = x.numpy() numel = x_np.size @@ -256,17 +387,25 @@ class TestDygraphDoubleGrad(TestCase): (x_np > 0) * 2).astype('float32') self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) - loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) - loss.backward() + if core._in_eager_mode(): + pass + else: + loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) + loss.backward() - x_grad_actual = x.gradient() - x_grad_expected = (2.0 / float(numel) * - (x_np + dx_expected * - (x_np > 0) * 4 / float(numel))).astype('float32') - self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + x_grad_actual = x.gradient() + x_grad_expected = (2.0 / float(numel) * ( + x_np + dx_expected * + (x_np > 0) * 4 / float(numel))).astype('float32') + self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + + def test_example_with_gradient_accumulation_and_no_grad_vars(self): + with _test_eager_guard(): + self.func_example_with_gradient_accumulation_and_no_grad_vars() + self.func_example_with_gradient_accumulation_and_no_grad_vars() @dygraph_guard - def test_example_with_gradient_accumulation_and_not_create_graph(self): + def func_example_with_gradient_accumulation_and_not_create_graph(self): x = random_var(self.shape) x_np = x.numpy() numel = x_np.size @@ -289,12 +428,20 @@ class TestDygraphDoubleGrad(TestCase): self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) - loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) - loss.backward() + if core._in_eager_mode(): + pass + else: + loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) + loss.backward() - x_grad_actual = x.gradient() - x_grad_expected = (2.0 * x_np / float(numel)).astype('float32') - self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + x_grad_actual = x.gradient() + x_grad_expected = (2.0 * x_np / float(numel)).astype('float32') + self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + + def test_example_with_gradient_accumulation_and_not_create_graph(self): + with _test_eager_guard(): + self.func_example_with_gradient_accumulation_and_not_create_graph() + 
self.func_example_with_gradient_accumulation_and_not_create_graph() class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad): @@ -304,7 +451,7 @@ class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad): class TestDygraphDoubleGradVisitedUniq(TestCase): - def test_compare(self): + def func_compare(self): value = np.random.uniform(-0.5, 0.5, 100).reshape(10, 2, 5).astype("float32") @@ -349,6 +496,11 @@ class TestDygraphDoubleGradVisitedUniq(TestCase): self.assertTrue(np.array_equal(grad_1, grad_2)) + def test_compare(self): + with _test_eager_guard(): + self.func_compare() + self.func_compare() + class TestRaiseNoDoubleGradOp(TestCase): def raise_no_grad_op(self): diff --git a/python/paddle/fluid/tests/unittests/test_inplace_eager_fluid.py b/python/paddle/fluid/tests/unittests/test_inplace_eager_fluid.py new file mode 100644 index 0000000000000000000000000000000000000000..a434c56200061b656bc2daa0e66069f09b6949cf --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_inplace_eager_fluid.py @@ -0,0 +1,397 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np + +import paddle +import paddle.fluid.core as core +from paddle.fluid.framework import _test_eager_guard + + +class TestDygraphInplace(unittest.TestCase): + def setUp(self): + self.init_data() + self.set_np_compare_func() + + def init_data(self): + self.input_var_numpy = np.random.uniform(-5, 5, [10, 20, 1]) + self.dtype = "float32" + + def set_np_compare_func(self): + self.np_compare = np.array_equal + + def non_inplace_api_processing(self, var): + return paddle.squeeze(var) + + def inplace_api_processing(self, var): + return paddle.squeeze_(var) + + def test_inplace_api(self): + with _test_eager_guard(): + var = paddle.to_tensor(self.input_var_numpy).astype(self.dtype) + inplace_var = self.inplace_api_processing(var) + self.assertTrue(id(var) == id(inplace_var)) + + inplace_var.exp_() + self.assertTrue(np.array_equal(var.numpy(), inplace_var.numpy())) + + def test_forward_version(self): + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var = paddle.to_tensor(self.input_var_numpy).astype(self.dtype) + self.assertEqual(var.inplace_version, 0) + + inplace_var = self.inplace_api_processing(var) + self.assertEqual(var.inplace_version, 1) + + inplace_var.exp_() + self.assertEqual(var.inplace_version, 2) + + inplace_var = self.inplace_api_processing(inplace_var) + self.assertEqual(var.inplace_version, 3) + + def test_leaf_inplace_var_error(self): + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var = paddle.to_tensor(self.input_var_numpy).astype(self.dtype) + var.stop_gradient = False + + def leaf_inplace_error(): + self.inplace_api_processing(var) + + self.assertRaises(ValueError, leaf_inplace_error) + + def test_backward_error(self): + # It raises an error because the inplace operator will result + # in incorrect gradient computation. 
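+        # Backward compares each saved tensor's current inplace_version with
+        # the snapshot taken when the forward op captured it for gradient
+        # computation. var_b is modified in place after var_c = var_b**2
+        # captured it, so the versions diverge (1 vs. 0) and backward raises
+        # the RuntimeError asserted below.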
+ with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var_a = paddle.to_tensor(self.input_var_numpy).astype( + self.dtype) + var_a.stop_gradient = False + + var_b = var_a**2 + + # Here, the gradient computation will use the value of var_b + var_c = var_b**2 + self.inplace_api_processing(var_b) + + loss = paddle.nn.functional.relu(var_c) + with self.assertRaisesRegexp( + RuntimeError, + "received current_inplace_version:{} != inplace_version_snapshot_:{}". + format(1, 0)): + loss.backward() + + def test_backward_success_1(self): + # var_b is modified inplace before using it, the inplace operator doesn't result + # in incorrect gradient computation. + grad_var_a, grad_var_a_inplace = 0, 1 + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var_a = paddle.to_tensor(self.input_var_numpy).astype( + self.dtype) + var_a.stop_gradient = False + + var_b = var_a**2 + var_c = self.inplace_api_processing( + var_b) # var_b is modified inplace before using it + + # Here, the gradient computation will use the value of var_b + var_d = var_c**2 + loss = var_d.sum() + loss.backward() + grad_var_a_inplace = var_a.grad.numpy() + + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var_a = paddle.to_tensor(self.input_var_numpy).astype( + self.dtype) + var_a.stop_gradient = False + + var_b = var_a**2 + var_c = self.non_inplace_api_processing(var_b) + var_d = var_c**2 + loss = var_d.sum() + loss.backward() + grad_var_a = var_a.grad.numpy() + + self.assertTrue(self.np_compare(grad_var_a_inplace, grad_var_a)) + + def test_backward_success_2(self): + # Although var_b is modified inplace after using it, it does not used in gradient computation. + # The inplace operator doesn't result in incorrect gradient computation. + grad_var_a, grad_var_a_inplace = 0, 1 + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var_a = paddle.to_tensor(self.input_var_numpy).astype( + self.dtype) + var_a.stop_gradient = False + + var_b = var_a**2 + + var_c = self.inplace_api_processing( + var_b) # var_b is modified inplace before using it + + var_d = var_c + var_c # Here, the grad op of sum doesn't use the value of var_b + loss = var_d.sum() + + loss.backward() + grad_var_a_inplace = var_a.grad.numpy() + + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var_a = paddle.to_tensor(self.input_var_numpy).astype( + self.dtype) + var_a.stop_gradient = False + + var_b = var_a**2 + + var_c = self.non_inplace_api_processing( + var_b) # var_b is modified inplace before using it + + var_d = var_c + var_c # Here, the grad op of sum doesn't use the value of var_b + loss = var_d.sum() + + loss.backward() + grad_var_a = var_a.grad.numpy() + self.assertTrue(np.array_equal(grad_var_a_inplace, grad_var_a)) + + +class TestDygraphInplaceUnsqueeze(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return paddle.unsqueeze(var, -1) + + def inplace_api_processing(self, var): + return paddle.unsqueeze_(var, -1) + + +class TestDygraphInplaceReshape(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return paddle.reshape(var, [-1]) + + def inplace_api_processing(self, var): + return paddle.reshape_(var, [-1]) + + +class TestDygraphInplaceFlatten(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return var.flatten() + + def inplace_api_processing(self, var): + return var.flatten_() + + +class TestDygraphInplaceScatter(TestDygraphInplace): + def init_data(self): + self.input_var_numpy = np.array([[1, 1], [2, 2], [3, 3]]) + self.dtype = 
"float32" + + def non_inplace_api_processing(self, var): + index = paddle.to_tensor([2, 1, 0, 1], dtype='int64') + updates = paddle.to_tensor( + [[1, 1], [2, 2], [3, 3], [4, 4]], dtype='float32') + + return paddle.scatter(var, index, updates, overwrite=False) + + def inplace_api_processing(self, var): + index = paddle.to_tensor([2, 1, 0, 1], dtype='int64') + updates = paddle.to_tensor( + [[1, 1], [2, 2], [3, 3], [4, 4]], dtype='float32') + + return paddle.scatter_(var, index, updates, overwrite=False) + + +class TestDygraphInplaceElu(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return paddle.nn.functional.elu(var) + + def inplace_api_processing(self, var): + return paddle.nn.functional.elu_(var) + + +class TestDygraphInplaceRelu(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return paddle.nn.functional.relu(var) + + def inplace_api_processing(self, var): + return paddle.nn.functional.relu_(var) + + +class TestDygraphInplaceSoftmax(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return paddle.nn.functional.softmax(var) + + def inplace_api_processing(self, var): + return paddle.nn.functional.softmax_(var) + + +class TestDygraphInplaceTanh(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return paddle.tanh(var) + + def inplace_api_processing(self, var): + return paddle.tanh_(var) + + +class TestDygraphInplaceCeil(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return var.ceil() + + def inplace_api_processing(self, var): + return var.ceil_() + + +class TestDygraphInplaceFloor(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return var.floor() + + def inplace_api_processing(self, var): + return var.floor_() + + +class TestDygraphInplaceExp(TestDygraphInplace): + def set_np_compare_func(self): + self.np_compare = np.allclose + + def non_inplace_api_processing(self, var): + return var.exp() + + def inplace_api_processing(self, var): + return var.exp_() + + +class TestDygraphInplaceReciprocal(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return var.reciprocal() + + def inplace_api_processing(self, var): + return var.reciprocal_() + + +class TestDygraphInplaceRound(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return var.round() + + def inplace_api_processing(self, var): + return var.round_() + + +class TestDygraphInplaceSqrt(TestDygraphInplace): + def init_data(self): + self.input_var_numpy = np.random.uniform(0, 5, [10, 20, 1]) + self.dtype = "float32" + + def non_inplace_api_processing(self, var): + return var.sqrt() + + def inplace_api_processing(self, var): + return var.sqrt_() + + +class TestDygraphInplaceRsqrt(TestDygraphInplaceSqrt): + def non_inplace_api_processing(self, var): + return var.rsqrt() + + def inplace_api_processing(self, var): + return var.rsqrt_() + + +class TestDygraphInplaceClip(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return var.clip(0.6, 1.5) + + def inplace_api_processing(self, var): + return var.clip_(0.6, 1.5) + + +class TestDygraphInplaceScale(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return var.scale(scale=2.0, bias=3.0) + + def inplace_api_processing(self, var): + return var.scale_(scale=2.0, bias=3.0) + + +class TestDygraphInplaceAdd(TestDygraphInplace): + def init_data(self): + self.input_var_numpy = np.random.rand(2, 3, 4) + self.dtype = "float32" + self.input_var_numpy_2 = np.random.rand(2, 3, 4).astype(self.dtype) + + def 
non_inplace_api_processing(self, var): + input_var_2 = paddle.to_tensor(self.input_var_numpy_2) + return var.add(input_var_2) + + def inplace_api_processing(self, var): + input_var_2 = paddle.to_tensor(self.input_var_numpy_2) + return var.add_(input_var_2) + + +class TestDygraphInplaceSubtract(TestDygraphInplaceAdd): + def non_inplace_api_processing(self, var): + input_var_2 = paddle.to_tensor(self.input_var_numpy_2) + return var.subtract(input_var_2) + + def inplace_api_processing(self, var): + input_var_2 = paddle.to_tensor(self.input_var_numpy_2) + return var.subtract_(input_var_2) + + +class TestLossIsInplaceVar(unittest.TestCase): + def test_loss_is_inplace_var(self): + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var_a = paddle.ones((2, 2)) + var_a.stop_gradient = False + + var_b = var_a * 2 + loss = var_b.tanh_() + + loss.backward() + inplace_grad_var_a = var_a.grad.numpy() + + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var_a = paddle.ones((2, 2)) + var_a.stop_gradient = False + + var_b = var_a * 2 + loss = var_b.tanh() + + loss.backward() + grad_var_a = var_a.grad.numpy() + + self.assertTrue(np.array_equal(inplace_grad_var_a, grad_var_a)) + + +class TestContinuouslyInplace(unittest.TestCase): + def test_continuously_inplace(self): + with _test_eager_guard(): + a = paddle.rand([2, 3]) + a.stop_gradient = False + b = a * 2 + + b.reshape_([-1]) + b.reshape_([2, 3]) + b.reshape_([-1]) + + b.backward() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py index ca9a489c7496f33cb084f1cd43158cebc7a1add6..b75dc2c964ca0b22219de1b33cdbfc3d74c19e45 100644 --- a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py @@ -215,6 +215,8 @@ class TestLayerNormOp(unittest.TestCase): for name in ['x', 'scale', 'bias', 'y@GRAD'] }, fetch_list=fetch_list) + # print(y) + # print(out[0]) self.__assert_close(y, out[0], "y") self.__assert_close(mean, out[1], "mean") self.__assert_close(variance, out[2], "variance", 1e-3) @@ -238,6 +240,7 @@ class TestLayerNormOp(unittest.TestCase): def test_check_forward_backward_with_scale_and_bias(self): self.check_forward_backward(shape=[1, 3, 4, 5], begin_norm_axis=1) + self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1) self.check_forward_backward( shape=[2, 3, 4, 5], @@ -432,4 +435,5 @@ class TestGetSetKeepLayerNormScaleBiasFP32Flag(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py index d0a40f38ba25721b6f285b48d45d7a3ead37bfee..65d0e289f81329561eaec73d10aa639689f0e1d3 100644 --- a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py @@ -542,7 +542,7 @@ class TestComplexMatMulOp(OpTest): 'Out', user_defined_grads=[self.grad_x, self.grad_y], user_defined_grad_outputs=[self.grad_out], - check_eager=False) + check_eager=True) def test_check_grad_ingore_x(self): self.check_grad( @@ -560,7 +560,7 @@ class TestComplexMatMulOp(OpTest): no_grad_set=set('Y'), user_defined_grads=[self.grad_x], user_defined_grad_outputs=[self.grad_out], - check_eager=False) + check_eager=True) class TestComplexMatMulOpBroadcast(OpTest): diff --git a/python/paddle/fluid/tests/unittests/test_number_count_op.py 
b/python/paddle/fluid/tests/unittests/test_number_count_op.py new file mode 100644 index 0000000000000000000000000000000000000000..0df9d2a3a41b44c18b7e008a271c10544ec4dfa0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_number_count_op.py @@ -0,0 +1,80 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import op_test +import numpy as np +import unittest +import paddle +import paddle.fluid.core as core +from paddle.fluid.op import Operator +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard +from paddle.fluid.backward import append_backward +from paddle.distributed.models.moe import utils + + +def count(x, upper_range): + res = np.zeros((upper_range, )).astype(int) + for i in x.reshape(-1): + if i >= 0 and i < len(res): + res[i] += 1 + return res + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestExpertCountOpInt64(op_test.OpTest): + def setUp(self): + expert_num = 16 + self.op_type = "number_count" + x = np.random.randint(-1, expert_num, size=(1000, 2)).astype('int64') + self.inputs = {'gate_idx': x} + self.outputs = {'Out': count(x, expert_num)} + self.attrs = {"upper_range": expert_num} + + def test_forward(self): + self.check_output_with_place(paddle.CUDAPlace(0)) + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestExpertCountAPI(unittest.TestCase): + def setUp(self): + self.upper_range = 320 + self.x = np.random.randint( + -1, self.upper_range, size=(6000, 200)).astype('int64') + self.out = count(self.x, self.upper_range) + self.place = paddle.CUDAPlace(0) + + def test_api_static(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data('x', self.x.shape, dtype="int64") + out = utils._number_count(x, self.upper_range) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'x': self.x}, fetch_list=[out]) + assert np.allclose(res, self.out) + + def test_api_dygraph(self): + paddle.disable_static() + x = paddle.to_tensor(self.x) + out = utils._number_count(x, self.upper_range) + assert np.allclose(out.numpy(), self.out) + + +if __name__ == '__main__': + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py b/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py index 66de1b309797fb53316a46b436e5cccf11410216..fac258192112dbff5353c581ad8e276cc5e375c0 100644 --- a/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py @@ -22,7 +22,8 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.framework as framework -from paddle.fluid.framework import Program, program_guard +from paddle.framework import _in_eager_mode +from paddle.fluid.framework import Program, program_guard, _test_eager_guard class 
TestOneHotOp(OpTest): @@ -45,7 +46,7 @@ class TestOneHotOp(OpTest): self.outputs = {'Out': (out, x_lod)} def test_check_output(self): - self.check_output(check_dygraph=False) + self.check_output() class TestOneHotOp_attr(OpTest): @@ -68,7 +69,7 @@ class TestOneHotOp_attr(OpTest): self.outputs = {'Out': (out, x_lod)} def test_check_output(self): - self.check_output(check_dygraph=False) + self.check_output() class TestOneHotOp_default_dtype(OpTest): @@ -91,7 +92,7 @@ class TestOneHotOp_default_dtype(OpTest): self.outputs = {'Out': (out, x_lod)} def test_check_output(self): - self.check_output(check_dygraph=False) + self.check_output() class TestOneHotOp_default_dtype_attr(OpTest): @@ -114,7 +115,7 @@ class TestOneHotOp_default_dtype_attr(OpTest): self.outputs = {'Out': (out, x_lod)} def test_check_output(self): - self.check_output(check_dygraph=False) + self.check_output() class TestOneHotOp_out_of_range(OpTest): @@ -132,7 +133,7 @@ class TestOneHotOp_out_of_range(OpTest): self.outputs = {'Out': (out, x_lod)} def test_check_output(self): - self.check_output(check_dygraph=False) + self.check_output() class TestOneHotOp_exception(unittest.TestCase): @@ -190,6 +191,12 @@ class TestOneHotOpApi(unittest.TestCase): one_hot_label = fluid.one_hot( input=fluid.dygraph.to_variable(label), depth=depth) + one_hot_label = paddle.nn.functional.one_hot( + fluid.dygraph.to_variable(label), depth) + with _test_eager_guard(): + one_hot_label = paddle.nn.functional.one_hot( + paddle.to_tensor(label), depth) + def _run(self, depth): label = fluid.layers.data(name="label", shape=[1], dtype="int64") one_hot_label = fluid.one_hot(input=label, depth=depth) diff --git a/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py index 2ffe523ef6dda18a24813e702a1892c335ba6a68..531e9663a2b728a2871dff404425b063a0c47e67 100644 --- a/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py +++ b/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -18,6 +18,8 @@ import unittest from unittest import TestCase import numpy as np import paddle +from paddle.fluid.framework import _test_eager_guard +import paddle.fluid.core as core def _dygraph_guard_(func): @@ -62,7 +64,7 @@ class TestDygraphDoubleGrad(TestCase): allow_unused=allow_unused) @dygraph_guard - def test_exception(self): + def func_exception(self): with self.assertRaises(AssertionError): self.grad(None, None) @@ -91,8 +93,13 @@ class TestDygraphDoubleGrad(TestCase): with self.assertRaises(AssertionError): self.grad([random_var(shape)], [random_var(shape)], no_grad_vars=1) + def test_exception(self): + with _test_eager_guard(): + self.func_exception() + self.func_exception() + @dygraph_guard - def test_simple_example(self): + def func_simple_example(self): x = random_var(self.shape) x.stop_gradient = False y = x + 1 @@ -121,8 +128,13 @@ class TestDygraphDoubleGrad(TestCase): self.assertNotEqual(grad_with_none_and_not_none.stop_gradient, create_graph) + def test_simple_example(self): + with _test_eager_guard(): + self.func_simple_example() + self.func_simple_example() + @dygraph_guard - def test_none_one_initial_gradient(self): + def func_none_one_initial_gradient(self): numel = 1 for s in self.shape: numel *= s @@ -188,8 +200,13 @@ class TestDygraphDoubleGrad(TestCase): np.array_equal(grad_z.numpy(), original_random_grad_z)) + def test_none_one_initial_gradient(self): + with _test_eager_guard(): + self.func_none_one_initial_gradient() + self.func_none_one_initial_gradient() + @dygraph_guard - def test_example_with_gradient_accumulation_and_create_graph(self): + def func_example_with_gradient_accumulation_and_create_graph(self): x = random_var(self.shape) x_np = x.numpy() numel = x_np.size @@ -212,17 +229,25 @@ class TestDygraphDoubleGrad(TestCase): (x_np > 0) * 2).astype('float32') self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) - loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) - loss.backward() + if core._in_eager_mode(): + pass + else: + loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) + loss.backward() - x_grad_actual = x.gradient() - x_grad_expected = (2.0 / float(numel) * - (x_np + dx_expected * - (x_np > 0) * 2 / float(numel))).astype('float32') - self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + x_grad_actual = x.gradient() + x_grad_expected = (2.0 / float(numel) * ( + x_np + dx_expected * + (x_np > 0) * 2 / float(numel))).astype('float32') + self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + + def test_example_with_gradient_accumulation_and_create_graph(self): + with _test_eager_guard(): + self.func_example_with_gradient_accumulation_and_create_graph() + self.func_example_with_gradient_accumulation_and_create_graph() @dygraph_guard - def test_example_with_gradient_accumulation_and_no_grad_vars(self): + def func_example_with_gradient_accumulation_and_no_grad_vars(self): x = random_var(self.shape) x_np = x.numpy() numel = x_np.size @@ -246,17 +271,25 @@ class TestDygraphDoubleGrad(TestCase): (x_np > 0) * 2).astype('float32') self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) - loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) - loss.backward() + if core._in_eager_mode(): + pass + else: + loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) + loss.backward() + + x_grad_actual = x.gradient() + x_grad_expected = (2.0 / float(numel) * ( + x_np + dx_expected * + (x_np > 0) * 4 / float(numel))).astype('float32') + self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) - 
x_grad_actual = x.gradient() - x_grad_expected = (2.0 / float(numel) * - (x_np + dx_expected * - (x_np > 0) * 4 / float(numel))).astype('float32') - self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + def test_example_with_gradient_accumulation_and_no_grad_vars(self): + with _test_eager_guard(): + self.func_example_with_gradient_accumulation_and_no_grad_vars() + self.func_example_with_gradient_accumulation_and_no_grad_vars() @dygraph_guard - def test_example_with_gradient_accumulation_and_not_create_graph(self): + def func_example_with_gradient_accumulation_and_not_create_graph(self): x = random_var(self.shape) x_np = x.numpy() numel = x_np.size @@ -279,12 +312,20 @@ class TestDygraphDoubleGrad(TestCase): self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) - loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) - loss.backward() + if core._in_eager_mode(): + pass + else: + loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) + loss.backward() - x_grad_actual = x.gradient() - x_grad_expected = (2.0 * x_np / float(numel)).astype('float32') - self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + x_grad_actual = x.gradient() + x_grad_expected = (2.0 * x_np / float(numel)).astype('float32') + self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + + def test_example_with_gradient_accumulation_and_not_create_graph(self): + with _test_eager_guard(): + self.func_example_with_gradient_accumulation_and_not_create_graph() + self.func_example_with_gradient_accumulation_and_not_create_graph() class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad): diff --git a/python/paddle/fluid/tests/unittests/test_paddle_multiprocessing.py b/python/paddle/fluid/tests/unittests/test_paddle_multiprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..1e31356a6bc81c1684a3620d36b66ed441add40b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_paddle_multiprocessing.py @@ -0,0 +1,199 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import gc +import sys +import unittest +import time +import paddle +import paddle.incubate.multiprocessing as mp + +REPEAT = 20 +HAS_SHM_FILES = os.path.isdir('/dev/shm') + + +def fill_tensor(queue, event): + data = queue.get() + with paddle.no_grad(): + data[0][:] = 5 + data[1][:] = 5 + + event.set() + + +def send_tensor(queue, event, device, dtype): + tensor = paddle.ones([5, 5], dtype=dtype) + queue.put(tensor) + queue.put(tensor) + event.wait() + + +def send_parambase(queue, event, device, dtype): + tensor = paddle.nn.Layer().create_parameter( + [5, 5], + dtype=dtype, + default_initializer=paddle.nn.initializer.Constant(value=1.0)) + queue.put(tensor) + queue.put(tensor) + event.wait() + + +class leak_checker(object): + def __init__(self, test_case): + self.checked_pids = [os.getpid()] + self.test_case = test_case + + def __enter__(self): + self.next_fds = self._get_next_fds(10) + return self + + def __exit__(self, *args): + if args[0] is None: + self.test_case.assertFalse(self.has_shm_files()) + return False + + def check_pid(self, pid): + self.checked_pids.append(pid) + + def _get_next_fds(self, n=1): + fds = [os.dup(0) for i in range(n)] + for fd in fds: + os.close(fd) + return fds + + def has_shm_files(self, wait=True): + if not HAS_SHM_FILES: + return False + result = self._has_shm_files() + if result and wait: + time.sleep(0.5) + return self._has_shm_files() + return result + + def _has_shm_files(self): + gc.collect() + names = ['paddle_' + str(pid) for pid in self.checked_pids] + for filename in os.listdir('/dev/shm'): + for name in names: + if filename.startswith(name): + print("have", filename) + return True + return False + + +class TestMultiprocessingBase(unittest.TestCase): + def get_tensor(self, device="cpu"): + self.device = device.lower() + place = None + tensor = paddle.zeros([5, 5], dtype="float32") + return tensor + + def get_parameter(self): + w = paddle.nn.Layer().create_parameter( + [10, 10], + default_initializer=paddle.nn.initializer.Constant(value=0.0)) + return w + + def _test_empty(self, dtype="float32"): + q = mp.Queue() + empty = paddle.to_tensor([], dtype=dtype) + q.put(empty) + out = q.get(timeout=1) + self.assertEqual(str(out), str(empty)) + + def _test_sharing(self, + ctx=mp, + device='cpu', + dtype="float32", + repeat=1, + param=False): + def test_fill(): + if param: + x = self.get_parameter() + y = (x[:, 1]).detach() + else: + x = self.get_tensor() + y = x[:, 1] + + data = [x, y] + + queue = ctx.Queue() + event = ctx.Event() + queue.put(data) + + process = ctx.Process(target=fill_tensor, args=(queue, event)) + process.daemon = True + lc.check_pid(process.pid) + process.start() + + event.wait(30) + + self.assertTrue(event.is_set()) + self.assertTrue(data[0].equal(5).all()) + self.assertTrue(data[1].equal(5).all()) + + process.join(1 if device != "gpu" else 10) + self.assertFalse(process.is_alive()) + + def test_receive(): + queue = ctx.Queue() + event = ctx.Event() + + process = ctx.Process( + target=send_parambase if param else send_tensor, + args=(queue, event, device, dtype)) + process.daemon = True + lc.check_pid(process.pid) + process.start() + + t1 = queue.get() + t2 = queue.get() + self.assertTrue(t1.equal(1).all()) + del t1, t2 + + event.set() + process.join(1 if device != "gpu" else 10) + self.assertFalse(process.is_alive()) + + with leak_checker(self) as lc: + for _ in range(repeat): + test_fill() + test_receive() + + +class TestMultiprocessingCpu(TestMultiprocessingBase): + def test_pass_tensor(self): + paddle.set_device("cpu") 
+ self._test_sharing(repeat=REPEAT) + + def test_pass_parambase(self): + paddle.set_device("cpu") + self._test_sharing(repeat=1, param=True) + + def test_pass_empty(self): + paddle.set_device("cpu") + self._test_empty() + + +class TestMultiprocessingGpu(TestMultiprocessingBase): + @unittest.skipIf(not paddle.fluid.core.is_compiled_with_cuda(), + "core is not compiled with CUDA") + def test_pass_tensor(self): + paddle.set_device("gpu") + self._test_sharing(mp.get_context("spawn"), "gpu") + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py index edf9aed04f5e0a1df3ceb1ec0add27251e2264a5..2530fc07753e8fac56cedff1a6a9798a42059dcb 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py @@ -200,5 +200,15 @@ class TestDataParallelWithPyLayer(TestMultipleGpus): self.run_mnist_2gpu('parallel_dygraph_dataparallel_with_pylayer.py') +class TestDataParallelInEagerMode(TestMultipleGpus): + def test_multiple_gpus_dynamic(self): + self.run_mnist_2gpu('parallel_dygraph_dataparallel_in_eager_mode.py') + + +class TestGradientCheckInEagerMode(TestMultipleGpus): + def test_multiple_gpus_dynamic(self): + self.run_mnist_2gpu('parallel_dygraph_gradient_check_in_eager_mode.py') + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_randperm_op.py b/python/paddle/fluid/tests/unittests/test_randperm_op.py index 4361a45f1568f5f047ee03090bd3ef28a8d6654f..2380ccb14aaeede7474c99ccbd2d5805849c2118 100644 --- a/python/paddle/fluid/tests/unittests/test_randperm_op.py +++ b/python/paddle/fluid/tests/unittests/test_randperm_op.py @@ -18,6 +18,7 @@ from op_test import OpTest import paddle import paddle.fluid.core as core from paddle.static import program_guard, Program +import os def check_randperm_out(n, data_np): @@ -129,5 +130,81 @@ class TestRandpermImperative(unittest.TestCase): paddle.enable_static() +class TestRandomValue(unittest.TestCase): + def test_fixed_random_number(self): + # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t' + if not paddle.is_compiled_with_cuda(): + return + + if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None): + return + + print("Test Fixed Random number on GPU------>") + paddle.disable_static() + paddle.set_device('gpu') + paddle.seed(2021) + + x = paddle.randperm(30000, dtype='int32').numpy() + expect = [ + 24562, 8409, 9379, 10328, 20503, 18059, 9681, 21883, 11783, 27413 + ] + self.assertTrue(np.array_equal(x[0:10], expect)) + expect = [ + 29477, 27100, 9643, 16637, 8605, 16892, 27767, 2724, 1612, 13096 + ] + self.assertTrue(np.array_equal(x[10000:10010], expect)) + expect = [ + 298, 4104, 16479, 22714, 28684, 7510, 14667, 9950, 15940, 28343 + ] + self.assertTrue(np.array_equal(x[20000:20010], expect)) + + x = paddle.randperm(30000, dtype='int64').numpy() + expect = [ + 6587, 1909, 5525, 23001, 6488, 14981, 14355, 3083, 29561, 8171 + ] + self.assertTrue(np.array_equal(x[0:10], expect)) + expect = [ + 23460, 12394, 22501, 5427, 20185, 9100, 5127, 1651, 25806, 4818 + ] + self.assertTrue(np.array_equal(x[10000:10010], expect)) + expect = [5829, 4508, 16193, 24836, 8526, 242, 9984, 9243, 1977, 11839] + self.assertTrue(np.array_equal(x[20000:20010], expect)) + + x = paddle.randperm(30000, dtype='float32').numpy() + expect = [ + 5154., 10537., 
14362., 29843., 27185., 28399., 27561., 4144., + 22906., 10705. + ] + self.assertTrue(np.array_equal(x[0:10], expect)) + expect = [ + 1958., 18414., 20090., 21910., 22746., 27346., 22347., 3002., 4564., + 26991. + ] + self.assertTrue(np.array_equal(x[10000:10010], expect)) + expect = [ + 25580., 12606., 553., 16387., 29536., 4241., 20946., 16899., 16339., + 4662. + ] + self.assertTrue(np.array_equal(x[20000:20010], expect)) + + x = paddle.randperm(30000, dtype='float64').numpy() + expect = [ + 19051., 2449., 21940., 11121., 282., 7330., 13747., 24321., 21147., + 9163. + ] + self.assertTrue(np.array_equal(x[0:10], expect)) + expect = [ + 15483., 1315., 5723., 20954., 13251., 25539., 5074., 1823., 14945., + 17624. + ] + self.assertTrue(np.array_equal(x[10000:10010], expect)) + expect = [ + 10516., 2552., 29970., 5941., 986., 8007., 24805., 26753., 12202., + 21404. + ] + self.assertTrue(np.array_equal(x[20000:20010], expect)) + paddle.enable_static() + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_run.py b/python/paddle/fluid/tests/unittests/test_run.py new file mode 100644 index 0000000000000000000000000000000000000000..8fe5fb9bb9455aa58d84bda03f9e9e16038a3be0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_run.py @@ -0,0 +1,174 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import subprocess +import sys, os +import json +import shutil + +import random + +from os import listdir +from os.path import isfile, join + +pyname = 'train.py' +colpyfile = '''# train.py for unittest +import os +env = os.environ.copy() +assert "PADDLE_MASTER" in env +assert "PADDLE_GLOBAL_SIZE" in env +assert "PADDLE_LOCAL_SIZE" in env +assert "PADDLE_GLOBAL_RANK" in env +assert "PADDLE_LOCAL_RANK" in env +''' + +pspyfile = '''# train.py for unittest +import os +env = os.environ.copy() +assert "PADDLE_PSERVERS_IP_PORT_LIST" in env +assert "PADDLE_TRAINER_ENDPOINTS" in env +#assert "PADDLE_PSERVER_ENDPOINTS" in env +#assert "PADDLE_TRAINER_ENDPOINTS" in env +#assert "PADDLE_ROLE" in env +#assert "PADDLE_RANK" in env +''' + + +def write_file(name, ct): + with open(name, "w") as f: + f.write(ct) + + +def get_files(pth, prefix): + return [ + f for f in listdir(pth) if isfile(join(pth, f)) and f.startswith(prefix) + ] + + +class Collective_Test(unittest.TestCase): + def setUp(self): + write_file(pyname, colpyfile) + + def pdrun(self, args, env=None): + cmd = [sys.executable.split('/')[-1], "-m", "paddle.distributed.run"] + if args: + cmd.extend(args.split(" ")) + cmd.extend([pyname]) + proc = subprocess.Popen(cmd, env=env) + return proc + + ''' + def test_collective_1(self): + args = "--id test1" + p = self.pdrun(args) + p.wait() + self.assertTrue(p.poll() == 0) + + ''' + + def test_collective_2(self): + if os.path.exists('./log'): + shutil.rmtree('./log') + + args = "--id test2 --devices 0,1,2" + p = self.pdrun(args) + p.wait() + self.assertTrue(p.poll() == 0) + + c = get_files('log', 'test2') + self.assertTrue(len(c) == 4) + + def test_collective_3(self): + if os.path.exists('./log'): + shutil.rmtree('./log') + + port = random.randrange(6000, 8000) + args = "--id test3 --devices 0,1 --master 127.0.0.1:{} --np 2".format( + port) + p1 = self.pdrun(args) + p2 = self.pdrun(args) + p1.wait() + p2.wait() + self.assertTrue(p1.poll() == 0) + self.assertTrue(p2.poll() == 0) + + c = get_files('log', 'test3') + self.assertTrue(len(c) == 6) + + +class PS_Test(unittest.TestCase): + def setUp(self): + write_file(pyname, pspyfile) + + def pdrun(self, args, env=None): + cmd = [sys.executable.split('/')[-1], "-m", "paddle.distributed.run"] + if args: + cmd.extend(args.split(" ")) + cmd.extend([pyname]) + proc = subprocess.Popen(cmd, env=env) + return proc + + ''' + def test_ps_1(self): + args = "--mode ps" + p = self.pdrun(args) + p.wait() + self.assertTrue(p.poll() == 0) + + def test_ps_2(self): + if os.path.exists('./log'): + shutil.rmtree('./log') + + args = "--id ps2 --server_num=2 --trainer_num=2" + p = self.pdrun(args) + p.wait() + self.assertTrue(p.poll() == 0) + + c = get_files('log', 'ps2') + self.assertTrue(len(c) == 5) + ''' + + def test_ps_3(self): + if os.path.exists('./log'): + shutil.rmtree('./log') + + port = random.randrange(6000, 8000) + args = "--id ps3 --master 127.0.0.1:{} --np 2 --server_num=1 --trainer_num=1".format( + port) + p1 = self.pdrun(args) + p2 = self.pdrun(args) + p1.wait() + p2.wait() + self.assertTrue(p1.poll() == 0) + self.assertTrue(p2.poll() == 0) + + c = get_files('log', 'ps3') + self.assertTrue(len(c) == 6) + + def test_ps_4(self): + if os.path.exists('./log'): + shutil.rmtree('./log') + + args = "--id ps4 --servers 127.0.0.1:8900,127.0.0.1:8901 --trainers 127.0.0.1:8902,127.0.0.1:8903" + p1 = self.pdrun(args) + p1.wait() + self.assertTrue(p1.poll() == 0) + + c = get_files('log', 'ps4') + self.assertTrue(len(c) == 5) + + +if __name__ == '__main__': + 
unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py new file mode 100644 index 0000000000000000000000000000000000000000..8284771920e81db10d22f08cc96ecc58c422833d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py @@ -0,0 +1,60 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +import numpy as np +import paddle +from paddle import _C_ops +from paddle.fluid.framework import _test_eager_guard + + +class TestSparseUtils(unittest.TestCase): + def test_to_sparse_coo(self): + with _test_eager_guard(): + x = [[0, 1, 0, 2], [0, 0, 3, 0], [4, 5, 0, 0]] + non_zero_indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]] + non_zero_elements = [1, 2, 3, 4, 5] + dense_x = paddle.to_tensor(x) + #TODO(zhangkaihuo): change to test the corresponding API + out = _C_ops.final_state_to_sparse_coo(dense_x, 2) + print(out) + assert np.array_equal(out.non_zero_indices().numpy(), + non_zero_indices) + assert np.array_equal(out.non_zero_elements().numpy(), + non_zero_elements) + + dense_tensor = _C_ops.final_state_to_dense(out) + assert np.array_equal(dense_tensor.numpy(), x) + + def test_to_sparse_csr(self): + with _test_eager_guard(): + x = [[0, 1, 0, 2], [0, 0, 3, 0], [4, 5, 0, 0]] + non_zero_crows = [0, 2, 3, 5] + non_zero_cols = [1, 3, 2, 0, 1] + non_zero_elements = [1, 2, 3, 4, 5] + dense_x = paddle.to_tensor(x) + out = _C_ops.final_state_to_sparse_csr(dense_x) + print(out) + assert np.array_equal(out.non_zero_crows().numpy(), non_zero_crows) + assert np.array_equal(out.non_zero_cols().numpy(), non_zero_cols) + assert np.array_equal(out.non_zero_elements().numpy(), + non_zero_elements) + + dense_tensor = _C_ops.final_state_to_dense(out) + assert np.array_equal(dense_tensor.numpy(), x) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_variable.py b/python/paddle/fluid/tests/unittests/test_variable.py index a3bfe3864a2493fdcf100a1a86648a159701ec11..beaf361379b94dd28997a6186a58608694a20eca 100644 --- a/python/paddle/fluid/tests/unittests/test_variable.py +++ b/python/paddle/fluid/tests/unittests/test_variable.py @@ -333,7 +333,8 @@ class TestVariable(unittest.TestCase): with self.assertRaises(IndexError): res = x[[True, False, False]] with self.assertRaises(ValueError): - res = x[[False, False]] + with paddle.static.program_guard(prog): + res = x[[False, False]] def test_slice(self): places = [fluid.CPUPlace()] diff --git a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py index d50c0fecdeebc79a98f66037080d1a03d73f3924..66f2e871dac462c8e6e47357e7367755d2fc0cfc 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py @@ -474,5 +474,473 @@ def 
ref_softplus(x, beta=1, threshold=20): return out +# XPU_KP unittests, these ops can be found from xpu_op_kpfirst_list.h +class XPUTestBReluOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'brelu' + self.use_dynamic_create_class = False + + class XPUTestBRelu(TestActivationOPBase): + def set_case(self): + self.op_type = "brelu" + self.dtype = self.in_type + + np.random.seed(1024) + x = np.random.uniform(-5, 10, [10, 12]).astype(self.dtype) + t_min = 1.0 + t_max = 4.0 + # The same with TestAbs + x[np.abs(x - t_min) < 0.005] = t_min + 0.02 + x[np.abs(x - t_max) < 0.005] = t_max + 0.02 + t = np.copy(x) + t[t < t_min] = t_min + t[t > t_max] = t_max + + self.inputs = {'X': x} + self.outputs = {'Out': t} + self.attrs = {'use_xpu': True, 't_min': t_min, 't_max': t_max} + + +support_types = get_xpu_op_support_types('brelu') +for stype in support_types: + create_test_class(globals(), XPUTestBReluOP, stype) + + +class XPUTestCeilOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'ceil' + self.use_dynamic_create_class = False + + class XPUTestCeil(TestActivationOPBase): + def set_case(self): + self.op_type = "ceil" + self.dtype = self.in_type + + np.random.seed(1024) + x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype) + out = np.ceil(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True} + + +support_types = get_xpu_op_support_types('ceil') +for stype in support_types: + create_test_class(globals(), XPUTestCeilOP, stype) + + +class XPUTestCeluOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'celu' + self.use_dynamic_create_class = False + + class XPUTestCelu(TestActivationOPBase): + def set_case(self): + self.op_type = "celu" + self.dtype = self.in_type + + alpha = 1.5 + x = np.random.uniform(-3, 3, [10, 12]).astype(self.dtype) + out = ref_celu(x, alpha) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True, 'alpha': alpha} + + +support_types = get_xpu_op_support_types('celu') +for stype in support_types: + create_test_class(globals(), XPUTestCeluOP, stype) + + +def ref_celu(x, alpha): + out_ref = np.maximum(0, x) + np.minimum(0, alpha * (np.exp(x / alpha) - 1)) + return out_ref.astype(x.dtype) + + +class XPUTestEluOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'elu' + self.use_dynamic_create_class = False + + class XPUTestElu(TestActivationOPBase): + def set_case(self): + self.op_type = "elu" + self.dtype = self.in_type + + alpha = 1. 
+ x = np.random.uniform(-3, 3, [10, 12]).astype(self.dtype) + out = ref_elu(x, alpha) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True, 'alpha': alpha} + + +support_types = get_xpu_op_support_types('elu') +for stype in support_types: + create_test_class(globals(), XPUTestEluOP, stype) + + +def ref_elu(x, alpha): + out_ref = np.where(x > 0, x, alpha * (np.exp(x) - 1)) + return out_ref.astype(x.dtype) + + +class XPUTestFloorOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'floor' + self.use_dynamic_create_class = False + + class XPUTestFloor(TestActivationOPBase): + def set_case(self): + self.op_type = "floor" + self.dtype = self.in_type + + np.random.seed(1024) + x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype) + out = np.floor(x) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True} + + +support_types = get_xpu_op_support_types('floor') +for stype in support_types: + create_test_class(globals(), XPUTestFloorOP, stype) + + +class XPUTestHardShrinkOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'hard_shrink' + self.use_dynamic_create_class = False + + class XPUTestHardShrink(TestActivationOPBase): + def set_case(self): + self.op_type = "hard_shrink" + self.dtype = self.in_type + + threshold = 0.5 + # self.set_attrs() + np.random.seed(1024) + x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype) * 10 + out = ref_hardshrink(x, threshold) + + self.attrs = {'use_xpu': True} + self.inputs = {'X': x} + self.outputs = {'Out': out} + + +support_types = get_xpu_op_support_types('hard_shrink') +for stype in support_types: + create_test_class(globals(), XPUTestHardShrinkOP, stype) + + +def ref_hardshrink(x, threshold): + out = np.copy(x) + out[(out >= -threshold) & (out <= threshold)] = 0 + return out + + +class XPUTestHardSigmoidOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'hard_sigmoid' + self.use_dynamic_create_class = False + + class XPUTestHardSigmoid(TestActivationOPBase): + def set_case(self): + self.op_type = "hard_sigmoid" + self.dtype = self.in_type + self.slope = 0.166666666666667 + self.offset = 0.5 + + x = np.random.uniform(-5, 5, [10, 12]).astype(self.dtype) + lower_threshold = -self.offset / self.slope + upper_threshold = (1. 
- self.offset) / self.slope + + # Same reason as TestAbs + delta = 0.005 + x[np.abs(x - lower_threshold) < delta] = lower_threshold - 0.02 + x[np.abs(x - upper_threshold) < delta] = upper_threshold - 0.02 + + out = ref_hardsigmoid(x, self.slope, self.offset) + + self.attrs = { + 'use_xpu': True, + 'slope': self.slope, + 'offset': self.offset + } + self.inputs = {'X': x} + self.outputs = {'Out': out} + + +support_types = get_xpu_op_support_types('hard_sigmoid') +for stype in support_types: + create_test_class(globals(), XPUTestHardSigmoidOP, stype) + + +def ref_hardsigmoid(x, slope=0.166666666666667, offset=0.5): + return np.maximum(np.minimum(x * slope + offset, 1.), 0.).astype(x.dtype) + + +class XPUTestLog1pOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'log1p' + self.use_dynamic_create_class = False + + class XPUTestLog1p(TestActivationOPBase): + def set_case(self): + self.op_type = "log1p" + self.dtype = self.in_type + + np.random.seed(1024) + x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype) + out = np.log1p(x) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True} + + +support_types = get_xpu_op_support_types('log1p') +for stype in support_types: + create_test_class(globals(), XPUTestLog1pOP, stype) + + +class XPUTestLogsigmoidOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'logsigmoid' + self.use_dynamic_create_class = False + + class XPUTestLogsigmoid(TestActivationOPBase): + def set_case(self): + self.op_type = "logsigmoid" + self.dtype = self.in_type + + np.random.seed(2048) + x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype) + out = np.log(1 / (1 + np.exp(-x))) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True} + + +support_types = get_xpu_op_support_types('logsigmoid') +for stype in support_types: + create_test_class(globals(), XPUTestLogsigmoidOP, stype) + + +class XPUTestRelu6OP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'relu6' + self.use_dynamic_create_class = False + + class XPUTestRelu6(TestActivationOPBase): + def set_case(self): + self.op_type = "relu6" + self.dtype = self.in_type + + np.random.seed(1024) + x = np.random.uniform(-1, 10, [10, 12]).astype(self.dtype) + x[np.abs(x) < 0.005] = 0.02 + out = ref_relu6(x) + + self.attrs = {'use_xpu': True} + self.inputs = {'X': x} + self.outputs = {'Out': out} + + +support_types = get_xpu_op_support_types('relu6') +for stype in support_types: + create_test_class(globals(), XPUTestRelu6OP, stype) + + +def ref_relu6(x, threshold=6.0): + out = np.copy(x) + out[np.abs(x - threshold) < 0.005] = threshold + 0.02 + out = np.minimum(np.maximum(x, 0), threshold) + return out + + +class XPUTestSiluOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'silu' + self.use_dynamic_create_class = False + + class XPUTestSilu(TestActivationOPBase): + def set_case(self): + self.op_type = "silu" + self.dtype = self.in_type + + np.random.seed(1024) + x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype) + out = x / (np.exp(-x) + 1) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True} + + +support_types = get_xpu_op_support_types('silu') +for stype in support_types: + create_test_class(globals(), XPUTestSiluOP, stype) + + +class XPUTestSoftReluOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'soft_relu' + self.use_dynamic_create_class = False + + class XPUTestSoftRelu(TestActivationOPBase): + def set_case(self): + self.op_type = "soft_relu" + self.dtype = 
self.in_type + + np.random.seed(4096) + x = np.random.uniform(-3, 3, [4, 4]).astype(self.dtype) + threshold = 2.0 + # The same reason with TestAbs + x[np.abs(x - threshold) < 0.005] = threshold + 0.02 + x[np.abs(x + threshold) < 0.005] = -threshold - 0.02 + t = np.copy(x) + t[t < -threshold] = -threshold + t[t > threshold] = threshold + out = np.log((np.exp(t) + 1)) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True, 'threshold': threshold} + + +support_types = get_xpu_op_support_types('soft_relu') +for stype in support_types: + create_test_class(globals(), XPUTestSoftReluOP, stype) + + +class XPUTestSoftSignOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'softsign' + self.use_dynamic_create_class = False + + class XPUTestSoftSign(TestActivationOPBase): + def set_case(self): + self.op_type = "softsign" + self.dtype = self.in_type + + np.random.seed(1024) + x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype) + out = ref_softsign(x) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True} + + +support_types = get_xpu_op_support_types('softsign') +for stype in support_types: + create_test_class(globals(), XPUTestSoftSignOP, stype) + + +def ref_softsign(x): + out = np.divide(x, 1 + np.abs(x)) + return out + + +class XPUTestSoftshrinkOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'softshrink' + self.use_dynamic_create_class = False + + class XPUTestSoftshrink(TestActivationOPBase): + def set_case(self): + self.op_type = "softshrink" + self.dtype = self.in_type + + threshold = 0.5 + np.random.seed(1023) + x = np.random.uniform(0.25, 10, [10, 12]).astype(self.dtype) + out = ref_softshrink(x, threshold) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True} + + +support_types = get_xpu_op_support_types('softshrink') +for stype in support_types: + create_test_class(globals(), XPUTestSoftshrinkOP, stype) + + +def ref_softshrink(x, threshold=0.5): + out = np.copy(x) + out = (out < -threshold) * (out + threshold) + (out > threshold) * ( + out - threshold) + return out + + +class XPUTestSwishOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'swish' + self.use_dynamic_create_class = False + + class XPUTestSwish(TestActivationOPBase): + def set_case(self): + self.op_type = "swish" + self.dtype = self.in_type + + np.random.seed(1024) + x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype) + out = ref_swish(x) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True} + + +support_types = get_xpu_op_support_types('swish') +for stype in support_types: + create_test_class(globals(), XPUTestSwishOP, stype) + + +def ref_swish(x): + from scipy.special import expit + out = x * expit(x) + return out + + +class XPUTestThresholdedReluOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'thresholded_relu' + self.use_dynamic_create_class = False + + class XPUTestThresholdedRelu(TestActivationOPBase): + def set_case(self): + self.op_type = "thresholded_relu" + self.dtype = self.in_type + + threshold = 1.0 + np.random.seed(1024) + x = np.random.uniform(-20, 20, [10, 12]).astype(self.dtype) + x[np.abs(x) < 0.005] = 0.02 + out = ref_thresholded_relu(x, threshold) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True} + + +support_types = get_xpu_op_support_types('thresholded_relu') +for stype in support_types: + create_test_class(globals(), XPUTestThresholdedReluOP, stype) + + +def 
ref_thresholded_relu(x, threshold=1.0): + out = (x > threshold) * x + return out + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sequence_conv_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sequence_conv_op_xpu.py index 2ad79dd0cca00585b01065e1ae6fbb34da4970d4..9999217041859f43a26b5cb071a2f4942634de2d 100755 --- a/python/paddle/fluid/tests/unittests/xpu/test_sequence_conv_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_sequence_conv_op_xpu.py @@ -21,6 +21,8 @@ import random import sys sys.path.append("../") from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types +from xpu.get_test_cover_info import XPUOpTestWrapper paddle.enable_static() np.set_printoptions(threshold=np.inf) @@ -73,188 +75,198 @@ def seqconv(x, return np.dot(col, filter) -class TestSeqProject(XPUOpTest): - def setUp(self): - self.init_test_case() - self.op_type = 'sequence_conv' - self.use_xpu = True - - if self.context_length == 1 \ - and self.context_start == 0 \ - and self.padding_trainable: - print("If context_start is 0 " \ - "and context_length is 1," \ - " padding_trainable should be false.") - return - - # one level, batch size - x = np.random.uniform(-6.10907e-05, 0.000104218, - [self.input_size[0], - self.input_size[1]]).astype('float32') - w = np.random.uniform(-3.17068e-05, 0.000159822, [ - self.context_length * self.input_size[1], self.output_represention - ]).astype('float32') - - begin_pad = np.max([0, -self.context_start]) - end_pad = np.max([0, self.context_start + self.context_length - 1]) - total_pad = begin_pad + end_pad - padding_data = np.random.uniform( - 0, 0, [total_pad, self.input_size[1]]).astype('float32') - self.pad_data = padding_data - self.inputs = { - 'X': (x, self.lod), - 'Filter': w, - } - self.inputs_val = ['X', 'Filter'] - self.inputs_val_no_x = ['Filter'] - self.inputs_val_no_f = ['X'] - - if total_pad != 0: - self.inputs['PaddingData'] = padding_data - self.inputs_val = ['X', 'PaddingData', 'Filter'] - self.inputs_val_no_x = ['PaddingData', 'Filter'] - self.inputs_val_no_f = ['PaddingData', 'X'] - - self.attrs = { - 'contextStart': self.context_start, - 'contextLength': self.context_length, - 'paddingTrainable': self.padding_trainable, - 'contextStride': self.context_stride - } - out = seqconv(x, self.lod, w, self.context_length, self.context_start, - self.padding_trainable, self.pad_data) - self.outputs = {'Out': out} - - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad_input(self): - self.check_grad(['X'], 'Out', no_grad_set=set(self.inputs_val_no_x)) - - def test_check_grad_padding_data(self): - if self.padding_trainable: +class XPUTestSequenceConv(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'sequence_conv' + + class TestSeqProject(XPUOpTest): + def setUp(self): + self.init_test_case() + self.op_type = 'sequence_conv' + self.dtype = self.in_type + self.use_xpu = True + + if self.context_length == 1 \ + and self.context_start == 0 \ + and self.padding_trainable: + print("If context_start is 0 " \ + "and context_length is 1," \ + " padding_trainable should be false.") + return + + # one level, batch size + x = np.random.uniform(-6.10907e-05, 0.000104218, + [self.input_size[0], + self.input_size[1]]).astype(self.dtype) + w = np.random.uniform(-3.17068e-05, 0.000159822, [ + self.context_length * self.input_size[1], + self.output_represention + ]).astype(self.dtype) 
+ + begin_pad = np.max([0, -self.context_start]) + end_pad = np.max([0, self.context_start + self.context_length - 1]) + total_pad = begin_pad + end_pad + padding_data = np.random.uniform( + 0, 0, [total_pad, self.input_size[1]]).astype(self.dtype) + self.pad_data = padding_data + self.inputs = { + 'X': (x, self.lod), + 'Filter': w, + } + self.inputs_val = ['X', 'Filter'] + self.inputs_val_no_x = ['Filter'] + self.inputs_val_no_f = ['X'] + + if total_pad != 0: + self.inputs['PaddingData'] = padding_data + self.inputs_val = ['X', 'PaddingData', 'Filter'] + self.inputs_val_no_x = ['PaddingData', 'Filter'] + self.inputs_val_no_f = ['PaddingData', 'X'] + + self.attrs = { + 'contextStart': self.context_start, + 'contextLength': self.context_length, + 'paddingTrainable': self.padding_trainable, + 'contextStride': self.context_stride + } + out = seqconv(x, self.lod, w, self.context_length, + self.context_start, self.padding_trainable, + self.pad_data) + self.outputs = {'Out': out} + + def test_check_output(self): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad_input(self): + self.check_grad(['X'], 'Out', no_grad_set=set(self.inputs_val_no_x)) + + def test_check_grad_padding_data(self): + if self.padding_trainable: + self.check_grad( + ['PaddingData'], 'Out', no_grad_set=set(['X', 'Filter'])) + + def test_check_grad_Filter(self): self.check_grad( - ['PaddingData'], 'Out', no_grad_set=set(['X', 'Filter'])) - - def test_check_grad_Filter(self): - self.check_grad( - ['Filter'], 'Out', no_grad_set=set(self.inputs_val_no_f)) - - def test_check_grad_input_filter(self): - if self.padding_trainable: - self.check_grad( - ['X', 'Filter'], 'Out', no_grad_set=set(['PaddingData'])) - - def test_check_grad_padding_input(self): - if self.padding_trainable: - self.check_grad( - self.inputs_val_no_f, 'Out', no_grad_set=set(['Filter'])) - - def test_check_grad_padding_filter(self): - if self.padding_trainable: - self.check_grad(self.inputs_val_no_x, 'Out', no_grad_set=set(['X'])) - - def init_test_case(self): - self.input_row = 7 - self.input_col = 25 - self.context_start = -2 - self.context_length = 5 - self.padding_trainable = False - self.context_stride = 1 - - self.input_size = [self.input_row, self.input_col] - offset_lod = [[0, 1, self.input_row]] - self.lod = [[]] - # convert from offset-based lod to length-based lod - for i in range(len(offset_lod[0]) - 1): - self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) - self.output_represention = 8 # output feature size - - -class TestSeqProjectCase1(TestSeqProject): - def init_test_case(self): - self.input_row = 11 - self.context_start = -2 - self.context_length = 5 - self.padding_trainable = False - self.context_stride = 1 - - self.input_size = [self.input_row, 50] - offset_lod = [[0, 4, 5, 8, self.input_row]] - self.lod = [[]] - # convert from offset-based lod to length-based lod - for i in range(len(offset_lod[0]) - 1): - self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) - self.output_represention = 8 # output feature size - - -class TestSeqProjectCase2Len0(TestSeqProject): - def init_test_case(self): - self.input_row = 11 - self.context_start = -2 - self.context_length = 5 - self.padding_trainable = False - self.context_stride = 1 - - self.input_size = [self.input_row, 50] - offset_lod = [[0, 0, 4, 5, 5, 8, self.input_row, self.input_row]] - self.lod = [[]] - # convert from offset-based lod to length-based lod - for i in range(len(offset_lod[0]) - 1): - self.lod[0].append(offset_lod[0][i + 1] - 
offset_lod[0][i]) - self.output_represention = 8 # output feature size - - -class TestSeqProjectCase3(TestSeqProject): - def init_test_case(self): - self.input_row = 25 - self.context_start = -2 - self.context_length = 5 - self.padding_trainable = False - self.context_stride = 1 - - self.input_size = [self.input_row, 25] - idx = list(range(self.input_size[0])) - del idx[0] - offset_lod = [[0] + np.sort(random.sample(idx, 8)).tolist() + - [self.input_size[0]]] - self.lod = [[]] - # convert from offset-based lod to length-based lod - for i in range(len(offset_lod[0]) - 1): - self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) - self.output_represention = 8 # output feature size - - -class TestSeqProjectCase4(TestSeqProject): - def init_test_case(self): - self.input_row = 7835 - self.input_col = 128 - self.context_start = -2 - self.context_length = 5 - self.padding_trainable = False - self.context_stride = 1 - - self.input_size = [self.input_row, self.input_col] - offset_lod = [[ - 0, 1, 2, 3, 131, 241, 242, 263, 264, 265, 266, 267, 268, 387, 515, - 516, 644, 645, 772, 794, 922, 923, 924, 944, 945, 1073, 1074, 1202, - 1330, 1458, 1556, 1557, 1558, 1686, 1748, 1876, 1912, 1913, 1914, - 2032, 2066, 2194, 2308, 2309, 2347, 2475, 2476, 2477, 2478, 2606, - 2607, 2735, 2736, 2737, 2738, 2838, 2966, 2967, 2968, 2969, 3097, - 3225, 3353, 3481, 3482, 3520, 3642, 3643, 3754, 3882, 3883, 4010, - 4011, 4012, 4140, 4219, 4228, 4356, 4357, 4415, 4475, 4476, 4604, - 4605, 4606, 4694, 4695, 4808, 4936, 4961, 4962, 5004, 5132, 5260, - 5312, 5440, 5441, 5569, 5570, 5675, 5676, 5750, 5810, 5811, 5939, - 6021, 6149, 6277, 6278, 6364, 6425, 6519, 6647, 6648, 6739, 6867, - 6995, 6996, 7120, 7223, 7244, 7367, 7407, 7408, 7467, 7595, 7699, - 7827, 7835 - ]] - self.lod = [[]] - # convert from offset-based lod to length-based lod - for i in range(len(offset_lod[0]) - 1): - self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) - self.output_represention = 8 # output feature size + ['Filter'], 'Out', no_grad_set=set(self.inputs_val_no_f)) + + def test_check_grad_input_filter(self): + if self.padding_trainable: + self.check_grad( + ['X', 'Filter'], 'Out', no_grad_set=set(['PaddingData'])) + + def test_check_grad_padding_input(self): + if self.padding_trainable: + self.check_grad( + self.inputs_val_no_f, 'Out', no_grad_set=set(['Filter'])) + + def test_check_grad_padding_filter(self): + if self.padding_trainable: + self.check_grad( + self.inputs_val_no_x, 'Out', no_grad_set=set(['X'])) + + def init_test_case(self): + self.input_row = 7 + self.input_col = 25 + self.context_start = -2 + self.context_length = 5 + self.padding_trainable = False + self.context_stride = 1 + + self.input_size = [self.input_row, self.input_col] + offset_lod = [[0, 1, self.input_row]] + self.lod = [[]] + # convert from offset-based lod to length-based lod + for i in range(len(offset_lod[0]) - 1): + self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) + self.output_represention = 8 # output feature size + + class TestSeqProjectCase1(TestSeqProject): + def init_test_case(self): + self.input_row = 11 + self.context_start = -2 + self.context_length = 5 + self.padding_trainable = False + self.context_stride = 1 + + self.input_size = [self.input_row, 50] + offset_lod = [[0, 4, 5, 8, self.input_row]] + self.lod = [[]] + # convert from offset-based lod to length-based lod + for i in range(len(offset_lod[0]) - 1): + self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) + self.output_represention = 8 # output feature size + + 
class TestSeqProjectCase2Len0(TestSeqProject): + def init_test_case(self): + self.input_row = 11 + self.context_start = -2 + self.context_length = 5 + self.padding_trainable = False + self.context_stride = 1 + + self.input_size = [self.input_row, 50] + offset_lod = [[0, 0, 4, 5, 5, 8, self.input_row, self.input_row]] + self.lod = [[]] + # convert from offset-based lod to length-based lod + for i in range(len(offset_lod[0]) - 1): + self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) + self.output_represention = 8 # output feature size + + class TestSeqProjectCase3(TestSeqProject): + def init_test_case(self): + self.input_row = 25 + self.context_start = -2 + self.context_length = 5 + self.padding_trainable = False + self.context_stride = 1 + + self.input_size = [self.input_row, 25] + idx = list(range(self.input_size[0])) + del idx[0] + offset_lod = [[0] + np.sort(random.sample(idx, 8)).tolist() + + [self.input_size[0]]] + self.lod = [[]] + # convert from offset-based lod to length-based lod + for i in range(len(offset_lod[0]) - 1): + self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) + self.output_represention = 8 # output feature size + + class TestSeqProjectCase4(TestSeqProject): + def init_test_case(self): + self.input_row = 7835 + self.input_col = 128 + self.context_start = -2 + self.context_length = 5 + self.padding_trainable = False + self.context_stride = 1 + + self.input_size = [self.input_row, self.input_col] + offset_lod = [[ + 0, 1, 2, 3, 131, 241, 242, 263, 264, 265, 266, 267, 268, 387, + 515, 516, 644, 645, 772, 794, 922, 923, 924, 944, 945, 1073, + 1074, 1202, 1330, 1458, 1556, 1557, 1558, 1686, 1748, 1876, + 1912, 1913, 1914, 2032, 2066, 2194, 2308, 2309, 2347, 2475, + 2476, 2477, 2478, 2606, 2607, 2735, 2736, 2737, 2738, 2838, + 2966, 2967, 2968, 2969, 3097, 3225, 3353, 3481, 3482, 3520, + 3642, 3643, 3754, 3882, 3883, 4010, 4011, 4012, 4140, 4219, + 4228, 4356, 4357, 4415, 4475, 4476, 4604, 4605, 4606, 4694, + 4695, 4808, 4936, 4961, 4962, 5004, 5132, 5260, 5312, 5440, + 5441, 5569, 5570, 5675, 5676, 5750, 5810, 5811, 5939, 6021, + 6149, 6277, 6278, 6364, 6425, 6519, 6647, 6648, 6739, 6867, + 6995, 6996, 7120, 7223, 7244, 7367, 7407, 7408, 7467, 7595, + 7699, 7827, 7835 + ]] + self.lod = [[]] + # convert from offset-based lod to length-based lod + for i in range(len(offset_lod[0]) - 1): + self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) + self.output_represention = 8 # output feature size + + +support_types = get_xpu_op_support_types('sequence_conv') +for stype in support_types: + create_test_class(globals(), XPUTestSequenceConv, stype) class TestSeqConvApi(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py index 8f3578b526e1e5fbfaf2ad27c84bef5134f17d5f..3d7c9959db9ea28ac6f6ecd0050878eee15e6cbd 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py @@ -18,169 +18,174 @@ import sys import unittest sys.path.append("..") from op_test import OpTest +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper paddle.enable_static() # Situation 1: starts(list, no tensor), ends(list, no tensor) # 1.1 without attr(decrease) -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestSliceOp(OpTest): - def setUp(self): - self.op_type = "slice" - 
self.config() - self.inputs = {'Input': self.input} - self.outputs = {'Out': self.out} - self.attrs = { - 'axes': self.axes, - 'starts': self.starts, - 'ends': self.ends, - 'infer_flags': self.infer_flags, - "use_xpu": True - } - - def config(self): - self.input = np.random.random([3, 4, 5, 6]).astype("float32") - self.starts = [1, 0, 2] - self.ends = [3, 3, 4] - self.axes = [0, 1, 2] - self.infer_flags = [1, 1, 1] - self.out = self.input[1:3, 0:3, 2:4, :] - - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad_normal(self): - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['Input'], 'Out') - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestCase1(TestSliceOp): - def config(self): - self.input = np.random.random([3, 4, 5, 6]).astype("float32") - self.starts = [-3, 0, 2] - self.ends = [3, 100, -1] - self.axes = [0, 1, 2] - self.infer_flags = [1, 1, 1] - self.out = self.input[-3:3, 0:100, 2:-1, :] - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestCase2(TestSliceOp): - def config(self): - self.input = np.random.random([3, 4, 5, 6]).astype("float32") - self.starts = [-3, 0, 2] - self.ends = [3, 100, -1] - self.axes = [0, 1, 3] - self.infer_flags = [1, 1, 1] - self.out = self.input[-3:3, 0:100, :, 2:-1] +class XPUTestSliceOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'slice' + self.use_dynamic_create_class = False + + class TestSliceOp(XPUOpTest): + def setUp(self): + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.op_type = "slice" + self.config() + self.inputs = {'Input': self.input} + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags, + "use_xpu": True + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [1, 0, 2] + self.ends = [3, 3, 4] + self.axes = [0, 1, 2] + self.infer_flags = [1, 1, 1] + self.out = self.input[1:3, 0:3, 2:4, :] + + def test_check_grad_normal(self): + if self.dtype == np.float16: + self.check_grad_with_place(self.place, ['Input'], 'Out') + else: + user_defined_grad_outputs = np.random.random( + self.out.shape).astype(self.dtype) + self.check_grad_with_place( + self.place, ['Input'], + 'Out', + user_defined_grad_outputs=user_defined_grad_outputs) + + class TestCase1(TestSliceOp): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [-3, 0, 2] + self.ends = [3, 100, -1] + self.axes = [0, 1, 2] + self.infer_flags = [1, 1, 1] + self.out = self.input[-3:3, 0:100, 2:-1, :] + + class TestCase2(TestSliceOp): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [-3, 0, 2] + self.ends = [3, 100, -1] + self.axes = [0, 1, 3] + self.infer_flags = [1, 1, 1] + self.out = self.input[-3:3, 0:100, :, 2:-1] # 1.2 with attr(decrease) -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestSliceOp_decs_dim(OpTest): - def setUp(self): - self.op_type = "slice" - self.config() - self.inputs = {'Input': self.input} - self.outputs = {'Out': self.out} - self.attrs = { - 'axes': self.axes, - 'starts': self.starts, - 'ends': self.ends, - 'infer_flags': self.infer_flags, - 'decrease_axis': self.decrease_axis, - "use_xpu": True - } - - def config(self): - self.input = np.random.random([3, 4, 5, 
6]).astype("float32") - self.starts = [1, 0, 2] - self.ends = [2, 3, 4] - self.axes = [0, 1, 2] - self.decrease_axis = [0] - self.infer_flags = [1, 1, 1] - self.out = self.input[1, 0:3, 2:4, :] - - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad_normal(self): - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['Input'], 'Out') - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestSliceOp_decs_dim_2(TestSliceOp_decs_dim): - def config(self): - self.input = np.random.random([3, 4, 5, 6]).astype("float32") - self.starts = [1, 0, 2] - self.ends = [2, 1, 4] - self.axes = [0, 1, 2] - self.decrease_axis = [0, 1] - self.infer_flags = [1, 1, 1] - self.out = self.input[1, 0, 2:4, :] - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestSliceOp_decs_dim_3(TestSliceOp_decs_dim): - def config(self): - self.input = np.random.random([3, 4, 5, 6]).astype("float32") - self.starts = [-1, 0, 2] - self.ends = [1000000, 1, 4] - self.axes = [0, 1, 2] - self.decrease_axis = [0, 1] - self.infer_flags = [1, 1, 1] - self.out = self.input[-1, 0, 2:4, :] - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestSliceOp_decs_dim_4(TestSliceOp_decs_dim): - def config(self): - self.input = np.random.random([3, 4, 5, 7]).astype("float32") - self.starts = [0, 1, 2, 3] - self.ends = [1, 2, 3, 4] - self.axes = [0, 1, 2, 3] - self.decrease_axis = [0, 1, 2, 3] - self.infer_flags = [1, 1, 1] - self.out = self.input[0, 1, 2, 3:4] - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestSliceOp_decs_dim_5(TestSliceOp_decs_dim): - def config(self): - self.input = np.random.random([3, 4, 5, 6]).astype("float32") - self.starts = [-1] - self.ends = [1000000] - self.axes = [3] - self.decrease_axis = [3] - self.infer_flags = [1, 1, 1] - self.out = self.input[:, :, :, -1] - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestSliceOp_decs_dim_6(TestSliceOp_decs_dim): - def config(self): - self.input = np.random.random([3, 4, 5, 6]).astype("float32") - self.starts = [0, 1, 2, 3] - self.ends = [1, 2, 3, 4] - self.axes = [0, 1, 2, 3] - self.decrease_axis = [0, 1, 2, 3] - self.infer_flags = [1, 1, 1] - self.out = self.input[0, 1, 2, 3:4] - +class XPUTestSliceOp_decs_dim(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'slice' + self.use_dynamic_create_class = False + + class TestSliceOp_decs_dim(XPUOpTest): + def setUp(self): + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.op_type = "slice" + self.config() + self.inputs = {'Input': self.input} + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags, + 'decrease_axis': self.decrease_axis, + "use_xpu": True + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [1, 0, 2] + self.ends = [2, 3, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0] + self.infer_flags = [1, 1, 1] + self.out = self.input[1, 0:3, 2:4, :] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + if self.dtype == np.float16: + self.check_grad_with_place(self.place, ['Input'], 'Out') + else: + user_defined_grad_outputs = np.random.random( + 
self.out.shape).astype(self.dtype) + self.check_grad_with_place( + self.place, ['Input'], + 'Out', + user_defined_grad_outputs=user_defined_grad_outputs) + + class TestSliceOp_decs_dim_2(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [1, 0, 2] + self.ends = [2, 1, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0, 1] + self.infer_flags = [1, 1, 1] + self.out = self.input[1, 0, 2:4, :] + + class TestSliceOp_decs_dim_3(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [-1, 0, 2] + self.ends = [1000000, 1, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0, 1] + self.infer_flags = [1, 1, 1] + self.out = self.input[-1, 0, 2:4, :] + + class TestSliceOp_decs_dim_4(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 7]).astype(self.dtype) + self.starts = [0, 1, 2, 3] + self.ends = [1, 2, 3, 4] + self.axes = [0, 1, 2, 3] + self.decrease_axis = [0, 1, 2, 3] + self.infer_flags = [1, 1, 1] + self.out = self.input[0, 1, 2, 3:4] + + class TestSliceOp_decs_dim_5(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [-1] + self.ends = [1000000] + self.axes = [3] + self.decrease_axis = [3] + self.infer_flags = [1, 1, 1] + self.out = self.input[:, :, :, -1] + + class TestSliceOp_decs_dim_6(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [0, 1, 2, 3] + self.ends = [1, 2, 3, 4] + self.axes = [0, 1, 2, 3] + self.decrease_axis = [0, 1, 2, 3] + self.infer_flags = [1, 1, 1] + self.out = self.input[0, 1, 2, 3:4] + + +support_types = get_xpu_op_support_types('slice') +for stype in support_types: + create_test_class(globals(), XPUTestSliceOp, stype) + create_test_class(globals(), XPUTestSliceOp_decs_dim, stype) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py index d010e1633578ed6f4a237dbda2641b1b563633ee..cd18bd63a88f7c4366470d4d0854f4951e1ba46d 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py @@ -24,221 +24,158 @@ import paddle import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard from paddle.fluid import core +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper paddle.enable_static() np.random.seed(10) #Situation 1: repeat_times is a list (without tensor) -class TestTileOpRank1(XPUOpTest): - def setUp(self): - self.set_xpu() - self.place = paddle.XPUPlace(0) - self.op_type = "tile" - self.init_data() - - self.inputs = {'X': np.random.random(self.ori_shape).astype("float32")} - self.attrs = {'repeat_times': self.repeat_times} - output = np.tile(self.inputs['X'], self.repeat_times) - self.outputs = {'Out': output} - - def set_xpu(self): - self.__class__.use_xpu = True - - def init_data(self): - self.ori_shape = [100] - self.repeat_times = [2] - - def test_check_output(self): - self.check_output_with_place(self.place) - - def test_check_grad(self): - pass - - -#with dimension expanding -class TestTileOpRank2Expanding(TestTileOpRank1): - def init_data(self): - self.ori_shape = [120] - self.repeat_times = [2, 2] - - -class TestTileOpRank2(TestTileOpRank1): - def init_data(self): - self.ori_shape 
= [12, 14] - self.repeat_times = [2, 3] - - -class TestTileOpRank3_Corner(TestTileOpRank1): - def init_data(self): - self.ori_shape = (2, 10, 5) - self.repeat_times = (1, 1, 1) - - -class TestTileOpRank3_Corner2(TestTileOpRank1): - def init_data(self): - self.ori_shape = (2, 10, 5) - self.repeat_times = (2, 2) - - -class TestTileOpRank3(TestTileOpRank1): - def init_data(self): - self.ori_shape = (2, 4, 15) - self.repeat_times = (2, 1, 4) - - -class TestTileOpRank4(TestTileOpRank1): - def init_data(self): - self.ori_shape = (2, 4, 5, 7) - self.repeat_times = (3, 2, 1, 2) +class XPUTestTileOpRank1(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'tile' + self.use_dynamic_create_class = False + + class TestTileOpRank1(XPUOpTest): + def setUp(self): + self.dtype = self.in_type + self.__class__.no_need_check_grad = True + self.place = paddle.XPUPlace(0) + self.op_type = "tile" + self.init_data() + self.inputs = { + 'X': np.random.random(self.ori_shape).astype(self.dtype) + } + self.attrs = {'repeat_times': self.repeat_times} + output = np.tile(self.inputs['X'], self.repeat_times) + self.outputs = {'Out': output} + + def init_data(self): + self.ori_shape = [100] + self.repeat_times = [2] + + def test_check_output(self): + self.check_output_with_place(self.place) + + #with dimension expanding + class TestTileOpRank2Expanding(TestTileOpRank1): + def init_data(self): + self.ori_shape = [120] + self.repeat_times = [2, 2] + + class TestTileOpRank2(TestTileOpRank1): + def init_data(self): + self.ori_shape = [12, 14] + self.repeat_times = [2, 3] + + class TestTileOpRank3_Corner(TestTileOpRank1): + def init_data(self): + self.ori_shape = (2, 10, 5) + self.repeat_times = (1, 1, 1) + + class TestTileOpRank3_Corner2(TestTileOpRank1): + def init_data(self): + self.ori_shape = (2, 10, 5) + self.repeat_times = (2, 2) + + class TestTileOpRank3(TestTileOpRank1): + def init_data(self): + self.ori_shape = (2, 4, 15) + self.repeat_times = (2, 1, 4) + + class TestTileOpRank4(TestTileOpRank1): + def init_data(self): + self.ori_shape = (2, 4, 5, 7) + self.repeat_times = (3, 2, 1, 2) # Situation 2: repeat_times is a list (with tensor) -class TestTileOpRank1_tensor_attr(XPUOpTest): - def setUp(self): - self.set_xpu() - self.place = paddle.XPUPlace(0) - self.op_type = "tile" - self.init_data() - repeat_times_tensor = [] - for index, ele in enumerate(self.repeat_times): - repeat_times_tensor.append(("x" + str(index), np.ones( - (1)).astype('int32') * ele)) - - self.inputs = { - 'X': np.random.random(self.ori_shape).astype("float32"), - 'repeat_times_tensor': repeat_times_tensor, - } - self.attrs = {"repeat_times": self.infer_repeat_times} - output = np.tile(self.inputs['X'], self.repeat_times) - self.outputs = {'Out': output} - - def set_xpu(self): - self.__class__.use_xpu = True - - def init_data(self): - self.ori_shape = [100] - self.repeat_times = [2] - self.infer_repeat_times = [-1] - - def test_check_output(self): - self.check_output_with_place(self.place) - - def test_check_grad(self): - pass - - -class TestTileOpRank2_Corner_tensor_attr(TestTileOpRank1_tensor_attr): - def init_data(self): - self.ori_shape = [12, 14] - self.repeat_times = [1, 1] - self.infer_repeat_times = [1, -1] - - -class TestTileOpRank2_attr_tensor(TestTileOpRank1_tensor_attr): - def init_data(self): - self.ori_shape = [12, 14] - self.repeat_times = [2, 3] - self.infer_repeat_times = [-1, 3] +class XPUTestTileOpRank1_tensor_attr(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'tile' + self.use_dynamic_create_class = False + + 
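The expected outputs in the tile tests above and below are computed directly with np.tile. A minimal standalone check of the rank-1 case exercised by TestTileOpRank1 (NumPy only; shapes follow the init_data defaults in this file):

import numpy as np

# rank-1 input, repeat_times = [2], as in TestTileOpRank1.init_data
x = np.random.random([100]).astype('float32')
out = np.tile(x, [2])
assert out.shape == (200, )
np.testing.assert_allclose(out[:100], x)
np.testing.assert_allclose(out[100:], x)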
class TestTileOpRank1_tensor_attr(XPUOpTest): + def setUp(self): + self.dtype = self.in_type + self.__class__.no_need_check_grad = True + self.place = paddle.XPUPlace(0) + self.op_type = "tile" + self.init_data() + repeat_times_tensor = [] + for index, ele in enumerate(self.repeat_times): + repeat_times_tensor.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + + self.inputs = { + 'X': np.random.random(self.ori_shape).astype(self.dtype), + 'repeat_times_tensor': repeat_times_tensor, + } + self.attrs = {"repeat_times": self.infer_repeat_times} + output = np.tile(self.inputs['X'], self.repeat_times) + self.outputs = {'Out': output} + + def init_data(self): + self.ori_shape = [100] + self.repeat_times = [2] + self.infer_repeat_times = [-1] + + def test_check_output(self): + self.check_output_with_place(self.place) + + class TestTileOpRank2_Corner_tensor_attr(TestTileOpRank1_tensor_attr): + def init_data(self): + self.ori_shape = [12, 14] + self.repeat_times = [1, 1] + self.infer_repeat_times = [1, -1] + + class TestTileOpRank2_attr_tensor(TestTileOpRank1_tensor_attr): + def init_data(self): + self.ori_shape = [12, 14] + self.repeat_times = [2, 3] + self.infer_repeat_times = [-1, 3] # Situation 3: repeat_times is a tensor -class TestTileOpRank1_tensor(XPUOpTest): - def setUp(self): - self.set_xpu() - self.place = paddle.XPUPlace(0) - self.op_type = "tile" - self.init_data() - - self.inputs = { - 'X': np.random.random(self.ori_shape).astype("float32"), - 'RepeatTimes': np.array(self.repeat_times).astype("int32"), - } - self.attrs = {} - output = np.tile(self.inputs['X'], self.repeat_times) - self.outputs = {'Out': output} - - def set_xpu(self): - self.__class__.use_xpu = True - - def init_data(self): - self.ori_shape = [100] - self.repeat_times = [2] - - def test_check_output(self): - self.check_output_with_place(self.place) - - def test_check_grad(self): - pass - - -class TestTileOpRank2_tensor(TestTileOpRank1_tensor): - def init_data(self): - self.ori_shape = [12, 14] - self.repeat_times = [2, 3] - - -# Situation 4: input x is Integer -class TestTileOpInteger(XPUOpTest): - def setUp(self): - self.set_xpu() - self.place = paddle.XPUPlace(0) - self.op_type = "tile" - self.inputs = { - 'X': np.random.randint( - 10, size=(4, 4, 5)).astype("int32") - } - self.attrs = {'repeat_times': [2, 1, 4]} - output = np.tile(self.inputs['X'], (2, 1, 4)) - self.outputs = {'Out': output} - - def set_xpu(self): - self.__class__.use_xpu = True - - def test_check_output(self): - self.check_output_with_place(self.place) - - -# Situation 5: input x is Integer -class TestTileOpInt64_t(XPUOpTest): - def setUp(self): - self.set_xpu() - self.place = paddle.XPUPlace(0) - self.op_type = "tile" - self.inputs = { - 'X': np.random.randint( - 10, size=(2, 4, 5)).astype("int64") - } - self.attrs = {'repeat_times': [2, 1, 4]} - output = np.tile(self.inputs['X'], (2, 1, 4)) - self.outputs = {'Out': output} - - def set_xpu(self): - self.__class__.use_xpu = True - - def test_check_output(self): - self.check_output_with_place(self.place) - - -# Situation 6: input x is Bool -class TestTileOpBool(XPUOpTest): - def setUp(self): - self.set_xpu() - self.place = paddle.XPUPlace(0) - self.op_type = "tile" - self.inputs = { - 'X': np.random.randint( - 10, size=(2, 4, 5)).astype("bool") - } - self.attrs = {'repeat_times': [2, 1, 4]} - output = np.tile(self.inputs['X'], (2, 1, 4)) - self.outputs = {'Out': output} - - def set_xpu(self): - self.__class__.use_xpu = True - - def test_check_output(self): - 
self.check_output_with_place(self.place) +class XPUTestTileOpRank1_tensor(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'tile' + self.use_dynamic_create_class = False + + class TestTileOpRank1_tensor(XPUOpTest): + def setUp(self): + self.dtype = self.in_type + self.__class__.no_need_check_grad = True + self.place = paddle.XPUPlace(0) + self.op_type = "tile" + self.init_data() + + self.inputs = { + 'X': np.random.random(self.ori_shape).astype(self.dtype), + 'RepeatTimes': np.array(self.repeat_times).astype("int32"), + } + self.attrs = {} + output = np.tile(self.inputs['X'], self.repeat_times) + self.outputs = {'Out': output} + + def init_data(self): + self.ori_shape = [100] + self.repeat_times = [2] + + def test_check_output(self): + self.check_output_with_place(self.place) + + class TestTileOpRank2_tensor(TestTileOpRank1_tensor): + def init_data(self): + self.ori_shape = [12, 14] + self.repeat_times = [2, 3] + + +support_types = get_xpu_op_support_types('tile') +for stype in support_types: + create_test_class(globals(), XPUTestTileOpRank1, stype) + create_test_class(globals(), XPUTestTileOpRank1_tensor_attr, stype) + create_test_class(globals(), XPUTestTileOpRank1_tensor, stype) # Test python API diff --git a/python/paddle/incubate/multiprocessing/__init__.py b/python/paddle/incubate/multiprocessing/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..27c23be3a89411da702e3071fec4c99186fca4b9 --- /dev/null +++ b/python/paddle/incubate/multiprocessing/__init__.py @@ -0,0 +1,27 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .reductions import init_reductions +import multiprocessing + +__all__ = [] + +from multiprocessing import * # noqa: F403 + +__all__ += multiprocessing.__all__ # type: ignore[attr-defined] + +# Only support linux for now +# Only support file_system sharing strategy. + +init_reductions() diff --git a/python/paddle/incubate/multiprocessing/reductions.py b/python/paddle/incubate/multiprocessing/reductions.py new file mode 100644 index 0000000000000000000000000000000000000000..cfbc55afd3bca87aa279c7aa251aa23671b1a317 --- /dev/null +++ b/python/paddle/incubate/multiprocessing/reductions.py @@ -0,0 +1,189 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
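The reductions module introduced below registers ForkingPickler reducers so Paddle tensors can be sent across processes. A minimal usage sketch, assuming the package is imported as paddle.incubate.multiprocessing (which re-exports the stdlib multiprocessing API) and a CPU tensor is shared; everything other than the imports shown in this diff is illustrative:

import paddle
import paddle.incubate.multiprocessing as mp

def consumer(queue):
    # The tensor is rebuilt in the child process by the registered reducer.
    t = queue.get()
    print(t.shape, float(t.sum()))

if __name__ == '__main__':
    shared = paddle.to_tensor([1.0, 2.0, 3.0])
    q = mp.Queue()      # re-exported from the stdlib multiprocessing
    q.put(shared)       # reduce_tensor / reduce_lodtensor handle the pickling
    p = mp.Process(target=consumer, args=(q, ))
    p.start()
    p.join()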
+ +import paddle + +# TODO: check the hooks of tensor +# TODO: check serializing named tensor +# TODO: check influence on autograd +import os +import sys +import warnings +import math +import copy +import threading +import multiprocessing +from multiprocessing.util import register_after_fork +from multiprocessing.reduction import ForkingPickler + +from collections import OrderedDict + + +def _supported_check(): + if sys.platform != "linux": + # warnings.warn("`paddle.multiprocessing` only support linux for now, " + # " import this will not take any effect !") + + return False + + if not sys.version_info >= (3, 4): + warnings.warn("Use `paddle.multiprocessing` to share paddle tensor " + "requires python version greater than 3.4 ." + " `paddle.multiprocessing` will not take any effect !!!") + return False + + return True + + +class LRUSharedCache(OrderedDict): + def __init__(self): + self.limit = 128 + self._after_fork() + register_after_fork(self, LRUSharedCache._after_fork) + + def _after_fork(self): + self.lock = threading.Lock() + + def get(self, key): + with self.lock: + try: + value = super().pop(key) + super().__setitem__(key, value) + return value + except KeyError: + return None + + def __setitem__(self, key, value): + with self.lock: + try: + super().__delitem__(key) + except KeyError: + if len(self) >= self.limit: + super().popitem(last=False) + super().__setitem__(key, value) + + +shared_cache = LRUSharedCache() + + +def cuda_from_cache(key): + lodtensor = shared_cache.get(key) + if lodtensor is None: + return None + return lodtensor + + +def rebuild_tensor(cls, lodtensor, metadata): + if cls == paddle.fluid.framework.ParamBase: + tensor = paddle.fluid.framework.ParamBase(lodtensor.shape(), + lodtensor._dtype(), + **metadata) + tensor.value().get_tensor()._share_data_with(lodtensor) + else: + size, stop_gradient = metadata + tensor = paddle.fluid.core.VarBase() + if lodtensor._is_initialized(): + tensor.value().get_tensor()._share_data_with(lodtensor) + else: + tensor = paddle.to_tensor([], dtype=lodtensor._dtype()) + tensor.stop_gradient = stop_gradient + return tensor + + +def reduce_tensor(tensor): + lodtensor = tensor.value().get_tensor() + + if not tensor.stop_gradient and not tensor.is_leaf: + raise RuntimeError( + "Refusing to serialize non-leaf tensor which not stop_gradient, you can detach it!" + ) + # TODO: add serializing name and hooks check + if tensor.place.is_cpu_place() or tensor.place.is_gpu_place( + ) or tensor.place.is_cuda_pinned_place(): + if type(tensor) == paddle.fluid.framework.ParamBase: + metadata = copy.deepcopy(tensor.__dict__) + else: + metadata = (tensor.size, tensor.stop_gradient) + + return (rebuild_tensor, (type(tensor), lodtensor, metadata)) + else: + raise ValueError( + "Only support tensors of CPU/CUDA/CUDAPinned Place, Not support %s for now!" + % tensor.place) + + +def rebuild_lodtensor_filename(cls, ipc_name, size, type_idx, dims, lod): + lodtensor = cls._new_shared_filename((ipc_name, size, type_idx, dims, lod)) + lodtensor._shared_decref() + return lodtensor + + +def rebuild_cuda_tensor(cls, handle, offset_bytes, size, type_idx, dims, lod, + device_idx): + cache_tensor = cuda_from_cache((handle, offset_bytes)) + if cache_tensor is None: + lodtensor = cls._new_shared_cuda( + (handle, offset_bytes, size, type_idx, dims, lod, device_idx)) + # We only cache cuda shared tensor here. + # The opening cost of cudaIpcMemoryHandle is very high. 
+ # Since we cache the received tensor directly, + # The sender may reallocate the tensor space, + # so you should manually maintain the lifecycle of the ipc tensor + shared_cache[(handle, offset_bytes)] = lodtensor + else: + lodtensor = paddle.fluid.core.LoDTensor() + lodtensor._share_buffer_with(cache_tensor, + (size, type_idx, dims, lod, device_idx)) + + return lodtensor + + + def rebuild_lodtensor_empty(cls): + #TODO: check if tensor initialized + #TODO: handle the dtype of empty tensor + return cls() + + + def reduce_lodtensor(lodtensor): + if lodtensor._place().is_cpu_place() or lodtensor._place( + ).is_cuda_pinned_place(): + for dim in lodtensor.shape(): + if dim == 0: + # Empty tensors have nothing to be mmapped. + return (rebuild_lodtensor_empty, (type(lodtensor), )) + + # Default to the share filename strategy + metadata = lodtensor._share_filename( + ) # ipc_name, size, type_idx, dims, lod + rebuild = rebuild_lodtensor_filename + lodtensor._shared_incref() + # TODO, maintain reference for lodtensor + # TODO: support the file_descriptor strategy + elif lodtensor._place().is_gpu_place(): + metadata = lodtensor._share_cuda() + rebuild = rebuild_cuda_tensor + else: + raise RuntimeError("We only support passing CPU/GPU LoDTensor for now!") + + return (rebuild, (type(lodtensor), ) + metadata) + + + def init_reductions(): + if not _supported_check(): + return + + ForkingPickler.register(paddle.Tensor, reduce_tensor) + ForkingPickler.register(paddle.fluid.core.VarBase, reduce_tensor) + ForkingPickler.register(paddle.fluid.framework.ParamBase, reduce_tensor) + ForkingPickler.register(paddle.fluid.core.LoDTensor, reduce_lodtensor) diff --git a/python/paddle/incubate/tensor/math.py b/python/paddle/incubate/tensor/math.py index 9f577d5ff38024fe9264deec2980ff091996a1d8..2d0b079ee9280e2dd0cc0e62c6c5932565ba9dfd 100644 --- a/python/paddle/incubate/tensor/math.py +++ b/python/paddle/incubate/tensor/math.py @@ -29,7 +29,7 @@ def segment_sum(data, segment_ids, name=None): where sum is over j such that `segment_ids[j] == i`. Args: - data (Tensor): A tensor, available data type float32, float64. + data (Tensor): A tensor, available data type float32, float64, int32, int64. segment_ids (Tensor): A 1-D tensor, which have the same size with the first dimension of input data. Available data type is int32, int64. @@ -54,7 +54,8 @@ def segment_sum(data, segment_ids, name=None): out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "SUM") return out - check_variable_and_dtype(data, "X", ("float32", "float64"), "segment_pool") + check_variable_and_dtype(data, "X", ("float32", "float64", "int32", + "int64"), "segment_pool") check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"), "segment_pool") @@ -82,7 +83,7 @@ def segment_mean(data, segment_ids, name=None): of all index 'segment_ids[j] == i'. Args: - data (tensor): a tensor, available data type float32, float64. + data (tensor): a tensor, available data type float32, float64, int32, int64. segment_ids (tensor): a 1-d tensor, which have the same size with the first dimension of input data. available data type is int32, int64.
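With the integer dtypes admitted above, segment_sum now also accepts int32/int64 data. A short worked example of the semantics described in the docstring, assuming the public entry point paddle.incubate.segment_sum exported from this module:

import paddle

data = paddle.to_tensor([[1, 2, 3], [3, 2, 1], [4, 5, 6]], dtype='int64')
segment_ids = paddle.to_tensor([0, 0, 1], dtype='int64')
out = paddle.incubate.segment_sum(data, segment_ids)
# Rows 0 and 1 share segment id 0 and are summed elementwise:
# out -> [[4, 4, 4], [4, 5, 6]]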
@@ -107,7 +108,8 @@ def segment_mean(data, segment_ids, name=None): out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "MEAN") return out - check_variable_and_dtype(data, "X", ("float32", "float64"), "segment_pool") + check_variable_and_dtype(data, "X", ("float32", "float64", "int32", + "int64"), "segment_pool") check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"), "segment_pool") @@ -134,7 +136,7 @@ def segment_min(data, segment_ids, name=None): where min is over j such that `segment_ids[j] == i`. Args: - data (tensor): a tensor, available data type float32, float64. + data (tensor): a tensor, available data type float32, float64, int32, int64. segment_ids (tensor): a 1-d tensor, which have the same size with the first dimension of input data. available data type is int32, int64. @@ -159,7 +161,8 @@ def segment_min(data, segment_ids, name=None): out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "MIN") return out - check_variable_and_dtype(data, "X", ("float32", "float64"), "segment_pool") + check_variable_and_dtype(data, "X", ("float32", "float64", "int32", + "int64"), "segment_pool") check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"), "segment_pool") @@ -186,7 +189,7 @@ def segment_max(data, segment_ids, name=None): where max is over j such that `segment_ids[j] == i`. Args: - data (tensor): a tensor, available data type float32, float64. + data (tensor): a tensor, available data type float32, float64, int32, int64. segment_ids (tensor): a 1-d tensor, which have the same size with the first dimension of input data. available data type is int32, int64. @@ -211,7 +214,8 @@ def segment_max(data, segment_ids, name=None): out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "MAX") return out - check_variable_and_dtype(data, "X", ("float32", "float64"), "segment_pool") + check_variable_and_dtype(data, "X", ("float32", "float64", "int32", + "int64"), "segment_pool") check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"), "segment_pool") diff --git a/python/paddle/nn/functional/input.py b/python/paddle/nn/functional/input.py index de8a7ff6d3c7b6cd87d6301f2cd0bb7af119a74d..4c30ed03735f26b6df77c6a8f5b32391972738e5 100644 --- a/python/paddle/nn/functional/input.py +++ b/python/paddle/nn/functional/input.py @@ -19,6 +19,7 @@ from ...fluid.layer_helper import LayerHelper from ...fluid.data_feeder import check_variable_and_dtype, check_dtype from paddle import _C_ops from paddle import in_dynamic_mode +from paddle.framework import _in_eager_mode __all__ = [] @@ -87,6 +88,8 @@ def one_hot(x, num_classes, name=None): """ if in_dynamic_mode(): + if _in_eager_mode(): + return _C_ops.final_state_one_hot(x, num_classes) return _C_ops.one_hot_v2(x, 'depth', num_classes, 'allow_out_of_range', False) else: diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 47dc02705f80bee3ce614846a82c7e44140247b1..96f35eb9d27ec86baa9a7311a4a85a217a7499b8 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -42,6 +42,7 @@ from .. 
import compat as cpt from .lr import LRScheduler import copy from paddle import _C_ops +from paddle.fluid.framework import _in_eager_mode __all__ = [] @@ -1108,7 +1109,13 @@ class Optimizer(object): for p in param_group['params']: if not p.stop_gradient: param_list.append(p) - core.clear_gradients(param_list, set_to_zero) + + if _in_eager_mode(): + for p in param_list: + clear_func = p._zero_grads if set_to_zero else p.clear_gradient + clear_func() + else: + core.clear_gradients(param_list, set_to_zero) @imperative_base.no_grad def minimize(self, diff --git a/python/paddle/optimizer/sgd.py b/python/paddle/optimizer/sgd.py index 5167c18de179dabc4b25bf077d6a81b6ef0b8bf6..6c575b4b997d661d8be79c4e0b457c6a2f34795c 100644 --- a/python/paddle/optimizer/sgd.py +++ b/python/paddle/optimizer/sgd.py @@ -59,16 +59,14 @@ class SGD(Optimizer): .. code-block:: python import paddle - import numpy as np - inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + + inp = paddle.uniform(min=-0.1, max=0.1, shape=[10, 10], dtype='float32') linear = paddle.nn.Linear(10, 10) inp = paddle.to_tensor(inp) out = linear(inp) loss = paddle.mean(out) - beta1 = paddle.to_tensor([0.9], dtype="float32") - beta2 = paddle.to_tensor([0.99], dtype="float32") sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), weight_decay=0.01) - back = out.backward() + out.backward() sgd.step() sgd.clear_grad() diff --git a/python/paddle/static/input.py b/python/paddle/static/input.py index f06c45cc369737403025ed264815a98b81acc6da..7c0c71951aa1d7a566cabf73ecb9d26e03b8dab6 100644 --- a/python/paddle/static/input.py +++ b/python/paddle/static/input.py @@ -193,7 +193,7 @@ class InputSpec(object): print(x_spec) # InputSpec(shape=(2, 2), dtype=VarType.FP32, name=x) """ - if isinstance(tensor, (Variable, core.VarBase)): + if isinstance(tensor, (Variable, core.VarBase, core.eager.Tensor)): return cls(tensor.shape, tensor.dtype, name or tensor.name) else: raise ValueError( diff --git a/python/paddle/tensor/einsum.py b/python/paddle/tensor/einsum.py index 040480c26faa8fca3e9e08cf0b69cc6cdaaeedfc..06c2a82fd696d8f84f6ce1b38fad95834d10d3a0 100644 --- a/python/paddle/tensor/einsum.py +++ b/python/paddle/tensor/einsum.py @@ -13,9 +13,10 @@ # limitations under the License. import itertools +import numpy as np import re -from .linalg import matmul, transpose +from .linalg import dot, matmul, transpose from .manipulation import squeeze, unsqueeze, reshape from .math import multiply from .math import sum as paddle_sum @@ -111,36 +112,6 @@ def validate_rhs(rhs, input_labels, n_bcast_dims): f"Invalid equation: duplicate output labels are found.") -# ''' -# Tests if the two operands can perform a broadcast operation on the given ranges of dimensions. -# We follow the Numpy broadcasting convention which states that, by lining up the shape arrays -# starting from the right most dimension, all the aligned dimensions either have equal sizes or -# one of them is sized one. -# Parameters -# ---------- -# args: -# *args unpacks into operand one's axes range, shape, operand two's axes range, shape -# f: -# if available, is used as a callback for postprocessing the aligned operand dimensions. 
-# ''' -# xran, xshape, yran, yshape = args -# -# xran_inv, yran_inv = xran[::-1], yran[::-1] -# -# for xi, yi in zip(xran_inv, yran_inv): -# xs, ys = xshape[xi], yshape[yi] -# cond = xs == ys or xs == 1 or ys == 1 -# if not cond: -# return False -# -# if not f: -# return True -# -# # Apply the callback to each aligned dimension pair -# for xi, yi in zip(xran_inv, yran_inv): -# f(xi, yi) - - def build_view(in_labels, out_labels): ''' Build an inverse map of dimension indices. Three conditions must hold for @@ -291,39 +262,12 @@ def build_global_shape(g_view, g_labels, op_shapes): g_shape = [sizes.pop() if len(sizes) > 0 else 1 for sizes in g_shape] - g_masks = [[s > 1 for s in view_shape] for view_shape in view_shapes] + g_masks = [[s > 1 or s == -1 for s in view_shape] + for view_shape in view_shapes] return g_shape, g_masks -def dim_strides(shape): - ''' - Returns the dimension strides for a tensor shape - ''' - strides = [] - stride = 1 - for size in shape[::-1]: - strides.append(stride) - stride = stride * size - return strides - - -def create_view(operand, *view_def): - ''' - Create and materialize a view. - - Parameters - ---------- - operand: - the base tensor operand - view_def: - include two lists which define the view's dimension sizes and strides - ''' - assert False, f'Diagonal and trace not implemented yet.' - view_shape, view_strides = view_def - return operand.create_view(view_shape, view_strides) - - def has_duplicated_labels(labels): ''' Returns True if there is any duplicate label. @@ -337,46 +281,17 @@ def diagonalize(labels, operand): Merges dimensions with duplicate labels. For those dimensions with duplicate labels, merge them into one dimension - which represents the diagonal elements. That requires the duplicate labeled - dimensions equal sized. The order of dimensions is kept unchanged up to - the left-most appearance of each label. + which represents the diagonal elements. This requires the dimensions with + duplicate labels are equal sized. Examples -------- 'ijj...i' would be merged into 'ij...' ''' - if not has_duplicated_labels(labels): - return labels, operand - - strides = dim_strides(operand.shape) - shape = operand.shape - new_labels = [] - new_shape = [] - new_strides = [] - - for ax, l in enumerate(labels): - if l == '.' 
or l not in new_labels: - # not duplicate - new_labels.append(l) - new_strides.append(strides[ax]) - new_shape.append(shape[ax]) - else: - # duplicate label - diag_ax = new_labels.index(l) - new_strides[diag_ax] += strides[ax] + assert not has_duplicated_labels(labels), ( + f'Duplicate labels are not supported.') - # Call framework API to build a new tensor - new_op = create_view(operand, new_shape, new_strides) - return new_labels, new_op - - -def prod(iter, default=1): - if len(iter): - res = 1 - for s in iter: - res *= s - return res - return default + return labels, operand def plan_reduce(plan, op, reduce_dims, keepdim): @@ -408,102 +323,108 @@ def plan_matmul(plan, g_view, op1, op2, g_supports, g_shape, I, J1, J2, K): op1_view, op2_view = [g_view[op] for op in (op1, op2)] - # Note, I may index into -1 - I1_dims = [op1_view[ax] for ax in I if op1_view[ax] >= 0] - I2_dims = [op2_view[ax] for ax in I if op2_view[ax] >= 0] - J1_dims = [op1_view[ax] for ax in J1] - J2_dims = [op2_view[ax] for ax in J2] - K1_dims = [op1_view[ax] for ax in K] - K2_dims = [op2_view[ax] for ax in K] + I1 = [idx for idx in I if op1_view[idx] >= 0] + I2 = [idx for idx in I if op2_view[idx] >= 0] + op1_view = np.array(op1_view) + op1_dims = op1_view[I1 + J1 + K] - op1_mask, op2_mask = [g_supports[op] for op in (op1, op2)] - op1_vshape = [s if m else 1 for s, m in zip(g_shape, op1_mask)] - op2_vshape = [s if m else 1 for s, m in zip(g_shape, op2_mask)] - - I1_shape, J1_shape, K1_shape = [[op1_vshape[ax] for ax in axes] - for axes in (I, J1, K)] - I2_shape, J2_shape, K2_shape = [[op2_vshape[ax] for ax in axes] - for axes in (I, J2, K)] + op2_view = np.array(op2_view) + op2_dims = op2_view[I2 + J2 + K] - K1_size, J1_size, J2_size = prod(K1_shape), prod(J1_shape), prod(J2_shape) + op1_mask, op2_mask = [g_supports[op] for op in (op1, op2)] + op1_vshape = np.array([s if m else 1 for s, m in zip(g_shape, op1_mask)]) + op2_vshape = np.array([s if m else 1 for s, m in zip(g_shape, op2_mask)]) + vshape = np.maximum(op1_vshape, op2_vshape) - perm1 = I1_dims + J1_dims + K1_dims - perm2 = I2_dims + J2_dims + K2_dims + i1, i2, j1, j2, k = map(len, (I1, I2, J1, J2, K)) - if any(i != dim for i, dim in enumerate(perm1)): + if any(op1_dims != np.arange(len(op1_dims))): # print(f'perm1: {perm1}') - step = transpose, [var1], var1, perm1 + step = transpose, [var1], var1, list(op1_dims) plan.add_step(step) - if any(i != dim for i, dim in enumerate(perm2)): + if any(op2_dims != np.arange(len(op2_dims))): # print(f'perm2: {perm2}') - step = transpose, [var2], var2, perm2 + step = transpose, [var2], var2, list(op2_dims) plan.add_step(step) - # In case of no K... dimensions, do a broadcast - if not K: - # unsqueeze operands include J1...J2... dimensions - if J2: - fill_start = len(I2_dims) + len(J1) - fill_end = fill_start + len(J2) - fill = list(range(fill_start, fill_end)) - step = unsqueeze, [var1], var1, fill - plan.add_step(step) - if J1: - fill_start = len(I2_dims) - fill_end = fill_start + len(J1) - fill = list(range(fill_start, fill_end)) - step = unsqueeze, [var2], var2, fill - plan.add_step(step) - # make broadcast - step = multiply, [var1, var2], var2 - plan.add_step(step) - # K... are there, let's reason about I... and J... - # In case I... and J... 
are empty, do the vector-vector version of matmul - elif not I and not J1 and not J2: - # merge K dimensions - if len(K) > 1: - for var in var1, var2: - step = reshape, [var], var, [K1_size] - plan.add_step(step) - # Build vector-vector matmul - step = matmul, [var1, var2], var2 - plan.add_step(step) - # General case, there are K... and some I... and J..., the actual operation will be - # matrix-vector or matrix-matrix multiplies, depending on the operands' shapes. - else: - # Merge J dims and K dims by reshaping - merged_shape1 = I1_shape + [J1_size] + [K1_size] - merged_shape2 = I2_shape + [J2_size] + [K1_size] + # Check if conditions hold for turnning the operation into a matmul + if j1 + j2 > 0 and k > 0 and -1 not in np.concatenate( + (op1_vshape, op2_vshape)): + op1_shape = list(op1_vshape[I]) + [np.prod(op1_vshape[J1]) + ] + [np.prod(op1_vshape[K])] + op2_shape = list(op2_vshape[I]) + [np.prod(op2_vshape[J2]) + ] + [np.prod(op2_vshape[K])] - step = reshape, [var1], var1, merged_shape1 + # Merge J dims and K dims by reshaping + step = reshape, [var1], var1, op1_shape plan.add_step(step) - step = reshape, [var2], var2, merged_shape2 + step = reshape, [var2], var2, op2_shape plan.add_step(step) # Matmul step = matmul, [var1, var2], var2, False, True plan.add_step(step) - # The result shape is in I..., J1, J2. Let's reshape back to known dimensions - # Note, this is static deduction, not by reading the tensor shape at runtime - result_shape = [1] * len(I) - for i, ax in enumerate(I): - result_shape[i] = max(op1_vshape[ax], op2_vshape[ax]) - if J1: - result_shape += J1_shape - if J2: - result_shape += J2_shape - - # Need a scalar dimension somehow - if result_shape: - step = reshape, [var2], var2, result_shape + # Reshape back + shape = list(vshape[I + J1 + J2]) + step = reshape, [var2], var2, shape plan.add_step(step) + elif j1 == j2 == k == 1: + # Can still do matmul even unknown shapes are present + step = matmul, [var1, var2], var2, False, True + plan.add_step(step) + + # In the rest cases we opt for ops other than matmul + else: + # unsqueeze operands include J1...J2... 
dimensions + if j2: + fill = list(range(i1 + j1, i1 + j1 + j2)) + step = unsqueeze, [var1], var1, fill + plan.add_step(step) + if j1: + fill = list(range(i2, i2 + j1)) + step = unsqueeze, [var2], var2, fill + plan.add_step(step) + # In case of no dimensions to contract, do an elementwise multiply + if k == 0: + # make broadcast + step = multiply, [var1, var2], var2 + plan.add_step(step) + # Contract and no join, turn into a dot + elif j1 + j2 == 0 and k == 1: + step = unsqueeze, [var1], var1, [-2] + plan.add_step(step) + step = unsqueeze, [var2], var2, [-1] + plan.add_step(step) + step = matmul, [var1, var2], var2 + plan.add_step(step) + step = squeeze, [var2], var2, [-1, -2] + plan.add_step(step) + elif j1 + j2 == 0 and not-1 in np.concatenate( + (op1_vshape[K], op2_vshape[K])): + assert all(op1_vshape[K] == op2_vshape[K]) + step = reshape, [var1], var1, list(op1_vshape[ + I]) + [1] + [np.prod(op1_vshape[K])] + plan.add_step(step) + step = reshape, [var2], var2, list(op2_vshape[ + I]) + [1] + [np.prod(op2_vshape[K])] + plan.add_step(step) + step = matmul, [var1, var2], var2, False, True + plan.add_step(step) + step = squeeze, [var2], var2, [-1, -2] + plan.add_step(step) + else: + step = multiply, [var1, var2], var2 + plan.add_step(step) + reduce_dims = list(range(-k, 0)) + plan_reduce(plan, op2, reduce_dims, keepdim=False) + # Wrap up, updating auxiliary data # Updating g_mask for I and J axes - for i, ax in enumerate(I + J1 + J2): - op2_mask[ax] = (result_shape[i] > 1) + for ax in I + J1 + J2: + op2_mask[ax] = vshape[ax] > 1 or vshape[ax] == -1 for ax in K: op2_mask[ax] = False @@ -514,6 +435,8 @@ def plan_matmul(plan, g_view, op1, op2, g_supports, g_shape, I, J1, J2, K): for ax in I + J1 + J2: op2_view[ax], dim = dim, dim + 1 + g_view[op2] = list(op2_view) + def plan_summation(plan, g_view, op1, op2, g_supports, g_shape, g_count, n_bcast): @@ -737,7 +660,6 @@ def plan_einsum(operands, g_view, g_shape, g_supports, g_count, n_bcast): return plan -@dygraph_only def einsum(equation, *operands): r""" einsum(equation, *operands) diff --git a/python/paddle/tensor/to_string.py b/python/paddle/tensor/to_string.py index 85672ec7a36e698b199669c167488ced17d51837..f164bbc466f18da9b7145533c32369a85d6124df 100644 --- a/python/paddle/tensor/to_string.py +++ b/python/paddle/tensor/to_string.py @@ -263,14 +263,7 @@ def to_string(var, prefix='Tensor'): data=data) -def tensor_to_string(tensor, prefix='Tensor'): - indent = len(prefix) + 1 - - _template = "{prefix}(shape={shape}, dtype={dtype}, place={place}, stop_gradient={stop_gradient},\n{indent}{data})" - - if not tensor._is_initialized(): - return "Tensor(Not initialized)" - +def _format_dense_tensor(tensor, indent): np_tensor = tensor.numpy() if len(tensor.shape) == 0: @@ -288,6 +281,26 @@ def tensor_to_string(tensor, prefix='Tensor'): data = _format_tensor( np_tensor, sumary, indent=indent, max_width=max_width, signed=signed) + return data + + +def sparse_tensor_to_string(tensor, prefix='Tensor'): + indent = len(prefix) + 1 + _template = "{prefix}(shape={shape}, dtype={dtype}, place={place}, stop_gradient={stop_gradient}, \n{indent}{data})" + if tensor.is_sparse_coo(): + indices_tensor = tensor.non_zero_indices() + elements_tensor = tensor.non_zero_elements() + indices_data = _format_dense_tensor(indices_tensor, indent) + elements_data = _format_dense_tensor(elements_tensor, indent) + data = 'non_zero_indices=' + indices_data + ',\nnon_zero_elements=' + elements_data + else: + crows_tensor = tensor.non_zero_crows() + cols_tensor = 
tensor.non_zero_cols() + elements_tensor = tensor.non_zero_elements() + crows_data = _format_dense_tensor(crows_tensor, indent) + cols_data = _format_dense_tensor(cols_tensor, indent) + elements_data = _format_dense_tensor(elements_tensor, indent) + data = 'non_zero_crows=' + crows_data + ',\nnon_zero_cols=' + cols_data + ',\nnon_zero_elements=' + elements_data return _template.format( prefix=prefix, @@ -297,3 +310,25 @@ stop_gradient=tensor.stop_gradient, indent=' ' * indent, data=data) + + +def tensor_to_string(tensor, prefix='Tensor'): + indent = len(prefix) + 1 + + _template = "{prefix}(shape={shape}, dtype={dtype}, place={place}, stop_gradient={stop_gradient},\n{indent}{data})" + + if not tensor._is_initialized(): + return "Tensor(Not initialized)" + + if tensor.is_sparse(): + return sparse_tensor_to_string(tensor, prefix) + else: + data = _format_dense_tensor(tensor, indent) + return _template.format( + prefix=prefix, + shape=tensor.shape, + dtype=tensor.dtype, + place=tensor._place_str, + stop_gradient=tensor.stop_gradient, + indent=' ' * indent, + data=data) diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 6c27d465cb12e3c89391608f5ea9871a5a42ddef..70dea65b7699b413f0dc5fc8d68599229beb3078 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -141,6 +141,14 @@ output : Tensor invoke : full_like(x, 1, dtype, place) +- api : pool2d + args : (Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) + output : Tensor(out) + infer_meta : + func : PoolInferMeta + kernel: + func : pool2d + - api : reshape args : (Tensor x, ScalarArray shape) output : Tensor(out) @@ -150,6 +158,15 @@ func : reshape inplace : (x -> out) +- api : relu + args : (Tensor x) + output : Tensor + infer_meta : + func : UnchangedInferMeta + kernel : + func : relu + inplace : (x -> out) + - api : scale args : (Tensor x, Scalar scale, float bias, bool bias_after_scale) output : Tensor @@ -158,6 +175,7 @@ param : [x] kernel : func : scale, scale_sr + inplace : (x -> out) - api : sign args : (Tensor x) @@ -167,6 +185,14 @@ kernel : func : sign +- api : softmax + args : (Tensor x, int axis) + output : Tensor + infer_meta : + func : SoftmaxInferMeta + kernel : + func : softmax + - api : split args : (Tensor x, ScalarArray num_or_sections, Scalar axis) output : Tensor[] @@ -194,6 +220,15 @@ output : Tensor invoke : full_like(x, 0, dtype, place) + +- api : one_hot + args : (Tensor x, Scalar num_classes) + output : Tensor + infer_meta : + func : OneHotInferMeta + kernel : + func : one_hot + - api : digamma args : (Tensor x) output : Tensor diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index fe68548a22a6d90bececdd00ac75d760969cee92..bf3d7b3d19eab806706f1d2d654957aac5b33434 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -696,8 +696,9 @@ PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name() + '_'}({self code_indent) outputs_args, kernel_output_names, output_create = self.gene_output( self.outputs['types'], 'SetKernelOutput', code_indent, inplace_flag) + api_func_name = self.get_api_func_name() + ('_' if inplace_flag else '') return f""" -{code_indent} auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( +{code_indent} const auto&
kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( {code_indent} "{self.kernel['func'][0]}", {{kernel_backend, kernel_layout, kernel_data_type}}); {code_indent} VLOG(6) << "{self.api} API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]"; {code_indent} VLOG(6) << "{self.api} API kernel: " << kernel; @@ -709,7 +710,10 @@ PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name() + '_'}({self {code_indent} using kernel_signature = {kernel_signature}; {code_indent} auto* kernel_fn = kernel.GetVariadicKernelFn(); -{code_indent} (*kernel_fn)({kernel_args}, {outputs_args}); +{code_indent} {{ +{code_indent} paddle::platform::RecordEvent kernel_record_event(\"{api_func_name} compute\", paddle::platform::TracerEventType::Operator, 1); +{code_indent} (*kernel_fn)({kernel_args}, {outputs_args}); +{code_indent} }} {code_indent} return {self.gene_return_code()};""" @@ -719,6 +723,7 @@ PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name() + '_'}({self outputs_args, kernel_output_names, output_create = self.gene_output( self.outputs['types'], 'SetSelectedRowsKernelOutput', code_indent, inplace_flag) + api_func_name = self.get_api_func_name() + ('_' if inplace_flag else '') return f""" {code_indent} auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( {code_indent} "{self.kernel['func'][1]}", {{kernel_backend, kernel_layout, kernel_data_type}}); @@ -732,7 +737,10 @@ PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name() + '_'}({self {code_indent} using kernel_signature = {kernel_signature}; {code_indent} auto* kernel_fn = kernel.GetVariadicKernelFn(); -{code_indent} (*kernel_fn)({kernel_args}, {outputs_args}); +{code_indent} {{ +{code_indent} paddle::platform::RecordEvent kernel_record_event(\"{api_func_name} compute\", paddle::platform::TracerEventType::Operator, 1); +{code_indent} (*kernel_fn)({kernel_args}, {outputs_args}); +{code_indent} }} {code_indent} return {self.gene_return_code()};""" diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py index a404fc01784154900a7c6ac1df501424dcdb307e..98a3606952bbb13d3b20c55427b9747f1a4a5624 100644 --- a/python/paddle/utils/code_gen/api_gen.py +++ b/python/paddle/utils/code_gen/api_gen.py @@ -147,6 +147,9 @@ def source_include(header_file_path): #include "paddle/phi/infermeta/multiary.h" #include "paddle/phi/infermeta/nullary.h" #include "paddle/phi/infermeta/unary.h" +#include "paddle/phi/kernels/declarations.h" + +#include "paddle/fluid/platform/profiler/event_tracing.h" """ diff --git a/python/paddle/utils/code_gen/backward_api_gen.py b/python/paddle/utils/code_gen/backward_api_gen.py index 7417d6bb030da095daf29db080b524db034cdcc6..5506f71f4b671da282fb933436b3c17d4a47a8fb 100644 --- a/python/paddle/utils/code_gen/backward_api_gen.py +++ b/python/paddle/utils/code_gen/backward_api_gen.py @@ -154,6 +154,8 @@ def source_include(header_file_path): #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/api/include/api.h" #include "paddle/phi/infermeta/backward.h" + +#include "paddle/fluid/platform/profiler/event_tracing.h" """ diff --git a/python/paddle/utils/code_gen/sparse_api.yaml b/python/paddle/utils/code_gen/sparse_api.yaml index 2f233a2df357df478c96bed2c40e28e8e972f660..2d1fe78b55981c4d0b39848bb81ae4ea8fcc690b 100644 --- a/python/paddle/utils/code_gen/sparse_api.yaml +++ b/python/paddle/utils/code_gen/sparse_api.yaml @@ -1,21 +1,22 @@ - api : conv3d - args : (Tensor x, Tensor kernel, int[] 
paddings, int[] dilations, int[] strides, int groups) + args : (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm) output : Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor) kernel : func : sparse_conv3d layout : x + backward : conv3d_grad - api : to_dense - args : (Tensor x, Backend backend) + args : (Tensor x) output : Tensor(out@DenseTensor) - invoke : to_dense_impl(x, backend) + invoke : to_dense_impl(x) - api : to_sparse_coo - args : (Tensor x, Backend backend, int64 sparse_dim) + args : (Tensor x, int64 sparse_dim) output : Tensor(out@SparseCooTensor) - invoke : to_sparse_coo_impl(x, backend, sparse_dim) + invoke : to_sparse_coo_impl(x, sparse_dim) - api : to_sparse_csr - args : (Tensor x, Backend backend) + args : (Tensor x) output : Tensor(out@SparseCsrTensor) - invoke : to_sparse_csr_impl(x, backend) + invoke : to_sparse_csr_impl(x) diff --git a/python/paddle/utils/code_gen/sparse_api_gen.py b/python/paddle/utils/code_gen/sparse_api_gen.py index dd22e16dc64f0a12c3575b6f2a0d2c21cd97955b..b4fc7638622b9ec7cfbac12b3d7c831fb6b25ec7 100644 --- a/python/paddle/utils/code_gen/sparse_api_gen.py +++ b/python/paddle/utils/code_gen/sparse_api_gen.py @@ -192,9 +192,7 @@ def source_include(header_file_path): def api_register(): - return """ -PD_REGISTER_API(Test); -""" + return "" def api_namespace(): diff --git a/python/paddle/utils/code_gen/sparse_bw_api.yaml b/python/paddle/utils/code_gen/sparse_bw_api.yaml index 8c9f02ebb3198670fffa4ddd80d14798b6fe78a9..6532f103cbf86288ffc739656440dc378d48eb2d 100644 --- a/python/paddle/utils/code_gen/sparse_bw_api.yaml +++ b/python/paddle/utils/code_gen/sparse_bw_api.yaml @@ -1,6 +1,6 @@ - backward_api : conv3d_grad - forward : conv3d (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups) -> Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor) - args : (Tensor x, Tensor kernel, Tensor rulebook, Tensor out_grad, int[] paddings, int[] dilations, int[] strides, int groups) + forward : conv3d (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm) -> Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor) + args : (Tensor x, Tensor kernel, Tensor rulebook, Tensor out_grad, int[] paddings, int[] dilations, int[] strides, int groups, bool subm) output : Tensor(x_grad@DenseTensor), Tensor(kernel_grad@DenseTensor) kernel : - func : sparse_conv_grad + func : sparse_conv3d_grad diff --git a/python/paddle/utils/code_gen/sparse_bw_api_gen.py b/python/paddle/utils/code_gen/sparse_bw_api_gen.py index 561e198a41b99c2362f5e14ac7fcd6da051c8875..5dac7c8c48367b0cb3a880ab8614de5cbec58257 100644 --- a/python/paddle/utils/code_gen/sparse_bw_api_gen.py +++ b/python/paddle/utils/code_gen/sparse_bw_api_gen.py @@ -115,9 +115,7 @@ def source_include(header_file_path): def api_register(): - return """ -PD_REGISTER_API(Test); -""" + return "" def api_namespace(): diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py index 853a98a62b504d94617127bd35212d2412719e1c..b0a5d37a535df7e83c08f18e624402294bf29539 100644 --- a/python/paddle/utils/cpp_extension/extension_utils.py +++ b/python/paddle/utils/cpp_extension/extension_utils.py @@ -146,6 +146,9 @@ def custom_write_stub(resource, pyfile): import types import paddle + cur_dir = os.path.dirname(os.path.abspath(__file__)) + so_path = os.path.join(cur_dir, "{resource}") + def inject_ext_module(module_name, api_names): if module_name in 
sys.modules: return sys.modules[module_name] @@ -157,9 +160,6 @@ def custom_write_stub(resource, pyfile): return new_module def __bootstrap__(): - cur_dir = os.path.dirname(os.path.abspath(__file__)) - so_path = os.path.join(cur_dir, "{resource}") - assert os.path.exists(so_path) # load custom op shared library with abs path @@ -169,6 +169,7 @@ def custom_write_stub(resource, pyfile): __bootstrap__() {custom_api} + """).lstrip() # Parse registerring op information @@ -900,7 +901,7 @@ def _generate_python_module(module_name, # delete the temp file before exit python process atexit.register(lambda: remove_if_exit(api_file)) - # write into .py file with RWLock + # write into .py file with RWLockc api_content = [_custom_api_content(op_name) for op_name in op_names] with open(api_file, 'w') as f: f.write('\n\n'.join(api_content)) @@ -911,13 +912,15 @@ def _generate_python_module(module_name, def _custom_api_content(op_name): - params_str, ins_str, attrs_str, outs_str = _get_api_inputs_str(op_name) - + params_str, ins_str, attrs_str, outs_str, in_names, attrs_names = _get_api_inputs_str( + op_name) + lower_in_names = [p.split("@")[0].lower() for p in in_names] API_TEMPLATE = textwrap.dedent(""" - from paddle.fluid.core import VarBase - from paddle.fluid.framework import in_dygraph_mode, _dygraph_tracer + import paddle.fluid.core as core + from paddle.fluid.core import VarBase, CustomOpKernelContext + from paddle.fluid.framework import in_dygraph_mode, _dygraph_tracer, _in_eager_mode from paddle.fluid.layer_helper import LayerHelper - + def {op_name}({inputs}): # prepare inputs and outputs ins = {ins} @@ -928,9 +931,20 @@ def _custom_api_content(op_name): # The output variable's dtype use default value 'float32', # and the actual dtype of output variable will be inferred in runtime. 
if in_dygraph_mode(): - for out_name in out_names: - outs[out_name] = VarBase() - _dygraph_tracer().trace_op(type="{op_name}", inputs=ins, outputs=outs, attrs=attrs) + if _in_eager_mode(): + ctx = CustomOpKernelContext() + for i in {in_names}: + ctx.add_inputs(i) + for j in {attr_names}: + ctx.add_attr(j) + for out_name in out_names: + outs[out_name] = core.eager.Tensor() + ctx.add_outputs(outs[out_name]) + core.eager._run_custom_op(ctx, "{op_name}", True) + else: + for out_name in out_names: + outs[out_name] = VarBase() + _dygraph_tracer().trace_op(type="{op_name}", inputs=ins, outputs=outs, attrs=attrs) else: helper = LayerHelper("{op_name}", **locals()) for out_name in out_names: @@ -949,6 +963,9 @@ def _custom_api_content(op_name): inputs=params_str, ins=ins_str, attrs=attrs_str, + # "[x, y, z]"" + in_names="[" + ",".join(lower_in_names) + "]", + attr_names="[" + ",".join(attrs_names) + "]", out_names=outs_str) return api_content @@ -996,7 +1013,7 @@ def _get_api_inputs_str(op_name): ]) # e.g: ['Out', 'Index'] outs_str = "[%s]" % ','.join(["'{}'".format(name) for name in out_names]) - return params_str, ins_str, attrs_str, outs_str + return params_str, ins_str, attrs_str, outs_str, in_names, attr_names def _write_setup_file(name, diff --git a/python/setup.py.in b/python/setup.py.in index 689f63c0f00e95e3eb861ca1c497685babd01638..44998bd3e1675f2a3f77edd26c9cd8fa85121b6a 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -300,6 +300,7 @@ packages=['paddle', 'paddle.distributed.fleet.meta_parallel.parallel_layers', 'paddle.distributed.auto_parallel', 'paddle.distributed.auto_parallel.operators', + 'paddle.distributed.auto_parallel.tuner', 'paddle.distributed.passes', 'paddle.framework', 'paddle.jit', diff --git a/tools/check_added_ut.sh b/tools/check_added_ut.sh index 2a9fb842862c2e733376d7eee985b428e497b9f9..5466a1cdd597b0f466d3a0a25def932d6a6be098 100644 --- a/tools/check_added_ut.sh +++ b/tools/check_added_ut.sh @@ -52,9 +52,10 @@ if [[ "$SYSTEM" == "Linux" ]] || [[ "$SYSTEM" == "Darwin" ]];then elif [[ "$SYSTEM" == "Windows_NT" ]];then bash $PADDLE_ROOT/win_cmake.sh >prec_build.log 2>&1 fi -ctest -N | awk -F ':' '{print $2}' | sed '/^$/d' | sed '$d' | sed 's/ //g' | grep 'test' > $PADDLE_ROOT/br-ut +# remove line ended with .exe to get correct deleted_ut list +ctest -N | awk -F ':' '{print $2}' | sed '/^$/d' | sed '$d' | sed 's/ //g' | sed '/\.exe$/d' | grep 'test' > $PADDLE_ROOT/br-ut cd $PADDLE_ROOT/build -ctest -N | awk -F ':' '{print $2}' | sed '/^$/d' | sed '$d' | sed 's/ //g' | grep 'test' > $PADDLE_ROOT/pr-ut +ctest -N | awk -F ':' '{print $2}' | sed '/^$/d' | sed '$d' | sed 's/ //g' | sed '/\.exe$/d' | grep 'test' > $PADDLE_ROOT/pr-ut cd $PADDLE_ROOT grep -F -x -v -f br-ut pr-ut > $PADDLE_ROOT/added_ut if [[ "$SYSTEM" == 'Linux' ]];then @@ -66,6 +67,8 @@ rm -rf prec_build if [[ "$SYSTEM" == "Linux" ]] || [[ "$SYSTEM" == "Darwin" ]];then rm $PADDLE_ROOT/br-ut $PADDLE_ROOT/pr-ut $PADDLE_ROOT/paddle/scripts/paddle_build_pre.sh elif [[ "$SYSTEM" == "Windows_NT" ]];then + # get the deleted ut list in windows, will be used in check_change_of_unittest.sh + grep -F -x -v -f pr-ut br-ut > $PADDLE_ROOT/deleted_ut rm $PADDLE_ROOT/br-ut $PADDLE_ROOT/pr-ut $PADDLE_ROOT/win_cmake.sh fi git checkout -f $CURBRANCH diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index 55d2d59c7ece6a4639b1227f600a7d208a69f2e7..9c802a56a7b6e29bc89ad164a15f2f6d4749734e 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -198,7 
+198,9 @@ if [ ${HAS_BOOST_GET} ] && [ "${GIT_PR_ID}" != "" ]; then check_approval 1 6836917 47554610 22561442 fi -HAS_LOG_FATAL=`git diff -U0 upstream/$BRANCH $FILTER |grep "^+" |grep -o -m 1 "LOG(FATAL)" || true` +# infrt needs to use LOG(FATAL) temporarily during the debugging period; it will be replaced with the standard error format in the future. +NO_INFRT_FILES=`git diff --name-only upstream/develop | grep -v "tools/\|paddle/infrt/" || true` +HAS_LOG_FATAL=`git diff -U0 upstream/$BRANCH $NO_INFRT_FILES |grep "^+" |grep -o -m 1 "LOG(FATAL)" || true` if [ ${HAS_LOG_FATAL} ] && [ "${GIT_PR_ID}" != "" ]; then echo_line="LOG(FATAL) is not recommended, because it will throw exception without standard stack information, so please use PADDLE_THROW macro here. If you have to use LOG(FATAL) here, please request chenwhql (Recommend), luotao1 or lanxianghit review and approve.\n" check_approval 1 6836917 47554610 22561442 diff --git a/tools/infrt/custom_pdop.td b/tools/infrt/custom_pdop.td index f754767259563f2cd64bac92adf76249b18af11f..ae0316036f1854e281e07de59fb5aa53201bd35e 100644 --- a/tools/infrt/custom_pdop.td +++ b/tools/infrt/custom_pdop.td @@ -23,16 +23,6 @@ def PD_FetchOp : PD_Op<"fetch", [Terminator]> { let arguments = (ins PD_Tensor :$inputs, StrAttr:$name); } -def PD_ReturnOp : PD_Op<"return", [Terminator]> { - let summary = "return Op"; - - let description = [{ - Fetch tensor from the graph. - }]; - - let arguments = (ins Variadic:$inputs); -} - def PD_GraphOp : PD_Op<"graph", [SingleBlockImplicitTerminator<"::infrt::ReturnOp">]> { let summary = "paddle graph Op"; let description = [{ @@ -52,6 +42,6 @@ def PD_ConstantOp : PD_Op<"constant", [NoSideEffect, ConstantLike, DeclareOpInte let hasFolder = 1; let builders = [ - OpBuilder<(ins "Attribute":$value)>, + OpBuilder<(ins "mlir::Attribute":$value)>, ]; } diff --git a/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py b/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py index 027dfe4328a55ff246928cbc9ab6d3d36f15e1fd..b0e420da64aa280b71859b27334a2abeaaacc53b 100644 --- a/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py +++ b/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py @@ -16,8 +16,6 @@ import paddle.fluid.framework as framework from paddle.fluid import core from paddle import compat as cpt -ops_having_canonicalization = {"elementwise_add", } - # collect original ops: op which has both inference and grid defination def get_original_ops(): @@ -186,16 +184,31 @@ def generate_all_ops_inputs_outputs_map(op_descs): cpp_style_ops_outputs_map_str = start_ + ops_outputs_str + "\n};" # 3.
Write to header file - dst_head_file = "../../paddle/infrt/dialect/pd_ops_info.h" + dst_head_file = "../../paddle/infrt/dialect/pd/common/pd_ops_info.h" with open(dst_head_file, 'w') as ops_inputs_outputs_head_file: ops_inputs_outputs_head_file.write(cpp_style_ops_inputs_map_str) ops_inputs_outputs_head_file.write("\n\n") ops_inputs_outputs_head_file.write(cpp_style_ops_outputs_map_str) +def get_constraint(op_type, op_proto): + # 2.3.1 inputs + constraint = "NoSideEffect" + + optional_input_num_ = 0 + for input_ in op_proto[INPUTS]: + if op_proto[INPUTS][input_][EXTRA] != True and op_proto[INPUTS][input_][ + INTERMEDIATE] != True and op_proto[INPUTS][input_][ + DISPENSABLE] == True: + optional_input_num_ += 1 + if optional_input_num_ > 1: + constraint += ", AttrSizedOperandSegments" + return constraint + + # funtion to generate paddle op dialect file def convert_op_proto_into_mlir(op_descs): - dst_dialect_file = "../../paddle/infrt/dialect/pd_ops.td" + dst_dialect_file = "../../paddle/infrt/dialect/pd/ir/pd_ops.td" custom_dialect_file = "custom_pdop.td" # 1. Head files @@ -214,7 +227,7 @@ def convert_op_proto_into_mlir(op_descs): "include \"mlir/Interfaces/InferTypeOpInterface.td\"", "include \"mlir/Interfaces/LoopLikeInterface.td\"", "include \"mlir/IR/OpBase.td\"", - "include \"paddle/infrt/dialect/pd_op_base.td\"", + "include \"paddle/infrt/dialect/pd/ir/pd_op_base.td\"", "", ] @@ -239,13 +252,14 @@ def convert_op_proto_into_mlir(op_descs): if (op_type in skipped_op_list) or (op_type not in original_ops_): continue automatically_generated_op_dialect.append(op_type) + constraint_ = get_constraint(op_type, op_proto) # 2.1 OpDef - HEAD = 'def PD_{op_type_capitalize}Op : PD_Op<"{op_type}", [NoSideEffect]> {left_brace}\n'.format( + HEAD = 'def PD_{op_type_capitalize}Op : PD_Op<"{op_type}", [{constraint}]> {left_brace}\n'.format( op_type_capitalize=op_type.capitalize(), + constraint=constraint_, op_type=op_type, left_brace="{") SUMMARY = ' let summary = "{} op";\n'.format(op_type) - CANONICALIZATION = "let hasCanonicalizer = 1;" if op_type in ops_having_canonicalization else "" # 2.2 Description contents = "" @@ -259,14 +273,22 @@ def convert_op_proto_into_mlir(op_descs): ARGUMENTS = "" if (len(op_proto[INPUTS]) > 0 or len(op_proto[ATTRS]) > 0): ARGUMENTS = " let arguments = (ins " + # 2.3.1 inputs for input_ in op_proto[INPUTS]: if op_proto[INPUTS][input_][EXTRA] != True and op_proto[INPUTS][ input_][INTERMEDIATE] != True: - if op_proto[INPUTS][input_][DUPLICABLE] != "true": - ARGUMENTS = ARGUMENTS + " PD_Tensor:$" + input_ + "," + if op_proto[INPUTS][input_][DISPENSABLE] != True: + if op_proto[INPUTS][input_][DUPLICABLE] != True: + ARGUMENTS = ARGUMENTS + " PD_Tensor:$" + input_ + "," + else: + ARGUMENTS = ARGUMENTS + " PD_Tensor_Array:$" + input_ + "," else: - ARGUMENTS = ARGUMENTS + " PD_Tensor_Array:$" + input_ + "," + if op_proto[INPUTS][input_][DUPLICABLE] != True: + ARGUMENTS = ARGUMENTS + " Optional:$" + input_ + "," + else: + ARGUMENTS = ARGUMENTS + " Optional:$" + input_ + "," + # unsupported: BLOCK = 8; BLOCKS = 10; attr_mlir_converter = { 0: 'SI32Attr', @@ -335,7 +357,7 @@ def convert_op_proto_into_mlir(op_descs): for output_ in op_proto[OUTPUTS]: if op_proto[OUTPUTS][output_][EXTRA] != True and op_proto[ OUTPUTS][output_][INTERMEDIATE] != True: - if op_proto[OUTPUTS][output_][DUPLICABLE] != "true": + if op_proto[OUTPUTS][output_][DUPLICABLE] != True: outputs = outputs + "PD_Tensor:${},".format(output_) else: outputs = outputs + "PD_Tensor_Array:${},".format( @@ -348,7 +370,6 @@ 
def convert_op_proto_into_mlir(op_descs): ops_mlir_file.write(DESCRIPTION) ops_mlir_file.write(ARGUMENTS) ops_mlir_file.write(RESULTS) - ops_mlir_file.write(CANONICALIZATION) ops_mlir_file.write("}\n") print("Skipped ops num: " + str(len(skipped_op_list))) diff --git a/tools/infrt/generate_phi_kernel_dialect.py b/tools/infrt/generate_phi_kernel_dialect.py index 36561d4e71da8b669f1e06b0240a4d4b3b2ca92e..f632c9a9dba504d209946e494e55eb970e727629 100644 --- a/tools/infrt/generate_phi_kernel_dialect.py +++ b/tools/infrt/generate_phi_kernel_dialect.py @@ -43,7 +43,8 @@ precision_type_converter = { "float64": "FLOAT64", "complex64": "COMPLEX64", "complex128": "COMPLEX128", - "bool": "BOOL" + "bool": "BOOL", + "Undefined": "UNK" } kernel_types_info_file = "./kernels.json" diff --git a/tools/infrt/get_compat_kernel_signature.py b/tools/infrt/get_compat_kernel_signature.py index 78d59c2aef10be6db99c7947e8dc238e5463fb47..0680e87b38b3f6c29e7f813474d947598912437d 100644 --- a/tools/infrt/get_compat_kernel_signature.py +++ b/tools/infrt/get_compat_kernel_signature.py @@ -16,6 +16,8 @@ import os import re import json +skip_list = [] + def parse_compat_registry(kernel_info): name, inputs_str, attrs_str, outputs_str = kernel_info.split(",{") @@ -42,6 +44,8 @@ def get_compat_kernels_info(): compat_files.remove(file_) for file_ in compat_files: + if file_ in skip_list: + continue with open("../../paddle/phi/ops/compat/" + file_) as in_file: txt = in_file.readlines() content = "" @@ -54,8 +58,9 @@ content += line if (registry and ";" in line): data = content.replace("\n", "").replace( - " ", "").strip("return").strip( - "KernelSignature(").strip("\);").replace("\"", "") + " ", + "").strip("return").strip("KernelSignature(").strip( + "\);").replace("\"", "").replace("\\", "") registry = False name, registry_info = parse_compat_registry(data) diff --git a/tools/infrt/get_phi_kernel_function.sh b/tools/infrt/get_phi_kernel_function.sh index 3b9f4b7273500f23d67a3062a2d4ee367c0b473b..6b2586d40819b9e25eef823dff59687114664197 100644 --- a/tools/infrt/get_phi_kernel_function.sh +++ b/tools/infrt/get_phi_kernel_function.sh @@ -41,7 +41,37 @@ python3 ${PADDLE_ROOT}/python/paddle/utils/code_gen/wrapped_infermeta_gen.py \ grep PD_REGISTER_INFER_META_FN ${temp_path}/generate.cc \ | awk -F "\(|,|::|\)" '{print $2, $4}' > ${temp_path}/wrap_info.txt -#step 3: merge all infos + +#step 3: get the attr names of each ir kernel.
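Each of the step-3 loops that follow writes one line per phi kernel into the temporary attr file, pairing the ir kernel name with a '$'-separated list of attribute names. The snippet below only illustrates that format and how get_phi_kernel_info.py splits it; the sample line is copied from the get_attr_info docstring added further down, and nothing here is part of the patch.

    # Illustration only: one line of the step-3 attr file and how
    # get_attr_info() in get_phi_kernel_info.py is expected to split it.
    line = "phi_gpu.argsort.float64.any $axisBool$descending"

    kernel_key, attr_field = line.strip().split(' ')
    attr_names = attr_field.split('$')[1:]  # the leading '$' yields an empty first field

    assert kernel_key == "phi_gpu.argsort.float64.any"
    assert attr_names == ['axisBool', 'descending']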
+ir_attr_name_info_file=`mktemp` +# phi_cpu attr +all_ir_name=`grep -Eo "PDTCPU_Kernel<.*\"" paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td | awk -v FS="<" '{gsub(/\"/,"");print $2}'` +for ir in $all_ir_name +do + attr_name=`grep "<\"$ir" -A 3 paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td | grep -Eo "Attr:.*)" \ + | awk '{gsub(/F32Attr/,"");gsub(/F64Attr/,"");gsub(/StrAttr/,"");gsub(/BOOLAttr/,""); \ + gsub(/SI1Attr/,"");gsub(/SI8Attr/,"");gsub(/SI16Attr/,"");gsub(/SI32Attr/,"");gsub(/SI64Attr/,""); \ + gsub(/UI1Attr/,"");gsub(/UI8Attr/,"");gsub(/I16Attr/,"");gsub(/I32Attr/,"");gsub(/I64Attr/,""); \ + gsub(/I1Attr/,"");gsub(/I8Attr/,"");gsub(/UI16Attr/,"");gsub(/UI32Attr/,"");gsub(/UI64Attr/,""); \ + gsub(/Attr/,"");gsub(/\)/,""); \ + gsub(/[,:]/,"");print $a}'` + echo phi_cpu.$ir $attr_name >> $ir_attr_name_info_file +done +# phi_gpu attr +all_ir_name=`grep -Eo "PDTGPU_Kernel<.*\"" paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td | awk -v FS="<" '{gsub(/\"/,"");print $2}'` +for ir in $all_ir_name +do + attr_name=`grep "<\"$ir" -A 3 paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td | grep -Eo "Attr:.*)" \ + | awk '{gsub(/F32Attr/,"");gsub(/F64Attr/,"");gsub(/StrAttr/,"");gsub(/BOOLAttr/,""); \ + gsub(/SI1Attr/,"");gsub(/SI8Attr/,"");gsub(/SI16Attr/,"");gsub(/SI32Attr/,"");gsub(/SI64Attr/,""); \ + gsub(/UI1Attr/,"");gsub(/UI8Attr/,"");gsub(/I16Attr/,"");gsub(/I32Attr/,"");gsub(/I64Attr/,""); \ + gsub(/I1Attr/,"");gsub(/I8Attr/,"");gsub(/UI16Attr/,"");gsub(/UI32Attr/,"");gsub(/UI64Attr/,""); \ + gsub(/Attr/,"");gsub(/\)/,""); \ + gsub(/[,:]/,"");print $a}'` + echo phi_gpu.$ir $attr_name >> $ir_attr_name_info_file +done + +#step 4: merge all infos # @input1 => phi kernel infomation : kernel_name kernel_key(GPU/CPU, precision, layout) # @input2 => information from api.yaml : kernel_name kernel_function_name inferMeta_function_name # @input3 => information from wrapped_infermeta_gen : ensure the inferMeta function has @@ -50,4 +80,5 @@ python3 ${PADDLE_ROOT}/tools/infrt/get_phi_kernel_info.py \ --paddle_root_path ${PADDLE_ROOT} \ --kernel_info_file $kernel_register_info_file \ --infermeta_wrap_file ${temp_path}/wrap_info.txt \ + --attr_info_file $ir_attr_name_info_file \ --generate_file ${PADDLE_ROOT}/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.cc diff --git a/tools/infrt/get_phi_kernel_info.py b/tools/infrt/get_phi_kernel_info.py index 774f6cd6bf3648a0de7a34e01e893d212bce9770..85ad585cdefa9cbb4ac8d029e699af4d5ffaeaf7 100644 --- a/tools/infrt/get_phi_kernel_info.py +++ b/tools/infrt/get_phi_kernel_info.py @@ -37,6 +37,8 @@ def parse_args(): type=str, required=True, help="inferMeta wrap info file.") + parser.add_argument( + "--attr_info_file", type=str, required=True, help="attr info file.") parser.add_argument( "--generate_file", type=str, @@ -59,6 +61,23 @@ def get_kernel_info(file_path): return [l.strip() for l in cont] +def get_attr_info(file_path): + """ + phi_gpu.argsort.float64.any $axisBool$descending + """ + ret = {} + with open(file_path, 'r') as f: + cont = f.readlines() + for l in cont: + datas = l.strip().split(' ') + if len(datas) == 2: + attrs = datas[1].split('$') + ret[datas[0]] = attrs[1:] + else: + ret[datas[0]] = None + return ret + + def merge(infer_meta_data, kernel_data, wrap_data): meta_map = {} for api in infer_meta_data: @@ -114,14 +133,14 @@ namespace kernel { def gen_context(val): if val == "CPU": - return "phi::CPUContext" - # elif val == "GPU": - # return "phi::GPUContext" + return "phi::CPUContext", "phi_cpu" + elif val == "GPU": + return 
"phi::GPUContext", "phi_gpu" # elif val == "XPU": - # return "phi::XPUContext" + # return "phi::XPUContext", "phi_xpu" else: # raise Exception(f"Unknown context type {val}") - return "" + return "", "" def gen_layout(val): @@ -195,34 +214,53 @@ def gen_dtype(vals: List[str]): return ir_dtypes, origin_dtypes -# TODO(wilber): Now only process CPUContext. -def gen_register_info(resources: List[List[str]]): +# Note: Now only process CPUContext and GPUContext. + + +def gen_register_code_info(item: List[str], attr_data: Dict[str, List[str]]): """ - resources: [['add', 'CPU', 'ALL_LAYOUT', 'AddKernel', 'float', 'double', '...'(varaidic types), 'ElementwiseInferMeta'], ...] + item: ['add', 'CPU', 'ALL_LAYOUT', 'AddKernel', 'float', 'double', '...'(varaidic types), 'ElementwiseInferMeta'] + attr_data: {'phi_cpu.arg_min.float32.any': ['axisBool', 'keepdimsBool', 'flatten', 'dtype']} """ - res = "void RegisterInferShapeLaunchers(host_context::KernelRegistry* registry) {" - for item in resources: - # The output string is polluted by C++ macros, here the \ is removed - update_item = [v.strip('\\') for v in item] + ctx_name, ir_ctx_name = gen_context(item[1]) + if (ctx_name == ""): + return "" + item[2] = gen_layout(item[2]) + ir_dtypes, origin_dtypes = gen_dtype(item[4:-1]) + infer_shape_func = "&phi::" + item[-1] - ctx_name = gen_context(update_item[1]) - if (ctx_name == ""): - continue - update_item[2] = gen_layout(update_item[2]) - ir_dtypes, origin_dtypes = gen_dtype(update_item[4:-1]) - infer_shape_func = "&phi::" + update_item[-1] + res = "" - if update_item[-1] == "unknown": - # TODO(wilber): handle the unknown inferShape func. - continue + if item[-1] == "unknown": + # TODO(wilber): handle the unknown inferShape func. + return "" + + for ir_dtype, origin_dtype in zip(ir_dtypes, origin_dtypes): + kernel_func = gen_kernel_func(item[3], ctx_name, origin_dtype) + ir_name = ir_ctx_name + '.' + item[0].lower( + ) + '.' + ir_dtype + '.' + item[2].lower() + if ir_name in attr_data.keys() and attr_data[ir_name] is not None: + attr_names = ', '.join( + ["\"" + a + "\"" for a in attr_data[ir_name]]) + res += f""" +registry->AddKernelWithAttrs("{ir_name}",""" + + res += f""" + std::bind(&KernelLauncherFunc, + KernelLauncher(), + std::placeholders::_1), + {{{attr_names}}}); +""" - for ir_dtype, origin_dtype in zip(ir_dtypes, origin_dtypes): - kernel_func = gen_kernel_func(update_item[3], ctx_name, - origin_dtype) - ir_name = 'phi_cpu.' + update_item[0].lower( - ) + '.' + ir_dtype + '.' + update_item[2].lower() + else: res += f""" - registry->AddKernel("{ir_name}",""" +registry->AddKernel("{ir_name}",""" res += f""" std::bind(&KernelLauncherFunc