Commit c1c6b869 authored by phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_some_yaml_config

@@ -1256,7 +1256,7 @@ if __name__ == "__main__":
         # Node Definition Generation
         definition_declaration_pair = GenerateForwardDefinition(
             fwd_api_name, bwd_api_name, forward_inputs_position_map,
-            forward_outputs_position_map, forward_attrs_list,
+            forward_outputs_position_map, orig_forward_attrs_list,
             backward_fwd_input_map, backward_grad_input_map,
             backward_grad_output_map, backward_attrs_list, optional_inputs,
             intermediate_outputs)
@@ -1268,7 +1268,7 @@ if __name__ == "__main__":
         # For python-level API dispatch
         CollectCoreOpsInformation(fwd_api_name, forward_inputs_position_map,
                                   forward_outputs_position_map,
-                                  forward_attrs_list)
+                                  orig_forward_attrs_list)
     if len(namespace) > 0:
         forward_definition_str += f"""namespace {namespace} {{
......
@@ -34,6 +34,14 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place,
     return;
   }
 
+  // NOTE(hqp): Special case for CPU->MLU, avoid stream sync.
+  if (platform::is_cpu_place(in.place()) && platform::is_mlu_place(dst_place)) {
+    paddle::framework::TensorCopy(
+        in, dst_place, *platform::DeviceContextPool::Instance().Get(dst_place),
+        out);
+    return;
+  }
+
   // NOTE(yy): TransDataDevice should wait for computation of input.
   if (!platform::is_cuda_pinned_place(in.place())) {
     platform::DeviceContextPool::Instance().Get(in.place())->Wait();
......
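The new fast path above skips the source-side stream wait only for host-to-MLU copies. A minimal sketch of that dispatch shape, where Place, Tensor, AsyncCopy, and WaitOn are hypothetical stand-ins for paddle::platform places, TensorCopy, and the device-context wait:

#include <iostream>

// Hypothetical stand-ins for paddle::platform places and TensorCopy.
enum class Place { kCPU, kGPU, kMLU };

struct Tensor { Place place; };

void AsyncCopy(const Tensor& in, Place dst, Tensor* out) {
  out->place = dst;  // a real copy would enqueue on the dst device's stream
}

void WaitOn(Place) { std::cout << "sync stream on source device\n"; }

// Mirrors the control flow of the patched TransDataDevice: host-to-MLU
// copies skip the source-side stream wait, everything else syncs first.
void TransDataDevice(const Tensor& in, Place dst, Tensor* out) {
  if (in.place == Place::kCPU && dst == Place::kMLU) {
    AsyncCopy(in, dst, out);  // no wait needed on the CPU side
    return;
  }
  WaitOn(in.place);  // wait for the producer before copying
  AsyncCopy(in, dst, out);
}

int main() {
  Tensor src{Place::kCPU}, dst_t{Place::kCPU};
  TransDataDevice(src, Place::kMLU, &dst_t);  // fast path, prints nothing
  TransDataDevice(src, Place::kGPU, &dst_t);  // syncs first
}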
@@ -95,6 +95,7 @@ std::map<std::string, std::vector<ir::Node *>> Graph::InitFromBlock(
   std::unordered_map<std::string, std::pair<VarDesc *, int>>
       name_to_desc_block_id;
 
+  block_id_ = block.ID();
   const BlockDesc *block_var_visible = &block;
   while (block_var_visible != nullptr) {
     for (auto *var : block_var_visible->AllVars()) {
......
@@ -230,6 +230,7 @@ class Graph {
     auto *x =
         AddNode(new ir::Node(var_desc, block_id == -1 ? block_id_ : block_id));
     x->SetId(num_node_created_++);
+    x->SetGraphId(block_id_);
     return x;
   }
@@ -245,6 +246,7 @@ class Graph {
         "The OpDesc used to create operator node is null."));
     auto *x = AddNode(new ir::Node(op_desc));
     x->SetId(num_node_created_++);
+    x->SetGraphId(block_id_);
     return x;
   }
@@ -263,6 +265,7 @@ class Graph {
         num_node_created_);
     auto *x = AddNode(new ir::Node(name, ir::Node::Type::kVariable, block_id_));
     x->SetId(num_node_created_++);
+    x->SetGraphId(block_id_);
     return x;
   }
@@ -276,6 +279,7 @@ class Graph {
     }
     auto *x = AddNode(new ir::Node(name, type, block_id_));
     x->SetId(num_node_created_++);
+    x->SetGraphId(block_id_);
     return x;
   }
......
@@ -125,6 +125,7 @@ class Node {
   // Only use this for auto parallel.
   // A node does not have original desc if the return is zero.
   uint64_t OriginalDescId() const { return original_desc_id_; }
+  int GraphId() const { return graph_id_; }
 
   bool IsOp() const { return type_ == Type::kOperation; }
   bool IsVar() const { return type_ == Type::kVariable; }
@@ -246,10 +247,12 @@ class Node {
   // Store the original id of var desc or op desc.
   // Only use this for auto parallel.
   uint64_t original_desc_id_{0};
+  int graph_id_{-1};
 
  private:
   // ID can only set by a Graph.
   void SetId(int id) { id_ = id; }
+  void SetGraphId(int graph_id) { graph_id_ = graph_id; }
 
   // desc_order can only set by a Graph when constructing a Graph from a
   // BlockDesc.
......
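Taken together, the graph.h and node.h hunks give every node a graph id that only its owning Graph can write: Node::SetGraphId() is private and every Graph::Create* path stamps it. A self-contained sketch of that ownership pattern, with names simplified from the diff:

#include <cassert>
#include <memory>
#include <vector>

class Graph;  // forward declaration so Node can befriend it

class Node {
 public:
  int GraphId() const { return graph_id_; }

 private:
  // Only a Graph may stamp a node with the block it belongs to,
  // matching the SetGraphId() calls added in each creation path.
  void SetGraphId(int graph_id) { graph_id_ = graph_id; }
  int graph_id_{-1};
  friend class Graph;
};

class Graph {
 public:
  explicit Graph(int block_id) : block_id_(block_id) {}
  Node* CreateNode() {
    nodes_.push_back(std::make_unique<Node>());
    Node* n = nodes_.back().get();
    n->SetGraphId(block_id_);  // every creation path stamps the id
    return n;
  }

 private:
  int block_id_;
  std::vector<std::unique_ptr<Node>> nodes_;
};

int main() {
  Graph g(/*block_id=*/3);
  assert(g.CreateNode()->GraphId() == 3);
}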
@@ -1456,7 +1456,8 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
     kernel_iter = kernels.find(expected_kernel_key);
   }
 #endif
-#ifdef PADDLE_WITH_XPU
+#if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP)
   if (platform::is_xpu_place(expected_kernel_key.place_) &&
       (kernel_iter == kernels.end() ||
        !paddle::platform::is_xpu_support_op(type_, expected_kernel_key) ||
@@ -1470,17 +1471,36 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
 #endif
 #ifdef PADDLE_WITH_XPU_KP
-  bool use_xpu_kp_kernel_rt =
-      FLAGS_run_kp_kernel &&
-      paddle::platform::is_xpu_kp_support_op(type_, expected_kernel_key);
-  bool use_xpu_kp_kernel_debug =
-      paddle::platform::is_in_xpu_kpwhite_list(type_);
-  if (platform::is_xpu_place(expected_kernel_key.place_) &&
-      (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug)) {
-    expected_kernel_key.library_type_ = LibraryType::kKP;
-    kernel_iter = kernels.find(expected_kernel_key);
-    VLOG(3) << "using XPU KP kernel: " << type_
-            << ", using_kernel_key:" << expected_kernel_key;
+  if (paddle::platform::is_xpu_place(expected_kernel_key.place_)) {
+    bool use_xpu_kp_kernel_rt =
+        FLAGS_run_kp_kernel &&
+        paddle::platform::is_xpu_kp_support_op(type_, expected_kernel_key);
+    bool use_xpu_kp_kernel_debug =
+        paddle::platform::is_in_xpu_kpwhite_list(type_);
+    if (use_xpu_kp_kernel_rt) {
+      VLOG(3) << "xpu_kp using rt mode ";
+    }
+    if (use_xpu_kp_kernel_debug) {
+      VLOG(3) << "xpu_kp using debug mode ";
+    }
+    bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug);
+    if (is_xpu_kp_support) {
+      expected_kernel_key.library_type_ = LibraryType::kKP;
+      kernel_iter = kernels.find(expected_kernel_key);
+      VLOG(3) << "using XPU KP kernel: " << type_
+              << ", using_kernel_key:" << expected_kernel_key;
+    }
+    bool is_xpu_unsupport =
+        (!paddle::platform::is_xpu_support_op(type_, expected_kernel_key) ||
+         paddle::platform::is_in_xpu_black_list(type_));
+    if (!is_xpu_kp_support &&
+        (kernel_iter == kernels.end() || is_xpu_unsupport)) {
+      VLOG(3) << "missing XPU kernel: " << type_
+              << ", expected_kernel_key:" << expected_kernel_key
+              << ", fallbacking to CPU one!";
+      expected_kernel_key.place_ = platform::CPUPlace();
+      kernel_iter = kernels.find(expected_kernel_key);
+    }
   }
 #endif
......
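The rewritten block turns kernel choice into two explicit stages: prefer a kernel from the KP library when the op supports it, otherwise fall back to a CPU kernel if no usable XPU one is registered. A reduced sketch of the same lookup over a plain std::map, where the {place, library} string pair is a hypothetical stand-in for Paddle's OpKernelType key:

#include <iostream>
#include <map>
#include <string>
#include <utility>

// Hypothetical kernel key: a (place, library) pair instead of Paddle's
// OpKernelType; the lookup/fallback shape is what matters here.
using Key = std::pair<std::string, std::string>;

std::string Choose(const std::map<Key, std::string>& kernels,
                   bool kp_supported, bool xpu_unsupported) {
  Key key{"XPU", "PLAIN"};
  auto it = kernels.find(key);
  if (kp_supported) {  // stage 1: prefer the KP library
    key.second = "KP";
    it = kernels.find(key);
  }
  if (!kp_supported && (it == kernels.end() || xpu_unsupported)) {
    key = {"CPU", "PLAIN"};  // stage 2: fall back to a CPU kernel
    it = kernels.find(key);
  }
  return it == kernels.end() ? "<missing>" : it->second;
}

int main() {
  std::map<Key, std::string> kernels{{{"CPU", "PLAIN"}, "cpu_kernel"},
                                     {{"XPU", "KP"}, "xpu_kp_kernel"}};
  std::cout << Choose(kernels, /*kp=*/true, /*unsupported=*/false) << "\n";
  std::cout << Choose(kernels, /*kp=*/false, /*unsupported=*/true) << "\n";
}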
@@ -1224,8 +1224,12 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
   proto::VarType::TensorDesc desc;
   {  // int32_t size
      // proto buffer
-    int32_t size;
+    int32_t size = -1;
     is.read(reinterpret_cast<char*>(&size), sizeof(size));
+    PADDLE_ENFORCE_EQ(is.good(), true, platform::errors::Unavailable(
+                                           "Cannot read tensor desc size"));
+    PADDLE_ENFORCE_GE(size, 0, platform::errors::InvalidArgument(
+                                   "Tensor desc size should >= 0"));
     std::unique_ptr<char[]> buf(new char[size]);
     is.read(reinterpret_cast<char*>(buf.get()), size);
     PADDLE_ENFORCE_EQ(
......
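The TensorFromStream hunk initializes size to a sentinel and validates both the stream state and the value before new char[size] runs. The same hardening as a standalone function, with plain exceptions in place of the PADDLE_ENFORCE macros:

#include <cstdint>
#include <istream>
#include <memory>
#include <sstream>
#include <stdexcept>

// Reads a length-prefixed blob; refuses truncated streams and negative
// sizes so that `new char[size]` can never see a garbage length.
std::unique_ptr<char[]> ReadSizedBlob(std::istream& is) {
  int32_t size = -1;  // sentinel: stays -1 if the read fails
  is.read(reinterpret_cast<char*>(&size), sizeof(size));
  if (!is.good()) throw std::runtime_error("cannot read blob size");
  if (size < 0) throw std::runtime_error("blob size should be >= 0");
  std::unique_ptr<char[]> buf(new char[size]);
  is.read(buf.get(), size);
  if (!is.good()) throw std::runtime_error("cannot read blob body");
  return buf;
}

int main() {
  std::istringstream empty("");
  try {
    ReadSizedBlob(empty);
  } catch (const std::exception&) {
    // truncated stream rejected before any allocation happens
  }
}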
@@ -124,7 +124,7 @@ AmpOperators::AmpOperators()
       OpSupportedInfos("GPU", paddle::framework::proto::VarType::BF16));
   unsupported_bf16_ops_->insert(unsupported_ops_gpu_bf16.begin(),
                                 unsupported_ops_gpu_bf16.end());
-// NOTE: GPU/NPU/XPU is compiled seperatly.
+// NOTE: GPU/NPU/XPU/MLU is compiled seperatly.
 #elif defined(PADDLE_WITH_ASCEND_CL)
   auto unsupported_ops_npu_fp16 = std::get<2>(
       OpSupportedInfos("NPU", paddle::framework::proto::VarType::FP16));
@@ -143,6 +143,15 @@ AmpOperators::AmpOperators()
       OpSupportedInfos("XPU", paddle::framework::proto::VarType::BF16));
   unsupported_bf16_ops_->insert(unsupported_ops_xpu_bf16.begin(),
                                 unsupported_ops_xpu_bf16.end());
+#elif defined(PADDLE_WITH_MLU)
+  auto unsupported_ops_mlu_fp16 = std::get<2>(
+      OpSupportedInfos("MLU", paddle::framework::proto::VarType::FP16));
+  unsupported_fp16_ops_->insert(unsupported_ops_mlu_fp16.begin(),
+                                unsupported_ops_mlu_fp16.end());
+  auto unsupported_ops_mlu_bf16 = std::get<2>(
+      OpSupportedInfos("MLU", paddle::framework::proto::VarType::BF16));
+  unsupported_bf16_ops_->insert(unsupported_ops_mlu_bf16.begin(),
+                                unsupported_ops_mlu_bf16.end());
 #endif
   VLOG(4) << allow_ops_->size() << " " << block_ops_->size() << " "
           << unsupported_fp16_ops_->size() << " "
@@ -210,6 +219,7 @@ inline bool NeedCast(const std::shared_ptr<VarType>& var) {
   if (paddle::platform::is_gpu_place(place) ||
       paddle::platform::is_cuda_pinned_place(place) ||
       paddle::platform::is_xpu_place(place) ||
+      paddle::platform::is_mlu_place(place) ||
       paddle::platform::is_npu_place(place) ||
       paddle::platform::is_npu_pinned_place(place)) {
     // CudaPinndePlace is added for varbase created by dataloader
......
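Each #elif branch fills the same unsupported_fp16_ops_/unsupported_bf16_ops_ sets, so the rest of the AMP machinery never needs to know which device was compiled in. A small sketch of that merge-into-one-set idea; UnsupportedFp16For() and the op names it returns are hypothetical stand-ins for OpSupportedInfos():

#include <iostream>
#include <string>
#include <unordered_set>

// Hypothetical per-backend query; in the diff this role is played by
// OpSupportedInfos("MLU", VarType::FP16) and friends.
std::unordered_set<std::string> UnsupportedFp16For(const std::string& dev) {
  if (dev == "MLU") return {"some_op_a", "some_op_b"};
  return {};
}

int main() {
  std::unordered_set<std::string> unsupported_fp16_ops;
  // The #elif chain picks exactly one backend at compile time; here we
  // emulate the MLU branch at run time.
  auto mlu = UnsupportedFp16For("MLU");
  unsupported_fp16_ops.insert(mlu.begin(), mlu.end());
  // Downstream AMP code only ever consults the merged set.
  std::cout << unsupported_fp16_ops.count("some_op_a") << "\n";  // prints 1
}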
@@ -234,7 +234,7 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
   auto& kernels = kernels_iter->second;
   auto kernel_iter = kernels.find(expected_kernel_key);
 
-#ifdef PADDLE_WITH_XPU
+#if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP)
   if (paddle::platform::is_xpu_place(expected_kernel_key.place_) &&
       (kernel_iter == kernels.end() || is_xpu_unsupport)) {
     VLOG(3) << "missing XPU kernel: " << op.Type()
@@ -243,29 +243,36 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
     expected_kernel_key.place_ = platform::CPUPlace();
     kernel_iter = kernels.find(expected_kernel_key);
   }
 #endif
 #ifdef PADDLE_WITH_XPU_KP
-  expected_kernel_key.place_ = platform::XPUPlace();
-  bool use_xpu_kp_kernel_rt =
-      FLAGS_run_kp_kernel &&
-      paddle::platform::is_xpu_kp_support_op(op.Type(), expected_kernel_key);
-  bool use_xpu_kp_kernel_debug =
-      paddle::platform::is_in_xpu_kpwhite_list(op.Type());
-  if (use_xpu_kp_kernel_rt) {
-    VLOG(3) << "xpu_kp using rt mode ";
-  }
-  if (use_xpu_kp_kernel_debug) {
-    VLOG(3) << "xpu_kp using debug mode ";
-  }
-  if (paddle::platform::is_xpu_place(expected_kernel_key.place_) &&
-      (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug)) {
-    expected_kernel_key.place_ = platform::XPUPlace();
-    expected_kernel_key.library_type_ = paddle::framework::LibraryType::kKP;
-    kernel_iter = kernels.find(expected_kernel_key);
-    VLOG(3) << "using XPU KP kernel: " << op.Type()
-            << ", using_kernel_key:" << expected_kernel_key;
+  if (paddle::platform::is_xpu_place(expected_kernel_key.place_)) {
+    bool use_xpu_kp_kernel_rt =
+        FLAGS_run_kp_kernel &&
+        paddle::platform::is_xpu_kp_support_op(op.Type(), expected_kernel_key);
+    bool use_xpu_kp_kernel_debug =
+        paddle::platform::is_in_xpu_kpwhite_list(op.Type());
+    if (use_xpu_kp_kernel_rt) {
+      VLOG(3) << "xpu_kp using rt mode ";
+    }
+    if (use_xpu_kp_kernel_debug) {
+      VLOG(3) << "xpu_kp using debug mode ";
+    }
+    bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug);
+    if (is_xpu_kp_support) {
+      expected_kernel_key.library_type_ = paddle::framework::LibraryType::kKP;
+      kernel_iter = kernels.find(expected_kernel_key);
+      VLOG(3) << "using XPU KP kernel: " << op.Type()
+              << ", using_kernel_key:" << expected_kernel_key;
+    }
+    if (!is_xpu_kp_support &&
+        (kernel_iter == kernels.end() || is_xpu_unsupport)) {
+      VLOG(3) << "missing XPU kernel: " << op.Type()
+              << ", expected_kernel_key:" << expected_kernel_key
+              << ", fallbacking to CPU one!";
+      expected_kernel_key.place_ = platform::CPUPlace();
+      kernel_iter = kernels.find(expected_kernel_key);
+    }
   }
 #endif
......
@@ -341,7 +341,6 @@ void BuildDygraphPhiKernelContext(
   }
 
   for (size_t i = 0; i < attr_names.size(); ++i) {
-    VLOG(1) << "############## attr_name: " << i << " : " << attr_names[i];
     if (attr_defs[i].type_index == std::type_index(typeid(phi::ScalarArray))) {
       if (attrs.find(attr_names[i]) !=
           attrs.end()) {  // shape is in the attribute
......
@@ -1485,6 +1485,13 @@ REGISTER_ACTIVATION_OP(atanh, Atanh, AtanhFunctor, AtanhGradFunctor);
 REGISTER_ACTIVATION_OP(brelu, BRelu, BReluFunctor, BReluGradFunctor);
 REGISTER_ACTIVATION_OP(thresholded_relu, ThresholdedRelu,
                        ThresholdedReluFunctor, ThresholdedReluGradFunctor);
+REGISTER_ACTIVATION_OP(hard_shrink, HardShrink, HardShrinkFunctor,
+                       HardShrinkGradFunctor);
+REGISTER_ACTIVATION_OP(softshrink, SoftShrink, SoftShrinkFunctor,
+                       SoftShrinkGradFunctor);
+REGISTER_ACTIVATION_OP(tanh_shrink, TanhShrink, TanhShrinkFunctor,
+                       TanhShrinkGradFunctor);
+REGISTER_ACTIVATION_OP(silu, Silu, SiluFunctor, SiluGradFunctor);
 
 /* ========================== sigmoid register  =============================
  */
@@ -1626,22 +1633,6 @@ REGISTER_OPERATOR(
     ops::ActivationOpDoubleGrad<ops::ELUGradFunctor<float>::FwdDeps()>,
     ops::ActivationDoubleGradOpInplaceInferer);
 
-REGISTER_OP_CPU_KERNEL(elu,
-                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
-                                             ops::ELUFunctor<float>>,
-                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
-                                             ops::ELUFunctor<double>>);
-REGISTER_OP_CPU_KERNEL(
-    elu_grad, ops::ELUGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ELUGradKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    elu_grad_grad, ops::ELUDoubleGradKernel<plat::CPUDeviceContext,
-                                            ops::ELUGradGradFunctor<float>>,
-    ops::ELUDoubleGradKernel<plat::CPUDeviceContext,
-                             ops::ELUGradGradFunctor<double>>,
-    ops::ELUDoubleGradKernel<plat::CPUDeviceContext,
-                             ops::ELUGradGradFunctor<plat::float16>>);
-
 /* ========================================================================== */
 
 /* ========================  logit register  ============================
......
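Note that the removed REGISTER_OP_CPU_KERNEL(elu, ...) entries do not delete the operator: the matching activation_op.h hunks route ELU through USE_PHI_FUNCTOR instead. For reference, the math those functors implement, as a plain scalar sketch of the alpha >= 0 case, where the gradient can be expressed through the output:

#include <cassert>
#include <cmath>

// elu(x) = x                     for x > 0
//        = alpha * (exp(x) - 1)  for x <= 0
double Elu(double x, double alpha) {
  return x > 0 ? x : alpha * (std::exp(x) - 1.0);
}

// For alpha >= 0 the gradient is expressible in terms of the output:
// d elu / dx = 1            if out > 0
//            = out + alpha  if out <= 0   (equals alpha * exp(x))
double EluGrad(double out, double alpha) {
  return out > 0 ? 1.0 : out + alpha;
}

int main() {
  double a = 1.0, x = -0.5, out = Elu(x, a);
  // check against the direct derivative alpha * exp(x) for x <= 0
  assert(std::abs(EluGrad(out, a) - a * std::exp(x)) < 1e-12);
}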
@@ -279,6 +279,15 @@ USE_PHI_FUNCTOR(BRelu)
 USE_PHI_FUNCTOR(ThresholdedRelu)
 USE_PHI_FUNCTOR(LeakyRelu)
 USE_PHI_DOUBLE_GRAD_FUNCTOR(LeakyRelu)
+USE_PHI_FUNCTOR(HardShrink)
+USE_PHI_FUNCTOR(SoftShrink)
+USE_PHI_FUNCTOR(TanhShrink)
+USE_PHI_FUNCTOR(Silu)
+USE_PHI_FUNCTOR(ELU)
+USE_PHI_DOUBLE_GRAD_FUNCTOR(ELU)
+
+template <typename T>
+using ELUGradNegativeAlphaFunctor = phi::funcs::ELUGradNegativeAlphaFunctor<T>;
 
 template <typename T>
 struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
@@ -392,31 +401,6 @@ struct SigmoidTripleGradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
-// silu(x) = x / (1 + exp(-x))
-template <typename T>
-struct SiluFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    auto temp = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp());
-    out.device(d) = x * temp;
-  }
-};
-
-// silu'(x) = (1 / (1 + e^{-x}))  * (1 + out * e^{-x}))
-template <typename T>
-struct SiluGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    auto temp1 = static_cast<T>(1) + (-x).exp();  // 1+e^(-x)
-    auto temp2 = x * (-x).exp();                  // x*e^(-x)
-    dx.device(d) = dout * ((static_cast<T>(1) / temp1) *
-                           (static_cast<T>(1) + (temp2 / temp1)));
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
 // Originally: logsigmoid(x) = -log (1 + exp(-x))
 // For numerical stability, we can use the log-sum-exp trick:
 // https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/
@@ -512,99 +496,6 @@ using ReluGradGradFunctor = phi::funcs::ReluGradGradFunctor<T>;
 template <typename T>
 using ReluCUDAFunctor = phi::funcs::ReluCUDAFunctor<T>;
 
-// tanhshrink(x) = x - tanh(x)
-// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
-template <typename T>
-struct TanhShrinkFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x - x.tanh();
-  }
-};
-
-template <typename T>
-struct TanhShrinkGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * (x.tanh() * x.tanh());
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-// tanhshrink(x) = x - tanh(x)
-// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
-template <typename T>
-struct HardShrinkFunctor : public BaseActivationFunctor<T> {
-  float threshold;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    auto temp1 = x < static_cast<T>(threshold * -1.f);
-    auto temp2 = x > static_cast<T>(threshold);
-    out.device(d) = x * (temp1 || temp2).template cast<T>();
-  }
-};
-
-template <typename T>
-struct HardShrinkGradFunctor : public BaseActivationFunctor<T> {
-  float threshold;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    auto temp1 = x < static_cast<T>(threshold * -1.f);
-    auto temp2 = x > static_cast<T>(threshold);
-    dx.device(d) = dout * (temp1 || temp2).template cast<T>();
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-// softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0
-// otherwise
-template <typename T>
-struct SoftShrinkFunctor : public BaseActivationFunctor<T> {
-  float lambda;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"lambda", &lambda}};
-  }
-
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    auto lambdaT = static_cast<T>(lambda);
-    auto temp1 = (x > lambdaT).template cast<T>();
-    auto temp2 = (x < -lambdaT).template cast<T>();
-    out.device(d) = temp1 * (x - lambdaT) + temp2 * (x + lambdaT);
-  }
-};
-
-template <typename T>
-struct SoftShrinkGradFunctor : public BaseActivationFunctor<T> {
-  float lambda;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"lambda", &lambda}};
-  }
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    auto lambdaT = static_cast<T>(lambda);
-    auto temp1 = (x > lambdaT).template cast<T>();
-    auto temp2 = (x < -lambdaT).template cast<T>();
-    dx.device(d) = dout * (temp1 + temp2).template cast<T>();
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
 // sqrt(x) = x^(1/2)
 template <typename T>
 struct SqrtFunctor : public BaseActivationFunctor<T> {
@@ -1036,59 +927,6 @@ struct SoftReluGradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
-template <typename T>
-struct ELUFunctor : public BaseActivationFunctor<T> {
-  float alpha;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha}};
-  }
-
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) =
-        (x < static_cast<T>(0))
-            .select(static_cast<T>(alpha) * (x.exp() - static_cast<T>(1)), x);
-  }
-};
-
-template <typename T>
-struct ELUGradFunctor : public BaseActivationFunctor<T> {
-  float alpha;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha}};
-  }
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    // case 1: alpha >= 0
-    // dx = dout, if out > 0
-    // dx = dout * (out + alpha), if out <= 0
-    dx.device(d) = (out > static_cast<T>(0))
-                       .select(dout, dout * (out + static_cast<T>(alpha)));
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-template <typename T>
-struct ELUGradNegativeAlphaFunctor : public BaseActivationFunctor<T> {
-  float alpha;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha}};
-  }
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    // case 2: alpha < 0
-    // dx = dout, if x > 0
-    // dx = dout * (out + alpha), if x <=0
-    dx.device(d) = (x > static_cast<T>(0))
-                       .select(dout, dout * static_cast<T>(alpha) * x.exp());
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
 template <typename DeviceContext, typename T>
 class ELUGradKernel : public framework::OpKernel<T> {
  public:
@@ -1354,44 +1192,6 @@ struct AbsGradGradFunctor : public BaseActivationFunctor<T> {
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
-template <typename T>
-struct ELUGradGradFunctor : public BaseActivationFunctor<T> {
-  float alpha;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha}};
-  }
-  template <typename Device>
-  void operator()(const Device& dev, const framework::Tensor* X,
-                  const framework::Tensor* ddX, framework::Tensor* ddOut,
-                  const framework::Tensor* dOut, framework::Tensor* dX) const {
-    auto* d = dev.eigen_device();
-    auto ddx = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(ddX, "Input", "DDX", "ELUGradGrad"));
-    auto x = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(X, "Input", "X", "ELUGradGrad"));
-
-    if (dX) {
-      auto dx = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(dX, "Output", "DX", "ELUGradGrad"));
-      auto dout = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(dOut, "Output", "DOut", "ELUGradGrad"));
-      dx.device(*d) = ddx * dout * static_cast<T>(alpha) * x.exp() *
-                      (x <= static_cast<T>(0)).template cast<T>();
-    }
-
-    if (ddOut) {
-      auto ddout = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ELUGradGrad"));
-      ddout.device(*d) = ddx *
-                         ((x > static_cast<T>(0)).template cast<T>() +
-                          static_cast<T>(alpha) * x.exp() *
-                              (x <= static_cast<T>(0)).template cast<T>())
-                             .template cast<T>();
-    }
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
 template <typename T>
 struct CELUGradGradFunctor : public BaseActivationFunctor<T> {
   float alpha;
@@ -2151,26 +1951,22 @@ struct LogGradGradFunctor : public BaseActivationFunctor<T> {
 }  // namespace operators
 }  // namespace paddle
 
 #define FOR_EACH_ACTIVATION_OP(__macro)                                       \
-  __macro(silu, Silu, SiluFunctor, SiluGradFunctor);                          \
   __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor);  \
-  __macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor);  \
   __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor);                          \
   __macro(floor, Floor, FloorFunctor, ZeroGradFunctor);                       \
   __macro(round, Round, RoundFunctor, ZeroGradFunctor);                       \
   __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor);  \
   __macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor);                      \
   __macro(log2, Log2, Log2Functor, Log2GradFunctor);                          \
   __macro(log10, Log10, Log10Functor, Log10GradFunctor);                      \
   __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor);         \
   __macro(stanh, STanh, STanhFunctor, STanhGradFunctor);                      \
   __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor);          \
   __macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor);          \
   __macro(relu6, Relu6, Relu6Functor, Relu6GradFunctor);                      \
-  __macro(tanh_shrink, TanhShrink, TanhShrinkFunctor, TanhShrinkGradFunctor); \
-  __macro(hard_shrink, HardShrink, HardShrinkFunctor, HardShrinkGradFunctor); \
   __macro(hard_sigmoid, HardSigmoid, HardSigmoidFunctor,                      \
           HardSigmoidGradFunctor);                                            \
   __macro(swish, Swish, SwishFunctor, SwishGradFunctor);                      \
   __macro(mish, Mish, MishFunctor, MishGradFunctor);                          \
   __macro(hard_swish, HardSwish, HardSwishFunctor, HardSwishGradFunctor);
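The shrink-family functors deleted from this header now come from phi as well. Their definitions, for reference, as scalar functions:

#include <cmath>
#include <iostream>

// hardshrink(x) = 0 inside (-threshold, threshold), x outside
double HardShrink(double x, double threshold) {
  return (x > -threshold && x < threshold) ? 0.0 : x;
}

// softshrink(x) = x - lambda if x > lambda, x + lambda if x < -lambda, else 0
double SoftShrink(double x, double lambda) {
  if (x > lambda) return x - lambda;
  if (x < -lambda) return x + lambda;
  return 0.0;
}

// tanhshrink(x) = x - tanh(x), with gradient tanh(x)^2
double TanhShrink(double x) { return x - std::tanh(x); }

int main() {
  std::cout << HardShrink(0.3, 0.5) << " "   // 0
            << SoftShrink(1.0, 0.5) << " "   // 0.5
            << TanhShrink(1.0) << "\n";      // ~0.238
}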
@@ -44,35 +44,6 @@ struct CudaSigmoidGradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
-template <typename T>
-struct CudaSiluFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  MPType one = static_cast<MPType>(1.0f);
-
-  // silu(x) = x / (1 + exp(-x))
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(x / (one + exp(-x)));
-  }
-};
-
-template <typename T>
-struct CudaSiluGradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  MPType one = static_cast<MPType>(1.0f);
-
-  // dx = dout * (1 + exp(-x) + x * exp(-x) / (1 + exp(-x))^2)
-  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T arg_x) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType x = static_cast<MPType>(arg_x);
-    MPType temp = one / (one + exp(-x));
-    return static_cast<T>(dout * (temp * (one + x * (one - temp))));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
 template <typename T>
 struct CudaLogSigmoidFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
@@ -110,43 +81,6 @@ struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor<T> {
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
-template <typename T>
-struct CudaSoftShrinkFunctor : public BaseActivationFunctor<T> {
-  float lambda;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"lambda", &lambda}};
-  }
-
-  // softshrink(x) = x - lambda, if x > lambda;
-  //                 x + lambda, if x < -lambda;
-  //                 0, otherwise.
-  __device__ __forceinline__ T operator()(const T x) const {
-    T l = static_cast<T>(lambda);
-    T temp1 = static_cast<T>(x > l);
-    T temp2 = static_cast<T>(x < -l);
-    return temp1 * (x - l) + temp2 * (x + l);
-  }
-};
-
-template <typename T>
-struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  float lambda;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"lambda", &lambda}};
-  }
-
-  // dx = dout, if x > lambda or x < -lambda else 0
-  __device__ __forceinline__ T operator()(const T dout, const T x) const {
-    T l = static_cast<T>(lambda);
-    return (x >= -l && x <= l) ? zero : dout;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
 template <typename T>
 struct CudaCeilFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
@@ -615,66 +549,6 @@ struct CudaRelu6GradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
-template <typename T>
-struct CudaTanhShrinkFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-
-  // tanhshrink(x) = x - tanh(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(x - tanh(x));
-  }
-};
-
-template <typename T>
-struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-
-  // dx = dout * tanh(x)^2
-  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T arg_x) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(dout * tanh(x) * tanh(x));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-template <typename T>
-struct CudaHardShrinkFunctor : public BaseActivationFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  float threshold;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  // hadrshrink(x) = (x > -threshold && x < threshold) ? 0 : x
-  __device__ __forceinline__ T operator()(const T x) const {
-    T t = static_cast<T>(threshold);
-    return (x > -t && x < t) ? zero : x;
-  }
-};
-
-template <typename T>
-struct CudaHardShrinkGradFunctor : public BaseActivationFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  float threshold;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  // dx = (x > -threshold && x < threshold) ? 0 : dout
-  __device__ __forceinline__ T operator()(const T dout, const T x) const {
-    T t = static_cast<T>(threshold);
-    return (x > -t && x < t) ? zero : dout;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
 template <typename T>
 struct CudaHardSigmoidFunctor : public BaseActivationFunctor<T> {
   T zero = static_cast<T>(0.0f);
@@ -863,110 +737,6 @@ struct CudaHardSwishGradFunctor : public BaseActivationFunctor<T> {
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
-template <typename T>
-struct CudaELUFunctor : public BaseActivationFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
-  CT zero = static_cast<CT>(0.0f);
-  CT one = static_cast<CT>(1.0f);
-  float alpha;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha}};
-  }
-
-  // elu(x) = x, if x > 0
-  // elu(x) = alpha * (e^x - 1), if x <= 0
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    CT x = static_cast<CT>(arg_x);
-    CT temp = static_cast<CT>(alpha) * (exp(x) - one);
-    CT res = x > zero ? x : temp;
-    return static_cast<T>(res);
-  }
-};
-
-template <typename T>
-struct CudaELUGradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  MPType zero = static_cast<MPType>(0.0f);
-  float alpha;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha}};
-  }
-
-  // case 1: alpha >= 0
-  // dx = dout, if out > 0
-  // dx = dout * (out + alpha), if out <= 0
-  __device__ __forceinline__ T operator()(T arg_dout, T arg_out) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType out = static_cast<MPType>(arg_out);
-    MPType a = static_cast<MPType>(alpha);
-    MPType out_pos = static_cast<MPType>(out > zero);
-    MPType out_neg = static_cast<MPType>(out <= zero);
-    return static_cast<T>(dout * (out_pos + out_neg * (out + a)));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-
-template <typename T>
-struct CudaELUGradNegativeAlphaFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  MPType zero = static_cast<MPType>(0.0f);
-  float alpha;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha}};
-  }
-
-  // case 2: alpha < 0
-  // dx = dout, if x > 0
-  // dx = dout * (out + alpha), if x <=0
-  __device__ __forceinline__ T operator()(const T arg_dout, const T arg_out,
-                                          const T arg_x) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType out = static_cast<MPType>(arg_out);
-    MPType x = static_cast<MPType>(arg_x);
-    MPType a = static_cast<MPType>(alpha);
-    MPType x_pos = static_cast<MPType>(x > zero);
-    MPType x_neg = static_cast<MPType>(x <= zero);
-    return static_cast<T>(dout * (x_pos + x_neg * (out + a)));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-template <typename DeviceContext, typename T>
-class ELUGradCudaKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* out = ctx.Input<framework::Tensor>("Out");
-    auto* x = ctx.Input<framework::Tensor>("X");
-    auto* d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    d_x->mutable_data<T>(ctx.GetPlace());
-    const float alpha = ctx.Attr<float>("alpha");
-
-    auto& dev_ctx = ctx.device_context<DeviceContext>();
-    std::vector<const framework::Tensor*> ins = {d_out, out};
-    std::vector<framework::Tensor*> outs = {d_x};
-    if (alpha > 0) {
-      CudaELUGradFunctor<T> functor;
-      functor.alpha = alpha;
-      paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
-                                                                &outs, functor);
-    } else {
-      CudaELUGradNegativeAlphaFunctor<T> functor;
-      functor.alpha = alpha;
-      ins.push_back(x);
-      paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
-                                                                &outs, functor);
-    }
-  }
-};
-
 template <typename T>
 struct CudaCELUFunctor : public BaseActivationFunctor<T> {
   using CT = typename details::MPTypeTrait<T>::Type;
@@ -1099,6 +869,15 @@ USE_PHI_FUNCTOR(CudaTanh)
 USE_PHI_FUNCTOR(CudaBRelu)
 USE_PHI_FUNCTOR(CudaLeakyRelu)
 USE_PHI_FUNCTOR(CudaThresholdedRelu)
+USE_PHI_FUNCTOR(CudaHardShrink)
+USE_PHI_FUNCTOR(CudaSoftShrink)
+USE_PHI_FUNCTOR(CudaTanhShrink)
+USE_PHI_FUNCTOR(CudaSilu)
+USE_PHI_FUNCTOR(CudaELU)
+
+template <typename T>
+using CudaELUGradNegativeAlphaFunctor =
+    phi::funcs::CudaELUGradNegativeAlphaFunctor<T>;
 
 }  // namespace operators
 }  // namespace paddle
@@ -1158,26 +937,6 @@ namespace plat = paddle::platform;
       ops::ActivationGradCudaKernel<plat::CUDADeviceContext,                \
                                     ops::grad_functor<plat::bfloat16>>);
 
-/* ======================== elu register  ============================ */
-REGISTER_OP_CUDA_KERNEL(
-    elu, ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,
-                                   ops::CudaELUFunctor<float>>,
-    ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,
-                              ops::CudaELUFunctor<double>>,
-    ops::ActivationCudaKernel<plat::CUDADeviceContext,
-                              ops::CudaELUFunctor<plat::float16>>);
-REGISTER_OP_CUDA_KERNEL(
-    elu_grad, ops::ELUGradCudaKernel<plat::CUDADeviceContext, float>,
-    ops::ELUGradCudaKernel<plat::CUDADeviceContext, double>,
-    ops::ELUGradCudaKernel<plat::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    elu_grad_grad, ops::ELUDoubleGradKernel<plat::CUDADeviceContext,
-                                            ops::ELUGradGradFunctor<float>>,
-    ops::ELUDoubleGradKernel<plat::CUDADeviceContext,
-                             ops::ELUGradGradFunctor<double>>,
-    ops::ELUDoubleGradKernel<plat::CUDADeviceContext,
-                             ops::ELUGradGradFunctor<plat::float16>>);
-
 /* ========================================================================== */
 
 /* ========================  celu register  ============================ */
@@ -1359,7 +1118,6 @@ REGISTER_OP_CUDA_KERNEL(
 /* ========================================================================== */
 
 #define FOR_EACH_ACTIVATION_CUDA_OP(__macro)                                  \
-  __macro(silu, Silu, CudaSiluFunctor, CudaSiluGradFunctor);                  \
   __macro(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor,                      \
           CudaLogSigmoidGradFunctor);                                         \
   __macro(softshrink, SoftShrink, CudaSoftShrinkFunctor,                      \
......
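One pattern shared by all the deleted CUDA functors is worth noting: arithmetic runs in MPTypeTrait<T>::Type (float when T is half precision) and values are widened and narrowed exactly once. A host-side sketch of that trait, with a toy Half type standing in for platform::float16:

#include <cmath>
#include <iostream>

struct Half {  // toy stand-in for platform::float16
  float v;
  Half(float f = 0.f) : v(f) {}
  explicit operator float() const { return v; }
};

// Mirrors details::MPTypeTrait: low-precision types compute in float.
template <typename T> struct MPTypeTrait { using Type = T; };
template <> struct MPTypeTrait<Half> { using Type = float; };

template <typename T>
struct SiluFunctor {
  using MPType = typename MPTypeTrait<T>::Type;
  T operator()(T arg_x) const {
    MPType x = static_cast<MPType>(arg_x);      // widen once
    MPType y = x / (MPType(1) + std::exp(-x));  // silu in full precision
    return static_cast<T>(y);                   // narrow once
  }
};

int main() {
  SiluFunctor<Half> silu;
  std::cout << static_cast<float>(silu(Half(1.0f))) << "\n";  // ~0.731
}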
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/batch_norm_op.h"
+#include "paddle/fluid/operators/amp/fp16_type_traits.h"
 #include "paddle/fluid/operators/mlu/mlu_baseop.h"
 
 namespace paddle {
@@ -20,6 +21,8 @@ namespace operators {
 
 template <typename T>
 class MLUBatchNormOpKernel : public framework::OpKernel<T> {
+  using MPDType = typename details::MPTypeTrait<T>::Type;
+
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
     const auto &place = ctx.GetPlace();
@@ -68,10 +71,10 @@ class MLUBatchNormOpKernel : public framework::OpKernel<T> {
 
     // alloc memory
     y->mutable_data<T>(place);
-    mean_out->mutable_data<T>(place);
-    variance_out->mutable_data<T>(place);
-    saved_mean->mutable_data<T>(place);
-    saved_variance->mutable_data<T>(place);
+    mean_out->mutable_data<MPDType>(place);
+    variance_out->mutable_data<MPDType>(place);
+    saved_mean->mutable_data<MPDType>(place);
+    saved_variance->mutable_data<MPDType>(place);
 
     Tensor transformed_x;
     Tensor transformed_y;
@@ -132,6 +135,8 @@ class MLUBatchNormOpKernel : public framework::OpKernel<T> {
 
 template <typename T>
 class MLUBatchNormGradOpKernel : public framework::OpKernel<T> {
+  using MPDType = typename details::MPTypeTrait<T>::Type;
+
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
     const auto *x = ctx.Input<Tensor>("X");
@@ -154,10 +159,10 @@ class MLUBatchNormGradOpKernel : public framework::OpKernel<T> {
     auto &dev_ctx = ctx.template device_context<MLUDeviceContext>();
     auto d_x_tmp =
         ctx.AllocateTmpTensor<T, MLUDeviceContext>(x->dims(), dev_ctx);
-    auto scale_grad_tmp =
-        ctx.AllocateTmpTensor<T, MLUDeviceContext>(scale->dims(), dev_ctx);
+    auto scale_grad_tmp = ctx.AllocateTmpTensor<MPDType, MLUDeviceContext>(
+        scale->dims(), dev_ctx);
     auto bias_grad_tmp =
-        ctx.AllocateTmpTensor<T, MLUDeviceContext>(bias->dims(), dev_ctx);
+        ctx.AllocateTmpTensor<MPDType, MLUDeviceContext>(bias->dims(), dev_ctx);
 
     if (d_x == nullptr) {
       d_x = &d_x_tmp;
@@ -171,8 +176,8 @@ class MLUBatchNormGradOpKernel : public framework::OpKernel<T> {
     const auto &place = ctx.GetPlace();
-    d_scale->mutable_data<T>(place);
-    d_bias->mutable_data<T>(place);
+    d_scale->mutable_data<MPDType>(place);
+    d_bias->mutable_data<MPDType>(place);
     use_global_stats = is_test || use_global_stats;
......
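The MLU batch-norm change applies the same idea to storage: with T = float16, the mean/variance buffers are allocated as MPDType (float) so running statistics are not rounded to half precision. A crude illustration of why that matters, using a rounding function that keeps roughly fp16's 11 significand bits (an approximation, not real float16 arithmetic):

#include <cmath>
#include <iostream>

// Crude stand-in for fp16 rounding: keep ~11 significand bits.
float ToHalfPrecision(float x) {
  int e;
  float m = std::frexp(x, &e);
  m = std::round(m * 2048.0f) / 2048.0f;
  return std::ldexp(m, e);
}

int main() {
  // Running mean of 10000 samples, all equal to 1.0001f.
  float mean_fp32 = 0.f, mean_fp16 = 0.f;
  for (int n = 1; n <= 10000; ++n) {
    mean_fp32 += (1.0001f - mean_fp32) / n;
    mean_fp16 = ToHalfPrecision(mean_fp16 + (1.0001f - mean_fp16) / n);
  }
  // The half-rounded accumulator loses the small updates near 1.0,
  // which is why the stats buffers use mutable_data<MPDType>.
  std::cout << mean_fp32 << " vs " << mean_fp16 << "\n";
}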
@@ -12,8 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -21,14 +23,6 @@ namespace operators {
 class CumprodOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Cumprod");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Cumprod");
-    ctx->ShareDim("X", "Out");
-    ctx->ShareLoD("X", "Out");
-  }
 };
 
 class CumprodOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -82,9 +76,12 @@ class CumprodGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(cumprod, CumprodInferShapeFunctor,
+                            PD_INFER_META(phi::UnchangedInferMeta));
 
 REGISTER_OPERATOR(cumprod, ops::CumprodOp, ops::CumprodOpMaker,
                   ops::CumprodGradOpMaker<paddle::framework::OpDesc>,
-                  ops::CumprodGradOpMaker<paddle::imperative::OpBase>);
+                  ops::CumprodGradOpMaker<paddle::imperative::OpBase>,
+                  CumprodInferShapeFunctor);
 
 REGISTER_OPERATOR(cumprod_grad, ops::CumprodGradOp);
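The pattern here, repeated for gather below, replaces a hand-written InferShape override with a functor generated from a phi InferMeta function, so one shape rule serves both the static graph and the dygraph path. The underlying contract is just a free function over a meta-tensor interface; a minimal sketch with a hypothetical simplified MetaTensor:

#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical simplified MetaTensor: just dims, no data.
struct MetaTensor {
  std::vector<int64_t> dims;
};

// The phi-style contract: a free function from input meta to output meta,
// callable without an InferShapeContext.
void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out) {
  out->dims = x.dims;  // cumprod keeps the input shape
}

int main() {
  MetaTensor x{{4, 5}}, out;
  UnchangedInferMeta(x, &out);
  assert(out.dims == x.dims);
}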
@@ -15,9 +15,14 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <vector>
+
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/phi/core/ddim.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/backward.h"
+#include "paddle/phi/infermeta/binary.h"
 
 namespace paddle {
 namespace operators {
@@ -26,58 +31,6 @@ class GatherOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      platform::errors::InvalidArgument(
-                          "Input(X) of GatherOp should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true,
-                      platform::errors::InvalidArgument(
-                          "Input(Index) of GatherOp should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      platform::errors::InvalidArgument(
-                          "Output(Out) of GatherOp should not be null."));
-
-    auto index_dims = ctx->GetInputDim("Index");
-
-    if (index_dims.size() == 2) {
-      PADDLE_ENFORCE_EQ(
-          index_dims[1], 1,
-          platform::errors::InvalidArgument(
-              "The last dim of index should be 1 when it is 2D, but we get %d",
-              index_dims[1]));
-    } else {
-      PADDLE_ENFORCE_EQ(
-          index_dims.size(), 1,
-          platform::errors::InvalidArgument(
-              "The index should be 1D, when it is not 2D, but we get %d",
-              index_dims.size()));
-    }
-
-    auto axis = ctx->Attrs().Get<int>("axis");
-    auto input_dim = ctx->GetInputDim("X");
-    if (ctx->HasInput("Axis") || axis == 0) {
-      // if HasInput("Axis"), we can not obtain correct shape of output
-      int batch_size = index_dims[0];
-      framework::DDim output_dims(input_dim);
-      output_dims[0] = batch_size;
-      ctx->SetOutputDim("Out", output_dims);
-      ctx->ShareLoD("X", /*->*/ "Out");
-    } else {
-      int index_size = index_dims[0];
-      std::vector<int> out_dim_vec;
-      for (int i = 0; i < axis; i++) {
-        out_dim_vec.push_back(input_dim[i]);
-      }
-      out_dim_vec.push_back(index_size);
-      for (int i = axis + 1; i < input_dim.size(); i++) {
-        out_dim_vec.push_back(input_dim[i]);
-      }
-      auto output_dims = phi::make_ddim(out_dim_vec);
-      ctx->SetOutputDim("Out", output_dims);
-      ctx->ShareLoD("X", /*->*/ "Out");
-    }
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -100,11 +53,6 @@ class GatherGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", /*-->*/ framework::GradVarName("X"));
-  }
-
 protected:
   framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
@@ -193,11 +141,17 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(GatherGradNoNeedBufferVarInferer, "X");
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(gather, GatherInferShapeFunctor,
+                            PD_INFER_META(phi::GatherInferMeta));
 REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker,
                   ops::GatherGradOpMaker<paddle::framework::OpDesc>,
-                  ops::GatherGradOpMaker<paddle::imperative::OpBase>);
+                  ops::GatherGradOpMaker<paddle::imperative::OpBase>,
+                  GatherInferShapeFunctor);
+DECLARE_INFER_SHAPE_FUNCTOR(gather_grad, GatherGradInferShapeFunctor,
+                            PD_INFER_META(phi::GeneralUnaryGradInferMeta));
 REGISTER_OPERATOR(gather_grad, ops::GatherGradOp,
-                  ops::GatherGradNoNeedBufferVarInferer);
+                  ops::GatherGradNoNeedBufferVarInferer,
+                  GatherGradInferShapeFunctor);
 REGISTER_OP_VERSION(gather)
     .AddCheckpoint(R"ROC(upgrad gather, add a new input [Axis])ROC",
......
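GatherOp's shape rule itself now lives in phi::GatherInferMeta, but it is worth spelling out: along the chosen axis, the dimension of X is replaced by the index length. A standalone version of the computation from the removed else-branch:

#include <cassert>
#include <cstdint>
#include <vector>

// out dims = x dims with dimension `axis` replaced by index_size,
// matching the else-branch of the removed GatherOp::InferShape.
std::vector<int64_t> GatherOutDims(const std::vector<int64_t>& x_dims,
                                   int64_t index_size, int axis) {
  std::vector<int64_t> out;
  for (int i = 0; i < axis; ++i) out.push_back(x_dims[i]);
  out.push_back(index_size);
  for (size_t i = axis + 1; i < x_dims.size(); ++i) out.push_back(x_dims[i]);
  return out;
}

int main() {
  // gather along axis 1 with 7 indices: [2, 5, 3] -> [2, 7, 3]
  assert((GatherOutDims({2, 5, 3}, 7, 1) == std::vector<int64_t>{2, 7, 3}));
}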
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/grid_sampler_op.h"
 #include <memory>
 #include <string>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
@@ -229,15 +229,6 @@ REGISTER_OPERATOR(grid_sampler, ops::GridSampleOp, ops::GridSampleOpMaker,
                   ops::GridSampleGradMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(grid_sampler_grad, ops::GridSampleOpGrad);
 
-REGISTER_OP_CPU_KERNEL(
-    grid_sampler,
-    ops::GridSampleOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GridSampleOpKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    grid_sampler_grad,
-    ops::GridSampleGradOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GridSampleGradOpKernel<paddle::platform::CPUDeviceContext, double>);
-
 REGISTER_OP_VERSION(grid_sampler)
     .AddCheckpoint(
         R"ROC(
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/grid_sampler_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
static __forceinline__ __device__ bool in_bounds(int h, int w, int H, int W) {
return h >= 0 && h < H && w >= 0 && w < W;
}
template <typename T>
static __forceinline__ __device__ void atomic_add(T* data, int h, int w, int sH,
int sW, int H, int W,
T delta) {
if (in_bounds(h, w, H, W)) {
platform::CudaAtomicAdd(data + h * sH + w * sW, delta);
}
}
template <typename T>
static __forceinline__ __device__ T _unnormalize(T coord, int size,
bool align_corners) {
if (align_corners) {
return ((coord + 1.f) / 2) * (size - 1);
} else {
return ((coord + 1.f) * size - 1) / 2;
}
}
template <typename T>
static __forceinline__ __device__ T clip_indexes(T in, int max_value) {
return min(static_cast<T>(max_value), max(in, static_cast<T>(0)));
}
template <typename T>
static __forceinline__ __device__ T reflect_indexes(T in, int twice_low,
int twice_high) {
if (twice_low == twice_high) {
return static_cast<T>(0);
}
T min = static_cast<T>(twice_low) / 2;
T span = static_cast<T>(twice_high - twice_low) / 2;
in = fabs(in - min);
T extra = fmod(in, span);
int flips = static_cast<int>(floor(in / span));
if (flips % 2 == 0) {
return extra + min;
} else {
return span - extra + min;
}
}
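reflect_indexes folds an out-of-range coordinate back into [min, min + span] by counting boundary crossings: an even flip count keeps the offset, an odd one mirrors it. A worked example under align_corners=false with size = 4 (so twice_low = -1, twice_high = 7, hence min = -0.5 and span = 4): an input of 4.2 gives in = |4.2 + 0.5| = 4.7, extra = 0.7, flips = 1, and the odd branch returns 4 - 0.7 - 0.5 = 2.8, which is 4.2 mirrored about the outer edge at 3.5.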
template <typename T>
static __forceinline__ __device__ T compute_positions(T coord, int size,
PaddingMode padding_mode,
bool align_corners) {
coord = _unnormalize<T>(coord, size, align_corners);
if (padding_mode == PaddingMode::border) {
coord = clip_indexes(coord, size - 1);
} else if (padding_mode == PaddingMode::reflect) {
if (align_corners) {
coord = reflect_indexes(coord, 0, 2 * (size - 1));
} else {
coord = reflect_indexes(coord, -1, 2 * size - 1);
}
coord = clip_indexes(coord, size - 1);
}
return coord;
}
template <typename T>
static __forceinline__ __device__ T _unnormalize_with_mask(T coord, int size,
bool align_corners,
T* grad_in) {
if (align_corners) {
*grad_in = static_cast<T>(size - 1) / 2;
return ((coord + 1.f) / 2) * (size - 1);
} else {
*grad_in = static_cast<T>(size) / 2;
return ((coord + 1.f) * size - 1) / 2;
}
}
template <typename T>
static __forceinline__ __device__ T clip_indexes_with_mask(T in, int clip_limit,
T* grad_in) {
if (in <= static_cast<T>(0)) {
*grad_in = static_cast<T>(0);
return static_cast<T>(0);
} else {
T max = static_cast<T>(clip_limit - 1);
if (in >= max) {
*grad_in = static_cast<T>(0);
return max;
} else {
*grad_in = static_cast<T>(1);
return in;
}
}
}
template <typename T>
static __forceinline__ __device__ T
reflect_indexes_with_mask(T in, int twice_low, int twice_high, T* grad_in) {
if (twice_low == twice_high) {
*grad_in = static_cast<T>(0);
return static_cast<T>(0);
}
int grad_in_mult_;
T min = static_cast<T>(twice_low) / 2;
T span = static_cast<T>(twice_high - twice_low) / 2;
in = in - min;
if (in < static_cast<T>(0)) {
grad_in_mult_ = -1;
in = -in;
} else {
grad_in_mult_ = 1;
}
T extra = fmod(in, span);
int flips = static_cast<int>(floor(in / span));
if (flips % 2 == 0) {
*grad_in = static_cast<T>(grad_in_mult_);
return extra + min;
} else {
*grad_in = static_cast<T>(-grad_in_mult_);
return span - extra + min;
}
}
template <typename T>
static __forceinline__ __device__ T
compute_positions_with_mask(T coord, int size, PaddingMode padding_mode,
bool align_corners, T* grad_in) {
T grad_clip, grad_refl;
coord = _unnormalize_with_mask<T>(coord, size, align_corners, grad_in);
if (padding_mode == PaddingMode::border) {
coord = clip_indexes_with_mask(coord, size, &grad_clip);
*grad_in = (*grad_in) * grad_clip;
} else if (padding_mode == PaddingMode::reflect) {
if (align_corners) {
coord = reflect_indexes_with_mask(coord, 0, 2 * (size - 1), &grad_refl);
} else {
coord = reflect_indexes_with_mask(coord, -1, 2 * size - 1, &grad_refl);
}
coord = clip_indexes_with_mask(coord, size, &grad_clip);
*grad_in = (*grad_in) * grad_refl * grad_clip;
}
return coord;
}
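Read together, the _with_mask helpers accumulate in grad_in the chain-rule factor d(final position)/d(normalized grid value): an unnormalize term of (size - 1)/2 or size/2, a reflection sign of +/-1, and a clip mask that becomes 0 exactly when the coordinate saturates at a border; with zeros padding neither branch runs, so only the unnormalize term survives. The backward kernel below consumes this factor as gix_mult and giy_mult.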
template <typename T>
__global__ void grid_sample_cuda_kernel(const int nthreads, int n, int out_c,
int out_h, int out_w, int in_h,
int in_w, const T* input, const T* grid,
T* output, const Mode mode,
const PaddingMode padding_mode,
bool align_corners) {
int inp_sN = out_c * in_h * in_w;
int inp_sC = in_h * in_w;
int inp_sH = in_w;
int inp_sW = 1;
int grid_sN = out_h * out_w * 2;
int grid_sH = out_w * 2;
int grid_sW = 2;
int grid_sCoor = 1;
int out_sN = out_c * out_h * out_w;
int out_sC = out_h * out_w;
int out_sH = out_w;
int out_sW = 1;
CUDA_KERNEL_LOOP(index, nthreads) {
const int w = index % out_w;
const int h = (index / out_w) % out_h;
const int n = index / (out_h * out_w);
const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
T ix = grid[grid_offset];
T iy = grid[grid_offset + grid_sCoor];
ix = compute_positions(ix, in_w, padding_mode, align_corners);
iy = compute_positions(iy, in_h, padding_mode, align_corners);
if (mode == Mode::bilinear) {
int ix_nw = static_cast<int>(floor(ix));
int iy_nw = static_cast<int>(floor(iy));
int ix_ne = ix_nw + 1;
int iy_ne = iy_nw;
int ix_sw = ix_nw;
int iy_sw = iy_nw + 1;
int ix_se = ix_nw + 1;
int iy_se = iy_nw + 1;
T nw = (ix_se - ix) * (iy_se - iy);
T ne = (ix - ix_sw) * (iy_sw - iy);
T sw = (ix_ne - ix) * (iy - iy_ne);
T se = (ix - ix_nw) * (iy - iy_nw);
auto inp_offset_NC = n * inp_sN;
auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
for (int c = 0; c < out_c;
++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) {
*out_ptr_NCHW = static_cast<T>(0);
if (in_bounds(iy_nw, ix_nw, in_h, in_w)) {
*out_ptr_NCHW +=
input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW] * nw;
}
if (in_bounds(iy_ne, ix_ne, in_h, in_w)) {
*out_ptr_NCHW +=
input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW] * ne;
}
if (in_bounds(iy_sw, ix_sw, in_h, in_w)) {
*out_ptr_NCHW +=
input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW] * sw;
}
if (in_bounds(iy_se, ix_se, in_h, in_w)) {
*out_ptr_NCHW +=
input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW] * se;
}
}
} else if (mode == Mode::nearest) {
int ix_nearest = static_cast<int>(std::nearbyint(ix));
int iy_nearest = static_cast<int>(std::nearbyint(iy));
auto inp_offset_NC = n * inp_sN;
auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
for (int c = 0; c < out_c;
++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) {
if (in_bounds(iy_nearest, ix_nearest, in_h, in_w)) {
*out_ptr_NCHW =
input[inp_offset_NC + iy_nearest * inp_sH + ix_nearest * inp_sW];
} else {
*out_ptr_NCHW = static_cast<T>(0);
}
}
}
}
}
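The bilinear branch is the standard four-corner weighting: each corner's weight is the area of the rectangle between the sample point and the opposite corner. A scalar host-side rendition (hypothetical standalone C++, single channel, zeros padding) that should agree with the kernel for any sample position:

#include <cmath>

// Sample one in_h x in_w channel at continuous position (ix, iy),
// treating out-of-bounds taps as zero (PaddingMode::zeros).
float bilinear_at(const float* img, int in_h, int in_w, float ix, float iy) {
  int x0 = static_cast<int>(std::floor(ix));
  int y0 = static_cast<int>(std::floor(iy));
  float out = 0.f;
  for (int dy = 0; dy <= 1; ++dy) {
    for (int dx = 0; dx <= 1; ++dx) {
      int x = x0 + dx, y = y0 + dy;
      // Corner weight: product of (1 - distance) along each axis; for the
      // north-west corner this is (ix_se - ix) * (iy_se - iy), i.e. nw above.
      float w = (1.f - std::fabs(ix - static_cast<float>(x))) *
                (1.f - std::fabs(iy - static_cast<float>(y)));
      if (x >= 0 && x < in_w && y >= 0 && y < in_h) {
        out += img[y * in_w + x] * w;
      }
    }
  }
  return out;
}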
template <typename T>
class GridSampleOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.cuda_device_context();
auto align_corners = ctx.Attr<bool>("align_corners");
auto padding_mode_s = ctx.Attr<std::string>("padding_mode");
auto mode_s = ctx.Attr<std::string>("mode");
PaddingMode padding_mode;
Mode mode;
if (padding_mode_s == "border") {
padding_mode = PaddingMode::border;
} else if (padding_mode_s == "reflection") {
padding_mode = PaddingMode::reflect;
} else {
padding_mode = PaddingMode::zeros;
}
if (mode_s == "nearest") {
mode = Mode::nearest;
} else {
mode = Mode::bilinear;
}
auto* input = ctx.Input<Tensor>("X");
auto* grid = ctx.Input<Tensor>("Grid");
const int n = grid->dims()[0];
const int out_h = grid->dims()[1];
const int out_w = grid->dims()[2];
const int c = input->dims()[1];
const int in_h = input->dims()[2];
const int in_w = input->dims()[3];
VLOG(3) << "n: " << n << "; c: " << c << "; out_h: " << out_h
<< "; out_w: " << out_w;
auto* output = ctx.Output<Tensor>("Output");
auto* output_data = output->mutable_data<T>(ctx.GetPlace());
VLOG(3) << "out dims: " << output->dims()[0] << "; " << output->dims()[1]
<< "; " << output->dims()[2] << "; " << output->dims()[3];
int count = static_cast<int>(n * out_h * out_w);
auto cu_stream = dev_ctx.stream();
platform::GpuLaunchConfig config =
platform::GetGpuLaunchConfig1D(dev_ctx, count);
grid_sample_cuda_kernel<
T><<<config.block_per_grid, config.thread_per_block, 0, cu_stream>>>(
count, n, c, out_h, out_w, in_h, in_w, input->data<T>(),
grid->data<T>(), output_data, mode, padding_mode, align_corners);
}
};
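A reading note on the launch: count = n * out_h * out_w is one thread per output spatial location, not per element; each thread loops over out_c inside the kernel, and GetGpuLaunchConfig1D splits count into the grid/block pair used in the launch.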
template <typename T>
__global__ void grid_sampler_cuda_backward_kernel(
const int nthreads, const T* grad_output, const T* input, const T* grid,
int n, int out_c, int out_h, int out_w, int in_h, int in_w, T* grad_input,
T* grad_grid, const Mode mode, const PaddingMode padding_mode,
bool align_corners) {
int inp_sN = out_c * in_h * in_w;
int inp_sC = in_h * in_w;
int inp_sH = in_w;
int inp_sW = 1;
int grid_sN = out_h * out_w * 2;
int grid_sH = out_w * 2;
int grid_sW = 2;
int grid_sCoor = 1;
int gOut_sN = out_c * out_h * out_w;
int gOut_sC = out_h * out_w;
int gOut_sH = out_w;
int gOut_sW = 1;
CUDA_KERNEL_LOOP(index, nthreads) {
const int w = index % out_w;
const int h = (index / out_w) % out_h;
const int n = index / (out_h * out_w);
const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
T ix = grid[grid_offset];
T iy = grid[grid_offset + grid_sCoor];
T gix_mult, giy_mult;
ix = compute_positions_with_mask(ix, in_w, padding_mode, align_corners,
&gix_mult);
iy = compute_positions_with_mask(iy, in_h, padding_mode, align_corners,
&giy_mult);
if (mode == Mode::bilinear) {
int ix_nw = static_cast<int>(floor(ix));
int iy_nw = static_cast<int>(floor(iy));
int ix_ne = ix_nw + 1;
int iy_ne = iy_nw;
int ix_sw = ix_nw;
int iy_sw = iy_nw + 1;
int ix_se = ix_nw + 1;
int iy_se = iy_nw + 1;
T nw = (ix_se - ix) * (iy_se - iy);
T ne = (ix - ix_sw) * (iy_sw - iy);
T sw = (ix_ne - ix) * (iy - iy_ne);
T se = (ix - ix_nw) * (iy - iy_nw);
T gix = static_cast<T>(0), giy = static_cast<T>(0);
int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW;
T* gInp_ptr_NC = grad_input + n * inp_sN;
int inp_offset_NC = n * inp_sN;
for (int c = 0; c < out_c; ++c, inp_offset_NC += inp_sC,
gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) {
T gOut = grad_output[gOut_offset];
atomic_add(gInp_ptr_NC, iy_nw, ix_nw, inp_sH, inp_sW, in_h, in_w,
nw * gOut);
atomic_add(gInp_ptr_NC, iy_ne, ix_ne, inp_sH, inp_sW, in_h, in_w,
ne * gOut);
atomic_add(gInp_ptr_NC, iy_sw, ix_sw, inp_sH, inp_sW, in_h, in_w,
sw * gOut);
atomic_add(gInp_ptr_NC, iy_se, ix_se, inp_sH, inp_sW, in_h, in_w,
se * gOut);
if (in_bounds(iy_nw, ix_nw, in_h, in_w)) {
T nw_val = input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW];
gix -= nw_val * (iy_se - iy) * gOut;
giy -= nw_val * (ix_se - ix) * gOut;
}
if (in_bounds(iy_ne, ix_ne, in_h, in_w)) {
T ne_val = input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW];
gix += ne_val * (iy_sw - iy) * gOut;
giy -= ne_val * (ix - ix_sw) * gOut;
}
if (in_bounds(iy_sw, ix_sw, in_h, in_w)) {
T sw_val = input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW];
gix -= sw_val * (iy - iy_ne) * gOut;
giy += sw_val * (ix_ne - ix) * gOut;
}
if (in_bounds(iy_se, ix_se, in_h, in_w)) {
T se_val = input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW];
gix += se_val * (iy - iy_nw) * gOut;
giy += se_val * (ix - ix_nw) * gOut;
}
}
if (grad_grid != nullptr) {
T* gGrid_ptr_NHW = grad_grid + index * grid_sW;
gGrid_ptr_NHW[0] = gix_mult * gix;
gGrid_ptr_NHW[1] = giy_mult * giy;
}
} else if (mode == Mode::nearest) {
int ix_nearest = static_cast<int>(std::nearbyint(ix));
int iy_nearest = static_cast<int>(std::nearbyint(iy));
int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW;
T* gInp_ptr_NC = grad_input + n * inp_sN;
for (int c = 0; c < out_c;
++c, gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) {
atomic_add(gInp_ptr_NC, iy_nearest, ix_nearest, inp_sH, inp_sW, in_h,
in_w, grad_output[gOut_offset]);
}
if (grad_grid != nullptr) {
T* gGrid_ptr_NHW = grad_grid + index * grid_sW;
gGrid_ptr_NHW[0] = static_cast<T>(0);
gGrid_ptr_NHW[1] = static_cast<T>(0);
}
}
}
}
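The gix/giy accumulation above is the exact derivative of the four forward weights: nw = (ix_se - ix) * (iy_se - iy) has d(nw)/d(ix) = -(iy_se - iy), which is the line gix -= nw_val * (iy_se - iy) * gOut, and the other corners follow with their own signs; gix_mult and giy_mult then apply the chain-rule factor from compute_positions_with_mask before the result is written to grad_grid. In nearest mode the sampled value is piecewise constant in the grid coordinates, so a zero grad_grid is the correct almost-everywhere derivative.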
template <typename T>
class GridSampleGradOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.cuda_device_context();
auto align_corners = ctx.Attr<bool>("align_corners");
auto padding_mode_s = ctx.Attr<std::string>("padding_mode");
auto mode_s = ctx.Attr<std::string>("mode");
PaddingMode padding_mode;
Mode mode;
if (padding_mode_s == "border") {
padding_mode = PaddingMode::border;
} else if (padding_mode_s == "reflection") {
padding_mode = PaddingMode::reflect;
} else {
padding_mode = PaddingMode::zeros;
}
if (mode_s == "nearest") {
mode = Mode::nearest;
} else {
mode = Mode::bilinear;
}
auto* input = ctx.Input<Tensor>("X");
auto* grid = ctx.Input<Tensor>("Grid");
auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
const int n = grid->dims()[0];
const int out_h = grid->dims()[1];
const int out_w = grid->dims()[2];
const int c = input->dims()[1];
const int in_h = input->dims()[2];
const int in_w = input->dims()[3];
auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
input_grad->mutable_data<T>(ctx.GetPlace());
phi::funcs::SetConstant<paddle::platform::CUDADeviceContext, T>()(
ctx.template device_context<paddle::platform::CUDADeviceContext>(),
input_grad, static_cast<T>(0));
T* grid_grad_data = nullptr;
if (ctx.HasOutput(framework::GradVarName("Grid"))) {
auto* grid_grad = ctx.Output<Tensor>(framework::GradVarName("Grid"));
grid_grad_data = grid_grad->mutable_data<T>(ctx.GetPlace());
}
int count = static_cast<int>(n * out_h * out_w);
auto cu_stream = dev_ctx.stream();
platform::GpuLaunchConfig config =
platform::GetGpuLaunchConfig1D(dev_ctx, count);
grid_sampler_cuda_backward_kernel<
T><<<config.block_per_grid, config.thread_per_block, 0, cu_stream>>>(
count, output_grad->data<T>(), input->data<T>(), grid->data<T>(), n, c,
out_h, out_w, in_h, in_w, input_grad->data<T>(), grid_grad_data, mode,
padding_mode, align_corners);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(grid_sampler, ops::GridSampleOpCUDAKernel<float>,
ops::GridSampleOpCUDAKernel<double>);
REGISTER_OP_CUDA_KERNEL(grid_sampler_grad,
ops::GridSampleGradOpCUDAKernel<float>,
ops::GridSampleGradOpCUDAKernel<double>);
This diff is collapsed.
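This hunk shows the migration pattern repeated by the surrounding operator diffs: the hand-written InferShape override is replaced by DECLARE_INFER_SHAPE_FUNCTOR(op, Functor, PD_INFER_META(phi::...InferMeta)), which wraps a phi InferMeta function in fluid's InferShape interface and is registered as an extra argument to REGISTER_OPERATOR, while the REGISTER_OP_CPU_KERNEL / REGISTER_OP_CUDA_KERNEL blocks are deleted because the kernels themselves move into the phi library.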
...@@ -13,8 +13,13 @@ ...@@ -13,8 +13,13 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/index_select_op.h" #include "paddle/fluid/operators/index_select_op.h"
#include <memory> #include <memory>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/infermeta/binary.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -24,52 +29,6 @@ class IndexSelectOp : public framework::OperatorWithKernel { ...@@ -24,52 +29,6 @@ class IndexSelectOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
platform::errors::InvalidArgument(
"Input(X) of IndexSelectOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true,
platform::errors::InvalidArgument(
"Input(Index) of IndexSelectOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
platform::errors::InvalidArgument(
"Output(Out) of IndexSelectOp should not be null."));
auto input_dim = ctx->GetInputDim("X");
auto index_dim = ctx->GetInputDim("Index");
auto dim = ctx->Attrs().Get<int>("dim");
PADDLE_ENFORCE_EQ(
dim < input_dim.size() && dim >= (0 - input_dim.size()), true,
platform::errors::OutOfRange(
"Attr(dim) is out of range, It's expected "
"to be in range of [-%d, %d]. But received Attr(dim) = %d.",
input_dim.size(), input_dim.size() - 1, dim));
PADDLE_ENFORCE_EQ(
index_dim.size() == 1 || (index_dim.size() == 2 && index_dim[1] == 1),
true, platform::errors::InvalidArgument(
"The 'shape' of Input(Index) must be 1-D tensor. "
"But received: the 'shape' of Input(Index) is [%s], "
"the dimension of Input(Index) is [%d].",
index_dim, index_dim.size()));
PADDLE_ENFORCE_EQ(index_dim[0] != 0, true,
platform::errors::InvalidArgument(
"The length of Input(Index) can't be 0."));
auto output_dim = phi::vectorize(input_dim);
if (dim < 0) {
dim += input_dim.size();
}
output_dim[dim] = index_dim[0];
ctx->SetOutputDim("Out", phi::make_ddim(output_dim));
auto type = ctx->GetInputsVarType("X")[0];
if (type == framework::proto::VarType::LOD_TENSOR) {
ctx->ShareLoD("X", /*->*/ "Out");
}
}
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
...@@ -148,20 +107,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(IndexSelectGradNoNeedBufferVarsInferer, ...@@ -148,20 +107,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(IndexSelectGradNoNeedBufferVarsInferer,
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(index_select, IndexSelectInferShapeFunctor,
PD_INFER_META(phi::IndexSelectInferMeta));
REGISTER_OPERATOR(index_select, ops::IndexSelectOp, ops::IndexSelectOpMaker, REGISTER_OPERATOR(index_select, ops::IndexSelectOp, ops::IndexSelectOpMaker,
ops::IndexSelectGradMaker<paddle::framework::OpDesc>, ops::IndexSelectGradMaker<paddle::framework::OpDesc>,
ops::IndexSelectGradMaker<paddle::imperative::OpBase>); ops::IndexSelectGradMaker<paddle::imperative::OpBase>,
IndexSelectInferShapeFunctor);
REGISTER_OPERATOR(index_select_grad, ops::IndexSelectGradOp, REGISTER_OPERATOR(index_select_grad, ops::IndexSelectGradOp,
ops::IndexSelectGradNoNeedBufferVarsInferer); ops::IndexSelectGradNoNeedBufferVarsInferer);
REGISTER_OP_CPU_KERNEL(
index_select,
ops::IndexSelectKernel<paddle::platform::CPUDeviceContext, float>,
ops::IndexSelectKernel<paddle::platform::CPUDeviceContext, double>,
ops::IndexSelectKernel<paddle::platform::CPUDeviceContext, int>,
ops::IndexSelectKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
index_select_grad,
ops::IndexSelectGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::IndexSelectGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::IndexSelectGradKernel<paddle::platform::CPUDeviceContext, int>,
ops::IndexSelectGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/index_select_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
template <typename T, typename IndexT>
__global__ void index_select_cuda_kernel(const T* input, T* output,
const IndexT* index, int64_t N,
int64_t stride, int64_t size,
int64_t delta) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N) {
return;
}
int64_t pre_idx = idx / (stride * size);
int64_t dim_idx = idx % (stride * size) / stride;
IndexT src_dim_idx = index[dim_idx];
int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride;
output[idx] = input[input_idx];
}
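The input_idx arithmetic flattens the tensor as [outer, dim, inner] blocks, where stride is the inner block size and delta is how much longer the input is than the output along dim. A hypothetical host-side reference (standalone C++, with o playing pre_idx and d playing dim_idx) that the kernel should agree with:

#include <cstdint>
#include <vector>

// Reference gather along dim, tensor flattened as [outer, in_dim, stride].
std::vector<float> index_select_ref(const std::vector<float>& in, int64_t outer,
                                    int64_t in_dim, int64_t stride,
                                    const std::vector<int64_t>& index) {
  const int64_t size = static_cast<int64_t>(index.size());
  std::vector<float> out(static_cast<size_t>(outer * size * stride));
  for (int64_t o = 0; o < outer; ++o)
    for (int64_t d = 0; d < size; ++d)
      for (int64_t s = 0; s < stride; ++s)
        out[(o * size + d) * stride + s] =
            in[(o * in_dim + index[d]) * stride + s];
  return out;
}

Expanding the two offsets shows the output offset (o * size + d) * stride + s and the input offset (o * in_dim + index[d]) * stride + s differ by exactly (delta * o + index[d] - d) * stride with delta = in_dim - size, which is the single fused adjustment the kernel adds to idx.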
template <typename T, typename IndexT>
__global__ void index_select_grad_cuda_kernel(const T* output_grad,
T* input_grad,
const IndexT* index, int64_t nums,
int64_t N, int64_t stride,
int64_t size, int64_t delta) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N) {
return;
}
int64_t pre_idx = idx / (stride * size);
int64_t dim_idx = idx % (stride * size) / stride;
IndexT src_dim_idx = index[dim_idx];
int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride;
paddle::platform::CudaAtomicAdd(&input_grad[input_idx], output_grad[idx]);
}
template <typename T>
__global__ void index_select_grad_init(T* input_grad, int64_t N) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N) {
return;
}
input_grad[idx] = 0.0;
}
template <typename DeviceContext, typename T>
class IndexSelectCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<LoDTensor>("X");
auto* index = context.Input<LoDTensor>("Index");
auto* out = context.Output<LoDTensor>("Out");
int dim = context.Attr<int>("dim");
auto input_dim = in->dims();
auto output_dim = out->dims();
dim = dim >= 0 ? dim : dim + input_dim.size();
auto stride_dim = phi::stride(input_dim);
int64_t stride = stride_dim[dim];
int64_t size = output_dim[dim];
int64_t delta = input_dim[dim] - size;
const auto& index_type = framework::TransToProtoVarType(index->dtype());
bool index_type_match = index_type == framework::proto::VarType::INT64 ||
index_type == framework::proto::VarType::INT32;
PADDLE_ENFORCE_EQ(index_type_match, true,
platform::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(index_type),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
auto* in_data = in->data<T>();
auto* out_data = out->mutable_data<T>(context.GetPlace());
int64_t numel = out->numel();
auto stream =
context.template device_context<platform::CUDADeviceContext>().stream();
if (index_type == framework::proto::VarType::INT64) {
const int64_t* index_data = index->data<int64_t>();
index_select_cuda_kernel<T, int64_t><<<
(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, index_data,
numel, stride, size, delta);
platform::GpuStreamSync(stream);
} else {
const int* index_data = index->data<int>();
index_select_cuda_kernel<T, int><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) /
PADDLE_CUDA_NUM_THREADS,
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
in_data, out_data, index_data, numel, stride, size, delta);
platform::GpuStreamSync(stream);
}
}
};
template <typename DeviceContext, typename T>
class IndexSelectGradCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* output_grad = context.Input<LoDTensor>(framework::GradVarName("Out"));
auto* in_grad = context.Output<LoDTensor>(framework::GradVarName("X"));
auto* index = context.Input<LoDTensor>("Index");
auto* output_grad_data = output_grad->data<T>();
auto* in_grad_data = in_grad->mutable_data<T>(context.GetPlace());
int dim = context.Attr<int>("dim");
auto input_dim = in_grad->dims();
auto output_dim = output_grad->dims();
dim = dim >= 0 ? dim : dim + input_dim.size();
auto stride_dim = phi::stride(input_dim);
int64_t stride = stride_dim[dim];
int64_t size = output_dim[dim];
int64_t delta = input_dim[dim] - size;
const auto& index_type = framework::TransToProtoVarType(index->dtype());
bool index_type_match = index_type == framework::proto::VarType::INT64 ||
index_type == framework::proto::VarType::INT32;
PADDLE_ENFORCE_EQ(index_type_match, true,
platform::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(index_type),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
int64_t numel = in_grad->numel();
int64_t index_nums = index->numel();
int64_t out_nums = output_grad->numel();
auto stream =
context.template device_context<platform::CUDADeviceContext>().stream();
index_select_grad_init<
T><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_grad_data, numel);
if (index_type == framework::proto::VarType::INT64) {
const int64_t* index_data = index->data<int64_t>();
index_select_grad_cuda_kernel<T, int64_t><<<
(out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(output_grad_data, in_grad_data,
index_data, index_nums,
out_nums, stride, size, delta);
platform::GpuStreamSync(stream);
} else {
const int* index_data = index->data<int>();
index_select_grad_cuda_kernel<T, int><<<
(out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(output_grad_data, in_grad_data,
index_data, index_nums,
out_nums, stride, size, delta);
platform::GpuStreamSync(stream);
}
}
};
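The backward pass is a scatter-add rather than a gather: when index contains duplicates, several output slices map to one input slice, so index_select_grad_cuda_kernel combines them with CudaAtomicAdd; and because atomics only accumulate, index_select_grad_init must zero-fill grad_input first.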
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
index_select,
ops::IndexSelectCUDAKernel<paddle::platform::CUDADeviceContext, float>,
ops::IndexSelectCUDAKernel<paddle::platform::CUDADeviceContext, double>,
ops::IndexSelectCUDAKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::IndexSelectCUDAKernel<paddle::platform::CUDADeviceContext, int>,
ops::IndexSelectCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
index_select_grad,
ops::IndexSelectGradCUDAKernel<paddle::platform::CUDADeviceContext, float>,
ops::IndexSelectGradCUDAKernel<paddle::platform::CUDADeviceContext, double>,
ops::IndexSelectGradCUDAKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::IndexSelectGradCUDAKernel<paddle::platform::CUDADeviceContext, int>,
ops::IndexSelectGradCUDAKernel<paddle::platform::CUDADeviceContext,
int64_t>);
...@@ -91,41 +91,6 @@ void IndexSelectInner(const framework::ExecutionContext& context, ...@@ -91,41 +91,6 @@ void IndexSelectInner(const framework::ExecutionContext& context,
output->Resize(output_dim); output->Resize(output_dim);
} }
template <typename DeviceContext, typename T>
class IndexSelectKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto inputs = *context.Input<framework::LoDTensor>("X");
auto* index = context.Input<framework::LoDTensor>("Index");
auto* output = context.Output<framework::LoDTensor>("Out");
int dim = context.Attr<int>("dim");
if (dim < 0) {
dim += inputs.dims().size();
}
const auto& index_type = framework::TransToProtoVarType(index->dtype());
bool index_type_match = index_type == framework::proto::VarType::INT32 ||
index_type == framework::proto::VarType::INT64;
PADDLE_ENFORCE_EQ(index_type_match, true,
platform::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(index_type),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
if (index_type == framework::proto::VarType::INT32) {
IndexSelectInner<DeviceContext, T, int>(context, &inputs, *index, output,
dim);
} else if (index_type == framework::proto::VarType::INT64) {
IndexSelectInner<DeviceContext, T, int64_t>(context, &inputs, *index,
output, dim);
}
}
};
template <typename DeviceContext, typename T, class Enable = void> template <typename DeviceContext, typename T, class Enable = void>
struct IndexSelectAdd { struct IndexSelectAdd {
void operator()(const framework::ExecutionContext& ctx, int slice_size, void operator()(const framework::ExecutionContext& ctx, int slice_size,
...@@ -197,43 +162,5 @@ void IndexSelectGradInner(const framework::ExecutionContext& context, ...@@ -197,43 +162,5 @@ void IndexSelectGradInner(const framework::ExecutionContext& context,
x_grad->Resize(output_dim); x_grad->Resize(output_dim);
} }
template <typename DeviceContext, typename T>
class IndexSelectGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x_grad =
context.Output<framework::LoDTensor>(framework::GradVarName("X"));
auto* index = context.Input<framework::LoDTensor>("Index");
auto* out_grad =
context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
int dim = context.Attr<int>("dim");
if (dim < 0) {
dim += out_grad->dims().size();
}
const auto& index_type = framework::TransToProtoVarType(index->dtype());
bool index_type_match = index_type == framework::proto::VarType::INT32 ||
index_type == framework::proto::VarType::INT64;
PADDLE_ENFORCE_EQ(index_type_match, true,
platform::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(index_type),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
if (index_type == framework::proto::VarType::INT32) {
IndexSelectGradInner<DeviceContext, T, int>(context, *out_grad, *index,
x_grad, dim);
} else if (index_type == framework::proto::VarType::INT64) {
IndexSelectGradInner<DeviceContext, T, int64_t>(context, *out_grad,
*index, x_grad, dim);
}
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -12,12 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,12 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/index_select_op.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor;
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class IndexSelectNPUKernel : public framework::OpKernel<T> { class IndexSelectNPUKernel : public framework::OpKernel<T> {
public: public:
......
...@@ -14,10 +14,13 @@ ...@@ -14,10 +14,13 @@
#include <cmath> #include <cmath>
#include <string> #include <string>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/infermeta/binary.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -60,40 +63,6 @@ class IscloseOp : public framework::OperatorWithKernel { ...@@ -60,40 +63,6 @@ class IscloseOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Isclose");
OP_INOUT_CHECK(ctx->HasInput("Other"), "Input", "Other", "Isclose");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Isclose");
auto input_dim = ctx->GetInputDim("Input");
auto other_dim = ctx->GetInputDim("Other");
PADDLE_ENFORCE_EQ(input_dim.size(), other_dim.size(),
platform::errors::PreconditionNotMet(
"Input(Input) and Input(Other) must have the same "
"dimension size."));
int n = input_dim.size();
bool is_runtime = ctx->IsRuntime();
for (int i = 0; i < n; i++) {
if (is_runtime) {
PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i],
platform::errors::PreconditionNotMet(
"The value at dim %d of Input(Input) is not "
"equal to the Input(Other): %ld != %ld.",
i, input_dim[i], other_dim[i]));
} else {
if (!(input_dim[i] < 0 || other_dim[i] < 0)) {
PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i],
platform::errors::PreconditionNotMet(
"The value at dim %d of Input(Input) is not "
"equal to the Input(Other): %ld != %ld.",
i, input_dim[i], other_dim[i]));
}
}
}
ctx->SetOutputDim("Out", input_dim);
}
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
...@@ -115,8 +84,10 @@ class IscloseOpVarTypeInference : public framework::VarTypeInference { ...@@ -115,8 +84,10 @@ class IscloseOpVarTypeInference : public framework::VarTypeInference {
namespace ops = paddle::operators; namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(isclose, IscloseInferShapeFunctor,
PD_INFER_META(phi::ValueCompareInferMeta));
REGISTER_OPERATOR( REGISTER_OPERATOR(
isclose, ops::IscloseOp, ops::IscloseOpMaker, isclose, ops::IscloseOp, ops::IscloseOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>, paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>, paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
ops::IscloseOpVarTypeInference); ops::IscloseOpVarTypeInference, IscloseInferShapeFunctor);
...@@ -11,7 +11,9 @@ ...@@ -11,7 +11,9 @@
#include <memory> #include <memory>
#include <string> #include <string>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/infermeta/binary.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -21,44 +23,6 @@ using framework::Tensor; ...@@ -21,44 +23,6 @@ using framework::Tensor;
class KLDivLossOp : public framework::OperatorWithKernel { class KLDivLossOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "KLDivLoss");
OP_INOUT_CHECK(ctx->HasInput("Target"), "Input", "Target", "KLDivLoss");
OP_INOUT_CHECK(ctx->HasOutput("Loss"), "Output", "Loss", "KLDivLoss");
auto dim_x = ctx->GetInputDim("X");
auto dim_target = ctx->GetInputDim("Target");
PADDLE_ENFORCE_EQ(dim_x.size(), dim_target.size(),
platform::errors::InvalidArgument(
"Input(X) rank and Input(Target) rank should be "
"same, but received X rank(%d) != Target rank(%d)",
dim_x.size(), dim_target.size()));
for (int i = 0; i < dim_x.size(); i++) {
if (ctx->IsRuntime() || (dim_x[i] > 0 && dim_target[i] > 0)) {
PADDLE_ENFORCE_EQ(
dim_x[i], dim_target[i],
platform::errors::InvalidArgument(
"Input(X) and Input(Target) should in same shape. but received "
"X dimension[%d](%d) != Target dimension[%d](%d)",
i, dim_x[i], i, dim_target[i]));
}
}
auto reduction = ctx->Attrs().Get<std::string>("reduction");
auto reduction_valid = "mean" == reduction || "sum" == reduction ||
"batchmean" == reduction || "none" == reduction;
PADDLE_ENFORCE_EQ(
reduction_valid, true,
platform::errors::InvalidArgument(
"Attr(reduction) can only be 'none'|'batchmean'|'sum'|'mean'."));
if ("none" == reduction) {
ctx->SetOutputDim("Loss", dim_x);
} else {
ctx->SetOutputDim("Loss", {1});
}
}
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
...@@ -171,8 +135,12 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(KLDivLossGradNoNeedBufferVarInferer, "X"); ...@@ -171,8 +135,12 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(KLDivLossGradNoNeedBufferVarInferer, "X");
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(kldiv_loss, KLDivInferShapeFunctor,
PD_INFER_META(phi::KLDivInferMeta));
REGISTER_OPERATOR(kldiv_loss, ops::KLDivLossOp, ops::KLDivLossOpMaker, REGISTER_OPERATOR(kldiv_loss, ops::KLDivLossOp, ops::KLDivLossOpMaker,
ops::KLDivLossOpGradMaker<paddle::framework::OpDesc>, ops::KLDivLossOpGradMaker<paddle::framework::OpDesc>,
ops::KLDivLossOpGradMaker<paddle::imperative::OpBase>); ops::KLDivLossOpGradMaker<paddle::imperative::OpBase>,
KLDivInferShapeFunctor);
REGISTER_OPERATOR(kldiv_loss_grad, ops::KLDivLossOpGrad, REGISTER_OPERATOR(kldiv_loss_grad, ops::KLDivLossOpGrad,
ops::KLDivLossGradNoNeedBufferVarInferer); ops::KLDivLossGradNoNeedBufferVarInferer);
...@@ -18,8 +18,8 @@ limitations under the License. */ ...@@ -18,8 +18,8 @@ limitations under the License. */
#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/phi_utils.h"
#include "paddle/fluid/operators/set_value_op.h" #include "paddle/fluid/operators/set_value_op.h"
#include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/operators/svd_helper.h"
#include "paddle/fluid/operators/tril_triu_op.h"
#include "paddle/phi/kernels/funcs/lapack/lapack_function.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h"
#include "paddle/phi/kernels/funcs/tril_triu_compute.h"
#include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/math_kernel.h"
#include "paddle/phi/kernels/triangular_solve_kernel.h" #include "paddle/phi/kernels/triangular_solve_kernel.h"
...@@ -404,11 +404,12 @@ void LU_Unpack(const DeviceContext& dev_ctx, const framework::Tensor* LU, ...@@ -404,11 +404,12 @@ void LU_Unpack(const DeviceContext& dev_ctx, const framework::Tensor* LU,
const auto W = udims[udims.size() - 1]; const auto W = udims[udims.size() - 1];
auto L_dataptr = L->mutable_data<T>(dev_ctx.GetPlace()); auto L_dataptr = L->mutable_data<T>(dev_ctx.GetPlace());
platform::ForRange<DeviceContext> x_for_range(dev_ctx, LU->numel()); platform::ForRange<DeviceContext> x_for_range(dev_ctx, LU->numel());
TrilTriuCompute<T> tril_computer(LU->data<T>(), -1, true, H, W, L_dataptr); phi::funcs::TrilTriuCompute<T> tril_computer(LU->data<T>(), -1, true, H, W,
L_dataptr);
x_for_range(tril_computer); x_for_range(tril_computer);
TrilTriuCompute<T> triu_computer(LU->data<T>(), 0, false, H, W, phi::funcs::TrilTriuCompute<T> triu_computer(
U->mutable_data<T>(dev_ctx.GetPlace())); LU->data<T>(), 0, false, H, W, U->mutable_data<T>(dev_ctx.GetPlace()));
x_for_range(triu_computer); x_for_range(triu_computer);
// set L's diagonal 1 // set L's diagonal 1
...@@ -532,15 +533,15 @@ class LUGradKernel : public framework::OpKernel<T> { ...@@ -532,15 +533,15 @@ class LUGradKernel : public framework::OpKernel<T> {
auto phil_rank = LmHdims.size(); auto phil_rank = LmHdims.size();
auto phiu_rank = UmHdims.size(); auto phiu_rank = UmHdims.size();
platform::ForRange<DeviceContext> l_for_range(dev_ctx, phi_L.numel()); platform::ForRange<DeviceContext> l_for_range(dev_ctx, phi_L.numel());
TrilTriuCompute<T> tril_computer(phi_L.data<T>(), -1, true, phi::funcs::TrilTriuCompute<T> tril_computer(
LmHdims[phil_rank - 2], phi_L.data<T>(), -1, true, LmHdims[phil_rank - 2],
LmHdims[phil_rank - 1], phi_L.data<T>()); LmHdims[phil_rank - 1], phi_L.data<T>());
l_for_range(tril_computer); l_for_range(tril_computer);
platform::ForRange<DeviceContext> u_for_range(dev_ctx, phi_U.numel()); platform::ForRange<DeviceContext> u_for_range(dev_ctx, phi_U.numel());
TrilTriuCompute<T> triu_computer(phi_U.data<T>(), 0, false, phi::funcs::TrilTriuCompute<T> triu_computer(
UmHdims[phiu_rank - 2], phi_U.data<T>(), 0, false, UmHdims[phiu_rank - 2],
UmHdims[phiu_rank - 1], phi_U.data<T>()); UmHdims[phiu_rank - 1], phi_U.data<T>());
u_for_range(triu_computer); u_for_range(triu_computer);
Tensor_Add<DeviceContext, T>(dev_ctx, phi_L, phi_U, &phi); Tensor_Add<DeviceContext, T>(dev_ctx, phi_L, phi_U, &phi);
...@@ -591,8 +592,9 @@ class LUGradKernel : public framework::OpKernel<T> { ...@@ -591,8 +592,9 @@ class LUGradKernel : public framework::OpKernel<T> {
const auto W = phidims[phidims.size() - 1]; const auto W = phidims[phidims.size() - 1];
platform::ForRange<DeviceContext> x_for_range(dev_ctx, platform::ForRange<DeviceContext> x_for_range(dev_ctx,
phi_complement.numel()); phi_complement.numel());
TrilTriuCompute<T> tril_computer(phi_complement.data<T>(), -1, true, H, phi::funcs::TrilTriuCompute<T> tril_computer(
W, phi_complement_l.data<T>()); phi_complement.data<T>(), -1, true, H, W,
phi_complement_l.data<T>());
x_for_range(tril_computer); x_for_range(tril_computer);
Tensor_Sub<DeviceContext, T>(dev_ctx, phi, phi_complement_l, &phi); Tensor_Sub<DeviceContext, T>(dev_ctx, phi, phi_complement_l, &phi);
...@@ -664,8 +666,8 @@ class LUGradKernel : public framework::OpKernel<T> { ...@@ -664,8 +666,8 @@ class LUGradKernel : public framework::OpKernel<T> {
const auto W = phidims[phidims.size() - 1]; const auto W = phidims[phidims.size() - 1];
platform::ForRange<DeviceContext> x_for_range(dev_ctx, platform::ForRange<DeviceContext> x_for_range(dev_ctx,
phi_complement.numel()); phi_complement.numel());
TrilTriuCompute<T> triu_computer(phi_complement.data<T>(), 0, false, H, W, phi::funcs::TrilTriuCompute<T> triu_computer(
phi_complement_u.data<T>()); phi_complement.data<T>(), 0, false, H, W, phi_complement_u.data<T>());
x_for_range(triu_computer); x_for_range(triu_computer);
Tensor_Sub<DeviceContext, T>(dev_ctx, phi, phi_complement_u, &phi); Tensor_Sub<DeviceContext, T>(dev_ctx, phi, phi_complement_u, &phi);
......
...@@ -16,7 +16,8 @@ limitations under the License. */ ...@@ -16,7 +16,8 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/lu_op.h" #include "paddle/fluid/operators/lu_op.h"
#include "paddle/fluid/operators/tril_triu_op.h" #include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/kernels/funcs/tril_triu_compute.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -87,7 +88,8 @@ class LU_UnpackGradKernel : public framework::OpKernel<T> { ...@@ -87,7 +88,8 @@ class LU_UnpackGradKernel : public framework::OpKernel<T> {
auto W = ldims[ldims.size() - 1]; auto W = ldims[ldims.size() - 1];
auto L_dataptr = dl_tril.mutable_data<T>(dev_ctx.GetPlace()); auto L_dataptr = dl_tril.mutable_data<T>(dev_ctx.GetPlace());
platform::ForRange<DeviceContext> l_for_range(dev_ctx, dl->numel()); platform::ForRange<DeviceContext> l_for_range(dev_ctx, dl->numel());
TrilTriuCompute<T> tril_computer(dl->data<T>(), -1, true, H, W, L_dataptr); phi::funcs::TrilTriuCompute<T> tril_computer(dl->data<T>(), -1, true, H, W,
L_dataptr);
l_for_range(tril_computer); l_for_range(tril_computer);
const auto udims = du->dims(); const auto udims = du->dims();
...@@ -96,7 +98,8 @@ class LU_UnpackGradKernel : public framework::OpKernel<T> { ...@@ -96,7 +98,8 @@ class LU_UnpackGradKernel : public framework::OpKernel<T> {
W = udims[udims.size() - 1]; W = udims[udims.size() - 1];
auto U_dataptr = du_triu.mutable_data<T>(dev_ctx.GetPlace()); auto U_dataptr = du_triu.mutable_data<T>(dev_ctx.GetPlace());
platform::ForRange<DeviceContext> u_for_range(dev_ctx, du->numel()); platform::ForRange<DeviceContext> u_for_range(dev_ctx, du->numel());
TrilTriuCompute<T> triu_computer(du->data<T>(), 0, false, H, W, U_dataptr); phi::funcs::TrilTriuCompute<T> triu_computer(du->data<T>(), 0, false, H, W,
U_dataptr);
u_for_range(triu_computer); u_for_range(triu_computer);
auto xdims = dx->dims(); auto xdims = dx->dims();
......
...@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/multiplex_op.h"
#include <memory> #include <memory>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -169,15 +169,3 @@ REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, ...@@ -169,15 +169,3 @@ REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker,
ops::MultiplexGradMaker<paddle::framework::OpDesc>, ops::MultiplexGradMaker<paddle::framework::OpDesc>,
ops::MultiplexGradMaker<paddle::imperative::OpBase>); ops::MultiplexGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp); REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp);
REGISTER_OP_CPU_KERNEL(
multiplex,
ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, float>,
ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, double>,
ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, int>,
ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
multiplex_grad,
ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, float>,
ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, double>,
ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, int>,
ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, int64_t>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/multiplex_op.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename Place, typename T>
class MultiplexGPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto ins = ctx.MultiInput<Tensor>("X");
auto* ids = ctx.Input<Tensor>("Ids");
auto* out = ctx.Output<Tensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
for (size_t i = 0; i < ins.size(); ++i) {
PADDLE_ENFORCE_GT(
ins[i]->numel(), 0,
platform::errors::OutOfRange(
"indexing will be out of bounds with size 0 for the %d-th input.",
i));
}
auto rows = ins[0]->dims()[0];
auto cols = ins[0]->numel() / rows;
// copy index to cpu
Tensor index_t_cpu;
paddle::framework::TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu);
auto* index = index_t_cpu.data<int32_t>();
auto stream = ctx.cuda_device_context().stream();
platform::CUDAPlace place = ctx.GetPlace();
for (auto i = 0; i < rows; i++) {
int32_t k = index[i];
PADDLE_ENFORCE_GE(k, 0, platform::errors::PreconditionNotMet(
"index must be nonnegative."));
PADDLE_ENFORCE_LT(static_cast<size_t>(k), ins.size(),
platform::errors::PreconditionNotMet(
"index exceeds the number of candidate tensors."));
memory::Copy(place, out->data<T>() + i * cols, place,
ins[k]->data<T>() + i * cols, cols * sizeof(T), stream);
}
}
};
template <typename Place, typename T>
class MultiplexGradGPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto* d_out = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* ids = ctx.Input<Tensor>("Ids");
auto d_ins = ctx.MultiOutput<Tensor>(framework::GradVarName("X"));
size_t idx = -1UL;
for (size_t i = 0; i < d_ins.size(); i++) {
if (d_ins[i]) {
d_ins[i]->mutable_data<T>(ctx.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*d_ins[i]);
t.device(*ctx.template device_context<Place>().eigen_device()) =
t.constant(static_cast<T>(0));
idx = i;
}
}
if (idx == -1UL) return;
auto rows = d_ins[idx]->dims()[0];
auto cols = d_ins[idx]->numel() / rows;
// copy index to cpu
Tensor index_t_cpu;
paddle::framework::TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu);
auto* index = index_t_cpu.data<int32_t>();
auto stream = ctx.cuda_device_context().stream();
platform::CUDAPlace place = ctx.GetPlace();
for (auto i = 0; i < rows; i++) {
size_t k = static_cast<size_t>(index[i]);
if (d_ins[k]) {
memory::Copy(place, d_ins[k]->data<T>() + i * cols, place,
d_out->data<T>() + i * cols, cols * sizeof(T), stream);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
multiplex,
ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, float>,
ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, double>,
ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, int>,
ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
multiplex_grad,
ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, float>,
ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, double>,
ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, int>,
ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, int64_t>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memcpy.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class MultiplexCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto ins = ctx.MultiInput<framework::Tensor>("X");
auto ids = ctx.Input<framework::Tensor>("Ids");
auto* out = ctx.Output<framework::Tensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
for (size_t i = 0; i < ins.size(); ++i) {
PADDLE_ENFORCE_GT(
ins[i]->numel(), 0,
platform::errors::OutOfRange(
"indexing will be out of bounds with size 0 for the %d-th input.",
i));
}
auto rows = ins[0]->dims()[0];
auto cols = ins[0]->numel() / rows;
auto index = ids->data<int32_t>();
platform::CPUPlace place = ctx.GetPlace();
for (auto i = 0; i < rows; i++) {
int32_t k = index[i];
PADDLE_ENFORCE_GE(k, 0, platform::errors::PreconditionNotMet(
"index must be nonnegative."));
PADDLE_ENFORCE_LT(static_cast<size_t>(k), ins.size(),
platform::errors::PreconditionNotMet(
"index exceeds the number of candidate tensors."));
memory::Copy(place, out->data<T>() + i * cols, place,
ins[k]->data<T>() + i * cols, cols * sizeof(T));
}
}
};
template <typename DeviceContext, typename T>
class MultiplexGradCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* ids = ctx.Input<framework::Tensor>("Ids");
auto d_ins =
ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
size_t idx = -1UL;
for (size_t i = 0; i < d_ins.size(); i++) {
if (d_ins[i]) {
d_ins[i]->mutable_data<T>(ctx.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*d_ins[i]);
t.device(*ctx.template device_context<DeviceContext>().eigen_device()) =
t.constant(static_cast<T>(0));
idx = i;
}
}
if (idx == -1UL) return;
auto rows = d_ins[idx]->dims()[0];
auto cols = d_ins[idx]->numel() / rows;
auto* index = ids->data<int32_t>();
platform::CPUPlace place = ctx.GetPlace();
for (auto i = 0; i < rows; i++) {
size_t k = static_cast<size_t>(index[i]);
if (d_ins[k]) {
memory::Copy(place, d_ins[k]->data<T>() + i * cols, place,
d_out->data<T>() + i * cols, cols * sizeof(T));
}
}
}
};
} // namespace operators
} // namespace paddle
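Both the CUDA and CPU kernels implement the same row-gather contract: output row i is copied from row i of candidate ins[Ids[i]], and the gradient scatters rows of dOut back to whichever candidate was selected, with unselected candidates keeping zero gradient. A hypothetical standalone C++ statement of the forward contract:

#include <cstdint>
#include <cstring>
#include <vector>

// out[i, :] = ins[ids[i]][i, :] for `rows` rows of width `cols`.
void multiplex_ref(const std::vector<const float*>& ins, const int32_t* ids,
                   int64_t rows, int64_t cols, float* out) {
  for (int64_t i = 0; i < rows; ++i)
    std::memcpy(out + i * cols, ins[ids[i]] + i * cols,
                static_cast<size_t>(cols) * sizeof(float));
}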
...@@ -145,8 +145,6 @@ REGISTER_OPERATOR(qr, ops::QrOp, ops::QrOpMaker, ...@@ -145,8 +145,6 @@ REGISTER_OPERATOR(qr, ops::QrOp, ops::QrOpMaker,
REGISTER_OPERATOR(qr_grad, ops::QrGradOp); REGISTER_OPERATOR(qr_grad, ops::QrGradOp);
REGISTER_OP_CPU_KERNEL(qr, ops::QrCPUKernel<float>, ops::QrCPUKernel<double>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
qr_grad, ops::QrGradKernel<paddle::platform::CPUDeviceContext, float>, qr_grad, ops::QrGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::QrGradKernel<paddle::platform::CPUDeviceContext, double>); ops::QrGradKernel<paddle::platform::CPUDeviceContext, double>);
...@@ -48,85 +48,6 @@ static inline std::tuple<bool, bool> _parse_qr_mode(std::string mode) { ...@@ -48,85 +48,6 @@ static inline std::tuple<bool, bool> _parse_qr_mode(std::string mode) {
return std::make_tuple(compute_q, reduced); return std::make_tuple(compute_q, reduced);
} }
template <typename T>
class QrCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
bool compute_q;
bool reduced_mode;
const Tensor& x = *context.Input<Tensor>("X");
Tensor& q = *context.Output<Tensor>("Q");
Tensor& r = *context.Output<Tensor>("R");
std::string mode = context.Attr<std::string>("mode");
std::tie(compute_q, reduced_mode) = _parse_qr_mode(mode);
auto numel = x.numel();
PADDLE_ENFORCE_GT(numel, 0, platform::errors::PreconditionNotMet(
"The input of QR is empty."));
auto x_dims = x.dims();
int x_rank = x_dims.size();
int m = x_dims[x_rank - 2];
int n = x_dims[x_rank - 1];
int min_mn = std::min(m, n);
int k = reduced_mode ? min_mn : m;
int batch_size = numel / (m * n);
int x_stride = m * n;
int q_stride = m * k;
int r_stride = k * n;
auto* x_data = x.data<phi::dtype::Real<T>>();
T* q_data = nullptr;
if (compute_q) {
q_data = q.mutable_data<phi::dtype::Real<T>>(
context.GetPlace(),
size_t(batch_size * m * k * sizeof(phi::dtype::Real<T>)));
memset(q_data, 0,
size_t(batch_size * m * k * sizeof(phi::dtype::Real<T>)));
}
auto* r_data = r.mutable_data<phi::dtype::Real<T>>(
context.GetPlace(),
size_t(batch_size * k * n * sizeof(phi::dtype::Real<T>)));
memset(r_data, 0, size_t(batch_size * k * n * sizeof(phi::dtype::Real<T>)));
// Implement QR by calling Eigen
for (int i = 0; i < batch_size; ++i) {
const T* x_matrix_ptr = x_data + i * x_stride;
T* r_matrix_ptr = r_data + i * r_stride;
using EigenDynamicMatrix =
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
auto x_matrix = Eigen::Map<const EigenDynamicMatrix>(x_matrix_ptr, m, n);
Eigen::HouseholderQR<EigenDynamicMatrix> qr(x_matrix);
if (reduced_mode) {
auto qr_top_matrix = qr.matrixQR().block(0, 0, min_mn, n);
auto r_matrix_view =
qr_top_matrix.template triangularView<Eigen::Upper>();
auto r_matrix = EigenDynamicMatrix(r_matrix_view);
memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T));
} else {
auto r_matrix_view =
qr.matrixQR().template triangularView<Eigen::Upper>();
auto r_matrix = EigenDynamicMatrix(r_matrix_view);
memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T));
}
if (compute_q) {
T* q_matrix_ptr = q_data + i * q_stride;
if (reduced_mode) {
auto q_matrix =
qr.householderQ() * EigenDynamicMatrix::Identity(m, min_mn);
q_matrix.transposeInPlace();
memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T));
} else {
auto q_matrix =
qr.householderQ() * EigenDynamicMatrix::Identity(m, m);
q_matrix.transposeInPlace();
memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T));
}
}
}
}
};
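The Eigen recipe in the removed kernel is worth distilling: HouseholderQR stores R in the upper triangle of matrixQR() and Q implicitly as a Householder sequence, so the thin Q is materialized by multiplying that sequence with an m x k identity. A minimal standalone sketch (hypothetical, reduced mode only, double precision, without the row-major transpose the kernel needs for tensor storage):

#include <algorithm>
#include <Eigen/Dense>

// Reduced QR of an m x n matrix: Q is m x k and R is k x n, k = min(m, n).
void thin_qr(const Eigen::MatrixXd& x, Eigen::MatrixXd* q, Eigen::MatrixXd* r) {
  const Eigen::Index k = std::min(x.rows(), x.cols());
  Eigen::HouseholderQR<Eigen::MatrixXd> qr(x);
  *r = qr.matrixQR().topRows(k).triangularView<Eigen::Upper>();
  *q = qr.householderQ() * Eigen::MatrixXd::Identity(x.rows(), k);
}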
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class QrGradKernel : public framework::OpKernel<T> { class QrGradKernel : public framework::OpKernel<T> {
public: public:
......
...@@ -9,9 +9,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -9,9 +9,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/roi_align_op.h"
#include <memory> #include <memory>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/ternary.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -23,79 +26,6 @@ class ROIAlignOp : public framework::OperatorWithKernel { ...@@ -23,79 +26,6 @@ class ROIAlignOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
platform::errors::NotFound("Input(X) of ROIAlignOp "
"is not found."));
PADDLE_ENFORCE_EQ(ctx->HasInput("ROIs"), true,
platform::errors::NotFound("Input(ROIs) of ROIAlignOp "
"is not found."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
platform::errors::NotFound("Output(Out) of ROIAlignOp "
"is not found."));
auto input_dims = ctx->GetInputDim("X");
auto rois_dims = ctx->GetInputDim("ROIs");
if (ctx->HasInput("RoisNum")) {
auto rois_num_dims = ctx->GetInputDim("RoisNum");
PADDLE_ENFORCE_EQ(
rois_num_dims.size(), 1,
platform::errors::InvalidArgument("The size of RoisNum should be 1"
", but received size = %d",
rois_num_dims.size()));
}
PADDLE_ENFORCE_EQ(
input_dims.size(), 4,
platform::errors::InvalidArgument(
"The format of Input(X) in"
"RoIAlignOp is NCHW. And the rank of input must be 4. "
"But received rank = %d",
input_dims.size()));
PADDLE_ENFORCE_EQ(rois_dims.size(), 2, platform::errors::InvalidArgument(
"The rank of Input(ROIs) "
"in RoIAlignOp should be 2. "
"But the rank of RoIs is %d",
rois_dims.size()));
if (ctx->IsRuntime()) {
PADDLE_ENFORCE_EQ(rois_dims[1], 4,
platform::errors::InvalidArgument(
"The second dimension "
"of Input(ROIs) should be 4. But received the "
"dimension = %d",
rois_dims[1]));
}
int pooled_height = ctx->Attrs().Get<int>("pooled_height");
int pooled_width = ctx->Attrs().Get<int>("pooled_width");
float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
PADDLE_ENFORCE_GT(pooled_height, 0,
platform::errors::InvalidArgument(
"The 'pooled_height' attribute in RoIAlignOp is "
"invalid. The height must be greater than 0. But "
"received 'pooled_height' = %d",
pooled_height));
PADDLE_ENFORCE_GT(pooled_width, 0,
platform::errors::InvalidArgument(
"The 'pooled_width' attribute in RoIAlignOp is "
"invalid. The width must be greater than 0. But "
"received 'pooled_width' = %d",
pooled_width));
PADDLE_ENFORCE_GT(spatial_scale, 0.0f,
platform::errors::InvalidArgument(
"The 'spatial_scale' attribute in RoIAlignOp is "
"invalid. The scale must be greater than 0. But "
"received 'spatial_scale' = %f",
spatial_scale));
auto out_dims = input_dims;
out_dims[0] = rois_dims[0];
out_dims[1] = input_dims[1];
out_dims[2] = pooled_height;
out_dims[3] = pooled_width;
ctx->SetOutputDim("Out", out_dims);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
...@@ -221,17 +151,16 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(RoiAlignGradNoNeedBufVarsInferer, "X");
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(roi_align, RoiAlignInferShapeFunctor,
PD_INFER_META(phi::RoiAlignInferMeta));
REGISTER_OPERATOR(roi_align, ops::ROIAlignOp, ops::ROIAlignOpMaker,
                  ops::ROIAlignGradMaker<paddle::framework::OpDesc>,
                  ops::ROIAlignGradMaker<paddle::imperative::OpBase>,
                  RoiAlignInferShapeFunctor);
REGISTER_OPERATOR(roi_align_grad, ops::ROIAlignGradOp,
                  ops::RoiAlignGradNoNeedBufVarsInferer);
REGISTER_OP_CPU_KERNEL(
roi_align_grad,
ops::CPUROIAlignGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::CPUROIAlignGradOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::CPUROIAlignGradOpKernel<paddle::platform::CPUDeviceContext, int>);
REGISTER_OP_VERSION(roi_align)
    .AddCheckpoint(
        R"ROC(
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <limits>
#include <numeric>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
template <class T>
void bilinear_interpolate_gradient(const int height, const int width, T y, T x,
const T out_grad_this_bin, const T count,
T* batch_grad_data) {
int x_low, y_low, x_high, y_high;
T w1, w2, w3, w4;
if (y < -1.0 || y > height || x < -1.0 || x > width) {
w1 = w2 = w3 = w4 = 0;
x_low = x_high = y_low = y_high = -1;
return;
}
y = y <= 0 ? 0 : y;
x = x <= 0 ? 0 : x;
y_low = static_cast<int>(y);
x_low = static_cast<int>(x);
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = static_cast<T>(y_low);
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = static_cast<T>(x_low);
} else {
x_high = x_low + 1;
}
T ly = y - y_low, lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
T diff1 = out_grad_this_bin * w1 / count;
T diff2 = out_grad_this_bin * w2 / count;
T diff3 = out_grad_this_bin * w3 / count;
T diff4 = out_grad_this_bin * w4 / count;
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
*(batch_grad_data + y_low * width + x_low) += diff1;
*(batch_grad_data + y_low * width + x_high) += diff2;
*(batch_grad_data + y_high * width + x_low) += diff3;
*(batch_grad_data + y_high * width + x_high) += diff4;
}
}
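// A small host-side check of the routine above (hypothetical values; assumes
// this header is included): for a sample point strictly inside the feature
// map the four weights w1..w4 sum to 1, so the total gradient scattered into
// the 2x2 neighborhood equals out_grad_this_bin / count.
//
//   std::vector<float> grad(4 * 4, 0.f);
//   bilinear_interpolate_gradient<float>(
//       /*height=*/4, /*width=*/4, /*y=*/1.25f, /*x=*/2.5f,
//       /*out_grad_this_bin=*/1.f, /*count=*/1.f, grad.data());
//   float sum = 0.f;
//   for (float g : grad) sum += g;  // sum == 1.f: w1 + w2 + w3 + w4 == 1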
template <typename DeviceContext, typename T>
class CPUROIAlignGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<framework::Tensor>("X");
auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
auto* out_grad =
ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
auto pooled_height = ctx.Attr<int>("pooled_height");
auto pooled_width = ctx.Attr<int>("pooled_width");
auto spatial_scale = ctx.Attr<float>("spatial_scale");
auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
auto in_dims = in->dims();
auto aligned = ctx.Attr<bool>("aligned");
int channels = in_dims[1];
int height = in_dims[2];
int width = in_dims[3];
int rois_num = rois->dims()[0];
if (!in_grad) {
return;
}
Tensor roi_batch_id_list;
roi_batch_id_list.Resize({rois_num});
int* roi_batch_id_data =
roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
int rois_batch_size;
if (ctx.HasInput("RoisNum")) {
auto* rois_num_t = ctx.Input<framework::Tensor>("RoisNum");
rois_batch_size = rois_num_t->numel();
auto* rois_num_data = rois_num_t->data<int>();
int start = 0;
for (int n = 0; n < rois_batch_size; ++n) {
for (int i = start; i < start + rois_num_data[n]; ++i) {
roi_batch_id_data[i] = n;
}
start += rois_num_data[n];
}
} else {
auto rois_lod = rois->lod().back();
rois_batch_size = rois_lod.size() - 1;
for (int n = 0; n < rois_batch_size; ++n) {
for (std::size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
roi_batch_id_data[i] = n;
}
}
}
in_grad->mutable_data<T>(ctx.GetPlace());
auto& dev_ctx = ctx.template device_context<DeviceContext>();
phi::funcs::SetConstant<DeviceContext, T> set_zero;
set_zero(dev_ctx, in_grad, static_cast<T>(0));
int output_grad_size = out_grad->numel();
if ((!out_grad->IsInitialized()) || (output_grad_size <= 0)) {
return;
}
const T* rois_data = rois->data<T>();
const T* out_grad_data = out_grad->data<T>();
T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace());
auto in_stride = phi::stride(in->dims());
auto roi_stride = phi::stride(rois->dims());
auto out_stride = phi::stride(out_grad->dims());
T roi_offset = aligned ? T(0.5) : 0;
for (int n = 0; n < rois_num; ++n) {
int roi_batch_idx = roi_batch_id_data[n];
T roi_xmin = rois_data[0] * spatial_scale - roi_offset;
T roi_ymin = rois_data[1] * spatial_scale - roi_offset;
T roi_xmax = rois_data[2] * spatial_scale - roi_offset;
T roi_ymax = rois_data[3] * spatial_scale - roi_offset;
T roi_width = roi_xmax - roi_xmin;
T roi_height = roi_ymax - roi_ymin;
// Clamp to a minimum 1x1 RoI only in the legacy (non-aligned) mode; with
// aligned=true the half-pixel offset already handles degenerate boxes.
if (!aligned) {
roi_width = std::max(roi_width, static_cast<T>(1.));
roi_height = std::max(roi_height, static_cast<T>(1.));
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
for (int c = 0; c < channels; ++c) {
T* batch_grad_data =
in_grad_data + roi_batch_idx * in_stride[0] + c * in_stride[1];
const T* batch_out_grad_data =
out_grad_data + n * out_stride[0] + c * out_stride[1];
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int pool_index = ph * pooled_width + pw;
T out_grad_this_bin = batch_out_grad_data[pool_index];
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height);
int roi_bin_grid_w = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_width / pooled_width);
T count = roi_bin_grid_h * roi_bin_grid_w;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
const T y = roi_ymin + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h);
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T x = roi_xmin + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
bilinear_interpolate_gradient(height, width, y, x,
out_grad_this_bin, count,
batch_grad_data);
}
}
}
}
}
rois_data += roi_stride[0];
}
}
};
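// Worked example of the adaptive sampling grid used above (hypothetical
// numbers): with sampling_ratio <= 0, a scaled RoI of height 24 and width 32
// pooled to 7 x 7 gives roi_bin_grid_h = ceil(24 / 7) = 4 and
// roi_bin_grid_w = ceil(32 / 7) = 5, so each output bin averages
// count = 4 * 5 = 20 sample points and every sample scatters
// out_grad_this_bin / 20 through bilinear_interpolate_gradient.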
} // namespace operators
} // namespace paddle
...@@ -12,13 +12,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/roll_op.h"
#include <memory>
#include <vector>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/operators/utils.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
namespace operators {
...@@ -29,43 +32,6 @@ class RollOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
platform::errors::InvalidArgument(
"Input(X) of RollOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
platform::errors::InvalidArgument(
"Output(Out) of RollOp should not be null."));
auto dims = ctx->Attrs().Get<std::vector<int64_t>>("axis");
auto shifts = ctx->Attrs().Get<std::vector<int64_t>>("shifts");
if (!ctx->HasInput("ShiftsTensor")) {
if (dims.size() != 0) {
PADDLE_ENFORCE_EQ(dims.size(), shifts.size(),
platform::errors::InvalidArgument(
"When dims.size() != 0, dims.size() "
"should be equal to "
"shifts.size(). But received "
"dims.size() = %d, shifts.size() = %d",
dims.size(), shifts.size()));
} else {
PADDLE_ENFORCE_EQ(shifts.size(), 1,
platform::errors::InvalidArgument(
"When dims.size() == 0, shifts.size() "
"should be equal to 1, But received "
"shifts.size() = %d",
shifts.size()));
}
}
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
auto type = ctx->GetInputsVarType("X")[0];
if (type == framework::proto::VarType::LOD_TENSOR) {
ctx->ShareLoD("X", /*->*/ "Out");
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
...@@ -149,29 +115,15 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(RollGradNoNeedBufferVarsInferer, "X");
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(roll, RollInferShapeFunctor,
PD_INFER_META(phi::RollInferMeta));
REGISTER_OPERATOR(roll, ops::RollOp, ops::RollOpMaker,
                  ops::RollGradMaker<paddle::framework::OpDesc>,
                  ops::RollGradMaker<paddle::imperative::OpBase>,
                  RollInferShapeFunctor);
REGISTER_OPERATOR(roll_grad, ops::RollGradOp,
                  ops::RollGradNoNeedBufferVarsInferer);
REGISTER_OP_CPU_KERNEL(
roll, ops::RollKernel<paddle::platform::CPUDeviceContext, float>,
ops::RollKernel<paddle::platform::CPUDeviceContext, double>,
ops::RollKernel<paddle::platform::CPUDeviceContext, int>,
ops::RollKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::RollKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::RollKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_CPU_KERNEL(
roll_grad, ops::RollGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::RollGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::RollGradKernel<paddle::platform::CPUDeviceContext, int>,
ops::RollGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::RollGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::RollGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_VERSION(roll)
    .AddCheckpoint(
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/roll_op.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/core/utils/array.h"
namespace paddle {
namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
template <typename T, size_t Rank>
__global__ void RollCudaKernel(const T* input, T* output, int64_t N,
phi::Array<int64_t, Rank> shifts,
phi::Array<int64_t, Rank> strides,
phi::Array<int64_t, Rank> sizes) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N) {
return;
}
int64_t output_idx = idx;
int64_t new_dim_idx = 0;
#pragma unroll
for (size_t i = 0; i < Rank; i++) {
new_dim_idx = (idx / strides[i]) % sizes[i] + shifts[i];
if (new_dim_idx >= sizes[i]) {
output_idx += (shifts[i] - sizes[i]) * strides[i];
} else {
output_idx += shifts[i] * strides[i];
}
}
output[output_idx] = input[idx];
}
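// Per rolled dimension the kernel above recovers the coordinate along that
// dimension as (idx / strides[i]) % sizes[i], adds the (already normalized,
// non-negative) shift, and wraps by subtracting the size on overflow. A
// host-side sketch of the same arithmetic for a 1-D tensor of size 5 rolled
// by 2 (illustrative only):
//
//   const int64_t size = 5, stride = 1, shift = 2;
//   for (int64_t idx = 0; idx < size; ++idx) {
//     int64_t dim_idx = (idx / stride) % size + shift;
//     int64_t out = (dim_idx >= size) ? idx + (shift - size) * stride
//                                     : idx + shift * stride;
//     // idx -> out: 0->2, 1->3, 2->4, 3->0, 4->1
//   }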
template <typename T>
class RollKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<LoDTensor>("X");
auto* out = context.Output<LoDTensor>("Out");
std::vector<int64_t> shifts = context.Attr<std::vector<int64_t>>("shifts");
if (context.HasInput("ShiftsTensor")) {
const auto* shifts_tensor =
context.Input<framework::Tensor>("ShiftsTensor");
PADDLE_ENFORCE_EQ(
shifts_tensor->dims().size(), 1,
platform::errors::InvalidArgument(
"The rank of ShiftsTensor is expected to be 1, got %s",
shifts_tensor->dims().size()));
shifts = GetDataFromTensor<int64_t>(shifts_tensor);
}
std::vector<int64_t> dims = context.Attr<std::vector<int64_t>>("axis");
auto* in_data = in->data<T>();
auto* out_data = out->mutable_data<T>(context.GetPlace());
int64_t numel = in->numel();
auto stream =
context.template device_context<platform::CUDADeviceContext>().stream();
size_t nums = shifts.size();
auto input_dim = in->dims();
auto stride_dim = phi::stride(input_dim);
std::vector<int64_t> strides(nums), sizes(nums);
if (dims.size() == 0) {
strides[0] = 1;
sizes[0] = numel;
shifts[0] = (shifts[0] % numel + numel) % numel;
} else {
for (size_t i = 0; i < nums; i++) {
int dim = dims[i] >= 0 ? dims[i] : dims[i] + input_dim.size();
int64_t size = input_dim[dim];
if (size != 0) {
shifts[i] = (shifts[i] % size + size) % size;
strides[i] = stride_dim[dim];
sizes[i] = size;
}
}
}
#define CALL_ROLL_CUDA_KERNEL(N) \
case N: { \
phi::Array<int64_t, N> _strides; \
phi::Array<int64_t, N> _shifts; \
phi::Array<int64_t, N> _sizes; \
for (size_t idx = 0; idx < N; ++idx) { \
_strides[idx] = strides[idx]; \
_shifts[idx] = shifts[idx]; \
_sizes[idx] = sizes[idx]; \
} \
RollCudaKernel< \
T, \
N><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, \
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, numel, \
_shifts, _strides, _sizes); \
break; \
}
switch (nums) {
CALL_ROLL_CUDA_KERNEL(1);
CALL_ROLL_CUDA_KERNEL(2);
CALL_ROLL_CUDA_KERNEL(3);
CALL_ROLL_CUDA_KERNEL(4);
CALL_ROLL_CUDA_KERNEL(5);
CALL_ROLL_CUDA_KERNEL(6);
CALL_ROLL_CUDA_KERNEL(7);
CALL_ROLL_CUDA_KERNEL(8);
CALL_ROLL_CUDA_KERNEL(9);
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"shifts.size() should be less than 10, But received shifts.size() "
"= %d",
shifts.size()));
}
}
};
template <typename T>
class RollGradKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<LoDTensor>(framework::GradVarName("Out"));
auto* out = context.Output<LoDTensor>(framework::GradVarName("X"));
std::vector<int64_t> shifts = context.Attr<std::vector<int64_t>>("shifts");
if (context.HasInput("ShiftsTensor")) {
const auto* shifts_tensor =
context.Input<framework::Tensor>("ShiftsTensor");
PADDLE_ENFORCE_EQ(
shifts_tensor->dims().size(), 1,
platform::errors::InvalidArgument(
"The rank of ShiftsTensor is expected to be 1, got %s",
shifts_tensor->dims().size()));
shifts = GetDataFromTensor<int64_t>(shifts_tensor);
}
std::vector<int64_t> dims = context.Attr<std::vector<int64_t>>("axis");
auto* in_data = in->data<T>();
auto* out_data = out->mutable_data<T>(context.GetPlace());
int64_t numel = in->numel();
auto stream =
context.template device_context<platform::CUDADeviceContext>().stream();
size_t nums = shifts.size();
auto input_dim = in->dims();
auto stride_dim = phi::stride(input_dim);
std::vector<int64_t> strides(nums), sizes(nums);
if (dims.size() == 0) {
strides[0] = 1;
sizes[0] = numel;
shifts[0] = ((-shifts[0]) % numel + numel) % numel;
} else {
for (size_t i = 0; i < nums; i++) {
int dim = dims[i] >= 0 ? dims[i] : dims[i] + input_dim.size();
int64_t size = input_dim[dim];
if (size != 0) {
shifts[i] = ((-shifts[i]) % size + size) % size;
strides[i] = stride_dim[dim];
sizes[i] = size;
}
}
}
switch (nums) {
CALL_ROLL_CUDA_KERNEL(1);
CALL_ROLL_CUDA_KERNEL(2);
CALL_ROLL_CUDA_KERNEL(3);
CALL_ROLL_CUDA_KERNEL(4);
CALL_ROLL_CUDA_KERNEL(5);
CALL_ROLL_CUDA_KERNEL(6);
CALL_ROLL_CUDA_KERNEL(7);
CALL_ROLL_CUDA_KERNEL(8);
CALL_ROLL_CUDA_KERNEL(9);
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"shifts.size() should be less than 10, But received shifts.size() "
"= %d",
shifts.size()));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
roll, ops::RollKernel<paddle::platform::CUDADeviceContext, float>,
ops::RollKernel<paddle::platform::CUDADeviceContext, double>,
ops::RollKernel<paddle::platform::CUDADeviceContext, int>,
ops::RollKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::RollKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::RollKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_CUDA_KERNEL(
roll_grad, ops::RollGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::RollGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::RollGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::RollGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::RollGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::RollGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/tril_triu_op.h"
#include <memory>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
...@@ -104,19 +104,3 @@ REGISTER_OPERATOR(tril_triu, ops::TrilTriuOp, ops::TrilTriuOpMaker,
                  ops::TrilTriuGradOpMaker<paddle::framework::OpDesc>,
                  ops::TrilTriuGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(tril_triu_grad, ops::TrilTriuGradOp);
REGISTER_OP_CPU_KERNEL(
tril_triu, ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, bool>,
ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, plat::float16>);
REGISTER_OP_CPU_KERNEL(
tril_triu_grad,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, bool>,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext,
plat::float16>);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/for_range.h"
namespace paddle {
namespace operators {
template <typename T>
class TrilTriuCompute {
public:
HOSTDEVICE TrilTriuCompute(const T* in, const int diagonal, const bool lower,
const int64_t H, const int64_t W, T* out)
: in_(in), diagonal_(diagonal), lower_(lower), H_(H), W_(W), out_(out) {}
HOSTDEVICE void operator()(int64_t idx) {
const int64_t row = (idx / W_) % H_;
const int64_t col = idx % W_;
const bool mask =
lower_ ? (col - row > diagonal_) : (col - row < diagonal_);
out_[idx] = mask ? static_cast<T>(0) : in_[idx];
}
private:
const T* in_;
const int diagonal_;
const bool lower_;
const int64_t H_;
const int64_t W_;
T* out_;
};
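// Applying the functor on the host makes the masking rule concrete: with
// lower = true and diagonal = 0, entries with col - row > 0 (strictly above
// the main diagonal) are zeroed, i.e. tril(). Illustrative sketch:
//
//   const float in[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
//   float out[9];
//   TrilTriuCompute<float> tril(in, /*diagonal=*/0, /*lower=*/true,
//                               /*H=*/3, /*W=*/3, out);
//   for (int64_t i = 0; i < 9; ++i) tril(i);
//   // out (row-major): 1 0 0
//   //                  4 5 0
//   //                  7 8 9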
template <typename DeviceContext, typename T>
class TrilTriuOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const auto* x = context.Input<framework::Tensor>("X");
const auto* x_data = x->data<T>();
auto* out = context.Output<framework::Tensor>("Out");
auto* out_data = out->mutable_data<T>(context.GetPlace());
const int diagonal = context.Attr<int>("diagonal");
const bool lower = context.Attr<bool>("lower");
const auto& dims = x->dims();
const auto H = dims[dims.size() - 2];
const auto W = dims[dims.size() - 1];
platform::ForRange<DeviceContext> for_range(
context.template device_context<DeviceContext>(),
static_cast<size_t>(x->numel()));
paddle::operators::TrilTriuCompute<T> tril_triu_computer(
x_data, diagonal, lower, H, W, out_data);
for_range(tril_triu_computer);
}
};
template <typename DeviceContext, typename T>
class TrilTriuGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const auto* d_out =
context.Input<framework::Tensor>(framework::GradVarName("Out"));
const auto* dout_data = d_out->data<T>();
auto* d_x = context.Output<framework::Tensor>(framework::GradVarName("X"));
auto* dx_data = d_x->mutable_data<T>(context.GetPlace());
const int diagonal = context.Attr<int>("diagonal");
const bool lower = context.Attr<bool>("lower");
const auto& dims = d_out->dims();
const auto H = dims[dims.size() - 2];
const auto W = dims[dims.size() - 1];
platform::ForRange<DeviceContext> for_range(
context.template device_context<DeviceContext>(),
static_cast<size_t>(d_out->numel()));
paddle::operators::TrilTriuCompute<T> tril_triu_grad_computer(
dout_data, diagonal, lower, H, W, dx_data);
for_range(tril_triu_grad_computer);
}
};
} // namespace operators
} // namespace paddle
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
......
...@@ -11,7 +11,7 @@ limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
namespace paddle {
......
...@@ -143,6 +143,7 @@ void BindNode(py::module *m) {
.def("var", &Node::Var, return_value_policy::reference)
.def("op", &Node::Op, return_value_policy::reference)
.def("id", &Node::id)
.def("graph_id", &Node::GraphId)
.def("original_desc_id", &Node::OriginalDescId) .def("original_desc_id", &Node::OriginalDescId)
.def("is_op", &Node::IsOp) .def("is_op", &Node::IsOp)
.def("is_var", &Node::IsVar) .def("is_var", &Node::IsVar)
......
...@@ -114,6 +114,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/metrics_py.h"
#include "paddle/fluid/pybind/ps_gpu_wrapper_py.h"
#include "paddle/fluid/pybind/pybind_boost_headers.h"
#include "paddle/phi/backends/device_manager.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/pybind/nccl_wrapper_py.h"
...@@ -742,6 +743,11 @@ PYBIND11_MODULE(core_noavx, m) {
// stored in this static instance to avoid illegal memory access.
m.def("clear_kernel_factory",
      []() { phi::KernelFactory::Instance().kernels().clear(); });
m.def("clear_device_manager", []() {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
phi::DeviceManager::Clear();
#endif
});
// NOTE(zjl): ctest would load environment variables at the beginning even
// though we have not `import paddle.fluid as fluid`. So we add this API
......
...@@ -134,6 +134,10 @@ mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const {
return DenseTensorType::get(
    parser.getContext(), *targetType, *precisionType, *layoutType);
}
if (keyword == "dense_tensor_map") {
return DenseTensorMapType::get(parser.getContext());
}
// Todo: parse other type
return mlir::Type();
}
...@@ -156,7 +160,7 @@ void InfrtDialect::printType(::mlir::Type type,
}
// print DenseTensorType, for example: !infrt.dense_tensor<CPU, FP32, NCHW>
if (type.isa<DenseTensorType>()) {
auto dense_tensor_type = type.cast<infrt::DenseTensorType>();
os << "dense_tensor<" << dense_tensor_type.getTarget() << ", "
   << dense_tensor_type.getPrecision() << ", "
...@@ -164,6 +168,12 @@ void InfrtDialect::printType(::mlir::Type type,
return;
}
// print DenseTensorMapType, for example: !infrt.dense_tensor_map
if (type.isa<DenseTensorMapType>()) {
os << "dense_tensor_map";
return;
}
llvm_unreachable("unknown infrt type."); llvm_unreachable("unknown infrt type.");
} }
......
...@@ -18,7 +18,7 @@
#include "paddle/infrt/common/global.h"
#include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h"
#include "paddle/infrt/dialect/mlir_loader.h"
#include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h"
int main(int argc, char** argv) {
static llvm::cl::opt<std::string> input_file(
......
...@@ -393,6 +393,11 @@ DeviceManager& DeviceManager::Instance() {
return platform_manager;
}
void DeviceManager::Clear() {
Instance().device_map_.clear();
Instance().device_impl_map_.clear();
}
std::vector<std::string> ListAllLibraries(const std::string& library_dir) {
std::vector<std::string> libraries;
std::regex express(".*\\.so");
......
...@@ -158,6 +158,8 @@ class DeviceManager {
static std::vector<size_t> GetDeviceList(const std::string& device_type);
static void Clear();
private:
DISABLE_COPY_AND_ASSIGN(DeviceManager);
DeviceManager() {}
......
...@@ -24,6 +24,10 @@ limitations under the License. */
namespace phi {
// Common InferMeta Functions for backward operators.
//
// NOTE: The InferMeta Functions in this file are arranged in alphabetic order.
void BilinearTensorProductGradInferMeta(const MetaTensor& x,
                                        const MetaTensor& y,
                                        const MetaTensor& weight,
......
...@@ -73,6 +73,51 @@ void AllValueCompareInferMeta(const MetaTensor& x,
out->set_dtype(DataType::BOOL);
}
void KLDivInferMeta(const MetaTensor& x,
const MetaTensor& label,
const std::string& reduction,
MetaTensor* out,
MetaConfig config) {
auto dim_x = x.dims();
auto dim_target = label.dims();
PADDLE_ENFORCE_EQ(dim_x.size(),
dim_target.size(),
phi::errors::InvalidArgument(
"Input(X) rank and Input(Target) rank should be "
"same, but received X rank(%d) != Target rank(%d)",
dim_x.size(),
dim_target.size()));
for (int i = 0; i < dim_x.size(); i++) {
if (config.is_runtime || (dim_x[i] > 0 && dim_target[i] > 0)) {
PADDLE_ENFORCE_EQ(
dim_x[i],
dim_target[i],
phi::errors::InvalidArgument(
"Input(X) and Input(Target) should in same shape. but received "
"X dimension[%d](%d) != Target dimension[%d](%d)",
i,
dim_x[i],
i,
dim_target[i]));
}
}
auto reduction_valid = "mean" == reduction || "sum" == reduction ||
"batchmean" == reduction || "none" == reduction;
PADDLE_ENFORCE_EQ(
reduction_valid,
true,
phi::errors::InvalidArgument(
"Attr(reduction) can only be 'none'|'batchmean'|'sum'|'mean'."));
if ("none" == reduction) {
out->set_dims(dim_x);
} else {
out->set_dims({1});
}
out->set_dtype(x.dtype());
}
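// Shape rule illustrated with hypothetical shapes: for x and label of shape
// [8, 100], reduction == "none" keeps the elementwise shape while the
// reducing modes collapse to a single value:
//
//   reduction = "none"                        -> out: [8, 100]
//   reduction = "mean" / "sum" / "batchmean"  -> out: [1]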
void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) {
out->share_meta(x);
}
...@@ -431,6 +476,55 @@ void ElementwiseRawInferMeta(const MetaTensor& x,
out->share_lod(x);
}
void GatherInferMeta(const MetaTensor& x,
const MetaTensor& index,
const Scalar& axis,
MetaTensor* out) {
auto index_dims = index.dims();
if (index_dims.size() == 2) {
PADDLE_ENFORCE_EQ(
index_dims[1],
1,
phi::errors::InvalidArgument(
"The last dim of index should be 1 when it is 2D, but we get %d",
index_dims[1]));
} else {
PADDLE_ENFORCE_EQ(
index_dims.size(),
1,
phi::errors::InvalidArgument(
"The index should be 1D, when it is not 2D, but we get %d",
index_dims.size()));
}
auto input_dim = x.dims();
auto axis_v = axis.to<int>();
if (axis.FromTensor() || axis_v == 0) {
// if axis.FromTensor(), we can not obtain correct shape of output
int batch_size = index_dims[0];
phi::DDim output_dims(input_dim);
output_dims[0] = batch_size;
out->set_dims(output_dims);
out->set_dtype(x.dtype());
out->share_lod(x);
} else {
int index_size = index_dims[0];
std::vector<int> out_dim_vec;
for (int i = 0; i < axis_v; i++) {
out_dim_vec.push_back(input_dim[i]);
}
out_dim_vec.push_back(index_size);
for (int i = axis_v + 1; i < input_dim.size(); i++) {
out_dim_vec.push_back(input_dim[i]);
}
auto output_dims = phi::make_ddim(out_dim_vec);
out->set_dims(output_dims);
out->set_dtype(x.dtype());
out->share_lod(x);
}
}
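// The non-zero-axis branch above implements the usual gather shape rule:
// dimension axis_v of x is replaced by the index length. A plain-C++ sketch
// of that rule (names here are illustrative, not part of phi):
//
//   std::vector<int64_t> GatherOutDims(const std::vector<int64_t>& x_dims,
//                                      int64_t index_size, int axis) {
//     std::vector<int64_t> out(x_dims.begin(), x_dims.begin() + axis);
//     out.push_back(index_size);
//     out.insert(out.end(), x_dims.begin() + axis + 1, x_dims.end());
//     return out;
//   }
//   // GatherOutDims({4, 5, 6}, 2, 1) -> {4, 2, 6}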
void GatherNdInferMeta(const MetaTensor& x,
                       const MetaTensor& index,
                       MetaTensor* out) {
...@@ -549,6 +643,49 @@ void IndexSampleInferMeta(const MetaTensor& x,
out->share_lod(y);
}
void IndexSelectInferMeta(const MetaTensor& x,
const MetaTensor& index,
int dim,
MetaTensor* output) {
auto input_dim = x.dims();
auto index_dim = index.dims();
PADDLE_ENFORCE_EQ(
dim < input_dim.size() && dim >= (0 - input_dim.size()),
true,
phi::errors::OutOfRange(
"Attr(dim) is out of range, It's expected "
"to be in range of [-%d, %d]. But received Attr(dim) = %d.",
input_dim.size(),
input_dim.size() - 1,
dim));
PADDLE_ENFORCE_EQ(
index_dim.size() == 1 || (index_dim.size() == 2 && index_dim[1] == 1),
true,
phi::errors::InvalidArgument(
"The 'shape' of Input(Index) must be 1-D tensor. "
"But received: the 'shape' of Input(Index) is [%s], "
"the dimension of Input(Index) is [%d].",
index_dim,
index_dim.size()));
PADDLE_ENFORCE_EQ(
index_dim[0] != 0,
true,
phi::errors::InvalidArgument("The length of Input(Index) can't be 0."));
auto output_dim = phi::vectorize(input_dim);
if (dim < 0) {
dim += input_dim.size();
}
output_dim[dim] = index_dim[0];
output->set_dims(phi::make_ddim(output_dim));
output->set_dtype(x.dtype());
output->set_layout(x.layout());
output->share_lod(x);
}
void LogLossInferMeta(const MetaTensor& input,
                      const MetaTensor& label,
                      float epsilon,
...@@ -813,6 +950,16 @@ void TriangularSolveInferMeta(const MetaTensor& x,
out->share_lod(y);
}
void ValueCompareInferMeta(const MetaTensor& x,
const MetaTensor& y,
MetaTensor* out,
MetaConfig config) {
detail::BinarySameInputDimsCheck(x, y, config);
out->set_dims(x.dims());
out->set_dtype(DataType::BOOL);
}
} // namespace phi
PD_REGISTER_INFER_META_FN(add_raw, phi::ElementwiseRawInferMeta);
...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/meta_tensor.h" #include "paddle/phi/core/meta_tensor.h"
namespace phi { namespace phi {
...@@ -28,12 +29,20 @@ namespace phi { ...@@ -28,12 +29,20 @@ namespace phi {
// NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good. // NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good.
// Because functions in this file not only can infer shape, but also need // Because functions in this file not only can infer shape, but also need
// infer lod or other useful data. // infer lod or other useful data.
//
// The InferMeta Functions in this file are arranged in alphabetic order.
void AllValueCompareInferMeta(const MetaTensor& x,
                              const MetaTensor& y,
                              MetaTensor* out,
                              MetaConfig config = MetaConfig());
void KLDivInferMeta(const MetaTensor& x,
const MetaTensor& label,
const std::string& reduction,
MetaTensor* out,
MetaConfig config = MetaConfig());
void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out);
void BCELossInferMeta(const MetaTensor& input,
...@@ -81,6 +90,11 @@ void ElementwiseRawInferMeta(const MetaTensor& x_meta,
                             int axis,
                             MetaTensor* out);
void GatherInferMeta(const MetaTensor& x,
const MetaTensor& index,
const Scalar& axis,
MetaTensor* out);
void GatherNdInferMeta(const MetaTensor& x,
                       const MetaTensor& index,
                       MetaTensor* out);
...@@ -101,6 +115,11 @@ void IndexSampleInferMeta(const MetaTensor& x,
                          MetaTensor* out,
                          MetaConfig config = MetaConfig());
void IndexSelectInferMeta(const MetaTensor& x,
const MetaTensor& index,
int dim,
MetaTensor* output);
void LogLossInferMeta(const MetaTensor& input,
                      const MetaTensor& label,
                      float epsilon,
...@@ -136,4 +155,9 @@ void TriangularSolveInferMeta(const MetaTensor& x,
                              bool unitriangular,
                              MetaTensor* out);
void ValueCompareInferMeta(const MetaTensor& x,
const MetaTensor& y,
MetaTensor* out,
MetaConfig config = MetaConfig());
} // namespace phi
...@@ -18,6 +18,23 @@ limitations under the License. */
#include "paddle/phi/core/meta_tensor.h"
namespace phi {
// Common InferMeta Functions for multiary operators, The format like:
//
// 1. The number of input MetaTensor is more than 3:
// void [FunctionDesc|OpName]InferMeta(const MetaTensor& x,
// const MetaTensor& y,
// const MetaTensor& z,
// const MetaTensor& w,
// ...,
// MetaTensor* out) {}
//
// 2. There are `const vector<MetaTensor*>&` in params:
// void [FunctionDesc|OpName]InferMeta(const vector<MetaTensor*>& x,
// ...,
// MetaTensor* out) {}
//
// NOTE: The InferMeta Functions in this file are arranged in alphabetic order.
std::vector<DDim> GetMetaTensorsDim(const std::vector<MetaTensor*>& tensors);
void AdadeltaInferMeta(const MetaTensor& param,
......
...@@ -27,6 +27,8 @@ namespace phi {
// NOTE: The name "InferShape" may not be appropriate. "InferMeta" may be good.
// Because functions in this file not only can infer shape, but also need
// infer lod or other useful data.
//
// The InferMeta Functions in this file are arranged in alphabetic order.
void CreateInferMeta(const ScalarArray& shape, DataType dtype, MetaTensor* out);
......
...@@ -322,6 +322,83 @@ void NllLossRawInferMeta(const MetaTensor& input,
total_weight->set_dtype(input.dtype());
}
void RoiAlignInferMeta(const MetaTensor& x,
const MetaTensor& boxes,
paddle::optional<const MetaTensor&> boxes_num,
int pooled_height,
int pooled_width,
float spatial_scale,
int sampling_ratio,
bool aligned,
MetaTensor* out,
MetaConfig config) {
auto input_dims = x.dims();
auto boxes_dims = boxes.dims();
if (boxes_num) {
auto boxes_num_dims = boxes_num->dims();
PADDLE_ENFORCE_EQ(
boxes_num_dims.size(),
1,
phi::errors::InvalidArgument("The size of RoisNum should be 1"
", but received size = %d",
boxes_num_dims.size()));
}
PADDLE_ENFORCE_EQ(input_dims.size(),
4,
phi::errors::InvalidArgument(
"The format of Input(X) in"
"RoIAlignOp is NCHW. And the rank of input must be 4. "
"But received rank = %d",
input_dims.size()));
PADDLE_ENFORCE_EQ(boxes_dims.size(),
2,
phi::errors::InvalidArgument("The rank of Input(ROIs) "
"in RoIAlignOp should be 2. "
"But the rank of RoIs is %d",
boxes_dims.size()));
if (config.is_runtime) {
PADDLE_ENFORCE_EQ(boxes_dims[1],
4,
phi::errors::InvalidArgument(
"The second dimension "
"of Input(ROIs) should be 4. But received the "
"dimension = %d",
boxes_dims[1]));
}
PADDLE_ENFORCE_GT(pooled_height,
0,
phi::errors::InvalidArgument(
"The 'pooled_height' attribute in RoIAlignOp is "
"invalid. The height must be greater than 0. But "
"received 'pooled_height' = %d",
pooled_height));
PADDLE_ENFORCE_GT(pooled_width,
0,
phi::errors::InvalidArgument(
"The 'pooled_width' attribute in RoIAlignOp is "
"invalid. The width must be greater than 0. But "
"received 'pooled_width' = %d",
pooled_width));
PADDLE_ENFORCE_GT(spatial_scale,
0.0f,
phi::errors::InvalidArgument(
"The 'spatial_scale' attribute in RoIAlignOp is "
"invalid. The scale must be greater than 0. But "
"received 'spatial_scale' = %f",
spatial_scale));
auto out_dims = input_dims;
out_dims[0] = boxes_dims[0];
out_dims[1] = input_dims[1];
out_dims[2] = pooled_height;
out_dims[3] = pooled_width;
out->set_dims(out_dims);
out->set_dtype(x.dtype());
}
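// Example of the resulting shape (hypothetical values): x: [2, 256, 64, 64],
// boxes: [100, 4], pooled_height = pooled_width = 7 -> out: [100, 256, 7, 7].
// The leading dimension comes from the number of boxes, not from x.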
void ScatterInferMeta(const MetaTensor& x,
                      const MetaTensor& index,
                      const MetaTensor& updates,
......
...@@ -30,6 +30,8 @@ namespace phi {
// Because functions in this file not only can infer shape, but also need
// infer lod or other useful data.
//
// The InferMeta Functions in this file are arranged in alphabetic order.
void AccuracyInferMeta(const MetaTensor& out,
                       const MetaTensor& indice,
                       const MetaTensor& label,
...@@ -71,6 +73,17 @@ void NllLossRawInferMeta(const MetaTensor& input,
                         MetaTensor* total_weight,
                         MetaConfig config = MetaConfig());
void RoiAlignInferMeta(const MetaTensor& x,
const MetaTensor& boxes,
paddle::optional<const MetaTensor&> boxes_num,
int pooled_height,
int pooled_width,
float spatial_scale,
int sampling_ratio,
bool aligned,
MetaTensor* out,
MetaConfig config = MetaConfig());
void ScatterInferMeta(const MetaTensor& x,
                      const MetaTensor& index,
                      const MetaTensor& updates,
......
...@@ -1016,6 +1016,37 @@ void ReshapeWithXShapeInferMeta(const MetaTensor& x,
ReshapeInferMeta(x, shape, out, config);
}
void RollInferMeta(const MetaTensor& x,
const ScalarArray& shifts,
const std::vector<int64_t>& axis,
MetaTensor* out) {
auto shifts_data = shifts.GetData();
if (axis.size() != 0) {
PADDLE_ENFORCE_EQ(
axis.size(),
shifts_data.size(),
phi::errors::InvalidArgument("When dims.size() != 0, dims.size() "
"should be equal to "
"shifts.size(). But received "
"dims.size() = %d, shifts.size() = %d",
axis.size(),
shifts_data.size()));
} else {
PADDLE_ENFORCE_EQ(
shifts_data.size(),
1,
phi::errors::InvalidArgument("When dims.size() == 0, shifts.size() "
"should be equal to 1, But received "
"shifts.size() = %d",
shifts_data.size()));
}
out->set_dims(x.dims());
out->share_lod(x);
out->set_dtype(x.dtype());
}
void ShapeInferMeta(const MetaTensor& input, MetaTensor* out) {
auto in_dim = input.dims();
out->set_dims(phi::make_ddim({in_dim.size()}));
......
...@@ -31,6 +31,8 @@ class MetaConfig;
// NOTE: The name "InferShape" may not be appropriate. "InferMeta" may be good.
// Because functions in this file not only can infer shape, but also need
// infer lod or other useful data.
//
// The InferMeta Functions in this file are arranged in alphabetic order.
void ArgMinMaxInferMeta(const MetaTensor& x,
                        int64_t axis,
...@@ -164,6 +166,11 @@ void ReshapeWithXShapeInferMeta(const MetaTensor& x,
                                MetaTensor* out,
                                MetaConfig config = MetaConfig());
void RollInferMeta(const MetaTensor& x,
const ScalarArray& shifts,
const std::vector<int64_t>& axis,
MetaTensor* out);
void ShapeInferMeta(const MetaTensor& input, MetaTensor* out);
void ShardIndexInferMeta(const MetaTensor& in,
......
...@@ -26,6 +26,23 @@ namespace phi {
                      const DenseTensor& dout, \
                      DenseTensor* dx);
#define DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(name, attr) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& dout, \
float attr, \
DenseTensor* dx);
#define DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(name, attr1, attr2) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& dout, \
float attr1, \
float attr2, \
DenseTensor* dx);
#define DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(name) \
  template <typename T, typename Context>           \
  void name##GradKernel(const Context& dev_ctx,     \
...@@ -33,6 +50,14 @@ namespace phi {
                        const DenseTensor& dout, \
                        DenseTensor* dx);
#define DECLARE_ACTIVATION_GRAD_KERNEL_WITH_ONE_ATTRS_DepOut(name, attr) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& out, \
const DenseTensor& dout, \
float attr, \
DenseTensor* dx);
template <typename T, typename Context>
void ReluDoubleGradKernel(const Context& dev_ctx,
                          const DenseTensor& out,
...@@ -58,21 +83,6 @@ void TanhTripleGradKernel(const Context& dev_ctx,
                          DenseTensor* d_dout,
                          DenseTensor* d_ddx);
template <typename T, typename Context>
void BReluGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& dout,
float t_min,
float t_max,
DenseTensor* dx);
template <typename T, typename Context>
void LeakyReluGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& dout,
float alpha,
DenseTensor* dx);
template <typename T, typename Context>
void LeakyReluDoubleGradKernel(const Context& dev_ctx,
                               const DenseTensor& x,
...@@ -81,11 +91,21 @@ void LeakyReluDoubleGradKernel(const Context& dev_ctx,
                               DenseTensor* ddout);
template <typename T, typename Context>
void EluGradKernel(const Context& dev_ctx,
                   const DenseTensor& x,
                   const DenseTensor& out,
                   const DenseTensor& dout,
                   float alpha,
                   DenseTensor* dx);
template <typename T, typename Context>
void EluDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& dout,
const DenseTensor& ddx,
float alpha,
DenseTensor* dx,
DenseTensor* ddout);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cos);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Tan);
...@@ -98,7 +118,17 @@ DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cosh);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Asinh);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acosh);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Atanh);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(TanhShrink);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Silu);
DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Relu);
DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Tanh);
DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu, alpha)
DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(ThresholdedRelu, threshold)
DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(SoftShrink, lambda)
DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(HardShrink, threshold)
DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(BRelu, t_min, t_max)
} // namespace phi
...@@ -24,6 +24,21 @@ namespace phi {
  void name##Kernel( \
      const Context& dev_ctx, const DenseTensor& x, DenseTensor* out);
#define DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(name, attr) \
template <typename T, typename Context> \
void name##Kernel(const Context& dev_ctx, \
const DenseTensor& x, \
float attr, \
DenseTensor* out);
#define DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(name, attr1, attr2) \
template <typename T, typename Context> \
void name##Kernel(const Context& dev_ctx, \
const DenseTensor& x, \
float attr1, \
float attr2, \
DenseTensor* out);
DECLARE_ACTIVATION_KERNEL(Cos)
DECLARE_ACTIVATION_KERNEL(Tan)
DECLARE_ACTIVATION_KERNEL(Acos)
...@@ -37,24 +52,15 @@ DECLARE_ACTIVATION_KERNEL(Acosh)
DECLARE_ACTIVATION_KERNEL(Atanh)
DECLARE_ACTIVATION_KERNEL(Relu)
DECLARE_ACTIVATION_KERNEL(Tanh)
DECLARE_ACTIVATION_KERNEL(TanhShrink)
DECLARE_ACTIVATION_KERNEL(Silu)
DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(LeakyRelu, alpha)
DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, threshold)
DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(SoftShrink, lambda)
DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(HardShrink, threshold)
DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Elu, alpha)
DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(BRelu, t_min, t_max)
template <typename T, typename Context>
void BReluKernel(const Context& dev_ctx,
const DenseTensor& x,
float t_min,
float t_max,
DenseTensor* out);
template <typename T, typename Context>
void LeakyReluKernel(const Context& dev_ctx,
const DenseTensor& x,
float alpha,
DenseTensor* out);
template <typename T, typename Context>
void ThresholdedReluKernel(const Context& dev_ctx,
const DenseTensor& x,
float threshold,
DenseTensor* out);
} // namespace phi
...@@ -21,101 +21,140 @@ limitations under the License. */
namespace phi {
#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(name, functor_class) \
  template <typename T, typename Context>                           \
  void name##GradKernel(const Context& dev_ctx,                     \
                        const DenseTensor& x,                       \
                        const DenseTensor& dout,                    \
                        DenseTensor* dx) {                          \
    funcs::functor_class<T> functor;                                \
    ActivationGradImpl<T, Context, funcs::functor_class<T>>(        \
        dev_ctx, &x, nullptr, &dout, dx, functor);                  \
  }
#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX( \
    name, functor_class, attr)                          \
  template <typename T, typename Context>               \
  void name##GradKernel(const Context& dev_ctx,         \
                        const DenseTensor& x,           \
                        const DenseTensor& dout,        \
                        float attr,                     \
                        DenseTensor* dx) {              \
    funcs::functor_class<T> functor;                    \
    auto attrs = functor.GetAttrs();                    \
    *(attrs[0].second) = attr;                          \
    ActivationGradImpl<T, Context, funcs::functor_class<T>>( \
        dev_ctx, &x, nullptr, &dout, dx, functor);      \
  }
#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX( \
    name, functor_class, attr1, attr2)                  \
  template <typename T, typename Context>               \
  void name##GradKernel(const Context& dev_ctx,         \
                        const DenseTensor& x,           \
                        const DenseTensor& dout,        \
                        float attr1,                    \
                        float attr2,                    \
                        DenseTensor* dx) {              \
    funcs::functor_class<T> functor;                    \
    auto attrs = functor.GetAttrs();                    \
    *(attrs[0].second) = attr1;                         \
    *(attrs[1].second) = attr2;                         \
    ActivationGradImpl<T, Context, funcs::functor_class<T>>( \
        dev_ctx, &x, nullptr, &dout, dx, functor);      \
  }
#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(name, functor_class) \
  template <typename T, typename Context>                             \
  void name##GradKernel(const Context& dev_ctx,                       \
                        const DenseTensor& out,                       \
                        const DenseTensor& dout,                      \
                        DenseTensor* dx) {                            \
    funcs::functor_class<T> functor;                                  \
    ActivationGradImpl<T, Context, funcs::functor_class<T>>(          \
        dev_ctx, nullptr, &out, &dout, dx, functor);                  \
  }
#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT( \
    name, functor_class, attr)                            \
  template <typename T, typename Context>                 \
  void name##GradKernel(const Context& dev_ctx,           \
                        const DenseTensor& out,           \
                        const DenseTensor& dout,          \
                        float attr,                       \
                        DenseTensor* dx) {                \
    funcs::functor_class<T> functor;                      \
    auto attrs = functor.GetAttrs();                      \
    *(attrs[0].second) = attr;                            \
    ActivationGradImpl<T, Context, funcs::functor_class<T>>( \
        dev_ctx, nullptr, &out, &dout, dx, functor);      \
  }
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Cos, CosGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Tan, TanGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Acos, AcosGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, funcs::SinGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Sin, SinGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, funcs::AsinGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Asin, AsinGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, funcs::AtanGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Atan, AtanGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, funcs::SinhGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Sinh, SinhGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, funcs::CoshGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Cosh, CoshGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, funcs::AsinhGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Asinh, AsinhGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, funcs::AcoshGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Acosh, AcoshGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, funcs::AtanhGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, AtanhGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, TanhShrinkGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, funcs::ReluGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, SiluGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(Tanh, funcs::TanhGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, ReluGradFunctor);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu, DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, TanhGradFunctor);
funcs::LeakyReluGradFunctor,
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
LeakyReluGradFunctor,
alpha); alpha);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX( DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu,
ThresholdedRelu, funcs::ThresholdedReluGradFunctor, threshold); ThresholdedReluGradFunctor,
threshold);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(BRelu, DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink,
funcs::BReluGradFunctor, SoftShrinkGradFunctor,
lambda);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink,
HardShrinkGradFunctor,
threshold);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu,
BReluGradFunctor,
t_min, t_min,
t_max); t_max);
template <typename T, typename Context>
void EluGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& out,
const DenseTensor& dout,
float alpha,
DenseTensor* dx) {
dev_ctx.template Alloc<T>(dx);
auto x_flatten =
EigenVector<T>::Flatten(GET_DATA_SAFELY(&x, "Input", "X", "elu_grad"));
auto out_flatten = EigenVector<T>::Flatten(
GET_DATA_SAFELY(&out, "Input", "Out", "elu_grad"));
auto dout_flatten = EigenVector<T>::Flatten(
GET_DATA_SAFELY(&dout, "Input", "dOut", "elu_grad"));
auto dx_flatten =
EigenVector<T>::Flatten(GET_DATA_SAFELY(dx, "Output", "dX", "elu_grad"));
auto* place = dev_ctx.eigen_device();
if (alpha > 0) {
funcs::ELUGradFunctor<T> functor;
functor.alpha = alpha;
functor(*place, x_flatten, out_flatten, dout_flatten, dx_flatten);
} else {
funcs::ELUGradNegativeAlphaFunctor<T> functor;
functor.alpha = alpha;
functor(*place, x_flatten, out_flatten, dout_flatten, dx_flatten);
}
}
} // namespace phi } // namespace phi
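To make the attribute plumbing concrete: each functor exposes pointers to its attribute fields through GetAttrs(), and the generated kernel writes the runtime value through attrs[i].second before dispatching. Expanding DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink, HardShrinkGradFunctor, threshold) gives:

    template <typename T, typename Context>
    void HardShrinkGradKernel(const Context& dev_ctx,
                              const DenseTensor& x,
                              const DenseTensor& dout,
                              float threshold,
                              DenseTensor* dx) {
      funcs::HardShrinkGradFunctor<T> functor;
      auto attrs = functor.GetAttrs();  // pairs of (attr name, float*)
      *(attrs[0].second) = threshold;   // inject the runtime attribute
      ActivationGradImpl<T, Context, funcs::HardShrinkGradFunctor<T>>(
          dev_ctx, &x, nullptr, &dout, dx, functor);
    }

EluGradKernel is written out by hand instead, because it reads both x and out and switches functors on the sign of alpha, which none of the macros cover.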
 PD_REGISTER_KERNEL(
@@ -144,6 +183,11 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(brelu_grad, BReluGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_grad, LeakyReluGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad,
                                    ThresholdedReluGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(soft_shrink_grad, SoftShrinkGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_shrink_grad, HardShrinkGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_shrink_grad, TanhShrinkGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(silu_grad, SiluGradKernel)
 PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(relu_double_grad,
                                           ReluDoubleGradKernel)
@@ -151,6 +195,7 @@ PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(tanh_double_grad,
                                           TanhDoubleGradKernel)
 PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(leaky_relu_double_grad,
                                           LeakyReluDoubleGradKernel)
+PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(elu_double_grad, EluDoubleGradKernel)
 
 PD_REGISTER_KERNEL(tanh_triple_grad,
                    CPU,
......
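PD_REGISTER_ACTIVATION_GRAD_KERNEL is defined outside this excerpt; assuming it parallels the PD_REGISTER_ACTIVATION_KERNEL helper shown in the next file (CPU backend, all layouts, float and double instantiations), the new elu_grad line would expand to roughly:

    // Hypothetical expansion of
    // PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel).
    PD_REGISTER_KERNEL(
        elu_grad, CPU, ALL_LAYOUT, phi::EluGradKernel, float, double) {}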
@@ -19,78 +19,93 @@ limitations under the License. */
 
 namespace phi {
 
 #define DEFINE_CPU_ACTIVATION_KERNEL(name, functor_class)                  \
   template <typename T, typename Context>                                  \
   void name##Kernel(                                                       \
       const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) {    \
-    functor_class functor;                                                 \
-    ActivationImpl<T, Context, functor_class>(dev_ctx, x, out, functor);   \
+    funcs::functor_class<T> functor;                                       \
+    ActivationImpl<T, Context, funcs::functor_class<T>>(                   \
+        dev_ctx, x, out, functor);                                         \
   }
 
 #define DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr)    \
   template <typename T, typename Context>                                  \
   void name##Kernel(const Context& dev_ctx,                                \
                     const DenseTensor& x,                                  \
                     float attr,                                            \
                     DenseTensor* out) {                                    \
-    functor_class<T> functor;                                              \
+    funcs::functor_class<T> functor;                                       \
     auto attrs = functor.GetAttrs();                                       \
     *(attrs[0].second) = attr;                                             \
-    ActivationImpl<T, Context, functor_class<T>>(dev_ctx, x, out, functor); \
+    ActivationImpl<T, Context, funcs::functor_class<T>>(                   \
+        dev_ctx, x, out, functor);                                         \
   }
 
 #define DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(                              \
     name, functor_class, attr1, attr2)                                     \
   template <typename T, typename Context>                                  \
   void name##Kernel(const Context& dev_ctx,                                \
                     const DenseTensor& x,                                  \
                     float attr1,                                           \
                     float attr2,                                           \
                     DenseTensor* out) {                                    \
-    functor_class<T> functor;                                              \
+    funcs::functor_class<T> functor;                                       \
     auto attrs = functor.GetAttrs();                                       \
     *(attrs[0].second) = attr1;                                            \
     *(attrs[1].second) = attr2;                                            \
-    ActivationImpl<T, Context, functor_class<T>>(dev_ctx, x, out, functor); \
+    ActivationImpl<T, Context, funcs::functor_class<T>>(                   \
+        dev_ctx, x, out, functor);                                         \
   }
 
-DEFINE_CPU_ACTIVATION_KERNEL(Sin, funcs::SinFunctor<T>)
-DEFINE_CPU_ACTIVATION_KERNEL(Cos, funcs::CosFunctor<T>)
-DEFINE_CPU_ACTIVATION_KERNEL(Tan, funcs::TanFunctor<T>)
-DEFINE_CPU_ACTIVATION_KERNEL(Asin, funcs::AsinFunctor<T>)
-DEFINE_CPU_ACTIVATION_KERNEL(Atan, funcs::AtanFunctor<T>)
-DEFINE_CPU_ACTIVATION_KERNEL(Acos, funcs::AcosFunctor<T>)
-DEFINE_CPU_ACTIVATION_KERNEL(Sinh, funcs::SinhFunctor<T>)
-DEFINE_CPU_ACTIVATION_KERNEL(Cosh, funcs::CoshFunctor<T>)
-DEFINE_CPU_ACTIVATION_KERNEL(Asinh, funcs::AsinhFunctor<T>)
-DEFINE_CPU_ACTIVATION_KERNEL(Acosh, funcs::AcoshFunctor<T>)
-DEFINE_CPU_ACTIVATION_KERNEL(Atanh, funcs::AtanhFunctor<T>)
-DEFINE_CPU_ACTIVATION_KERNEL(Relu, funcs::ReluCPUFunctor<T>)
-DEFINE_CPU_ACTIVATION_KERNEL(Tanh, funcs::TanhFunctor<T>)
-DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, funcs::LeakyReluFunctor, alpha)
-DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu,
-                                     funcs::ThresholdedReluFunctor,
-                                     threshold)
-DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, funcs::BReluFunctor, t_min, t_max)
+DEFINE_CPU_ACTIVATION_KERNEL(Sin, SinFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Cos, CosFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Tan, TanFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Asin, AsinFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Atan, AtanFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Acos, AcosFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Sinh, SinhFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Cosh, CoshFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Asinh, AsinhFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Acosh, AcoshFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Atanh, AtanhFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Relu, ReluCPUFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Tanh, TanhFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(TanhShrink, TanhShrinkFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Silu, SiluFunctor)
+DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, LeakyReluFunctor, alpha)
+DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu,
+                                     ThresholdedReluFunctor,
+                                     threshold)
+DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(HardShrink, HardShrinkFunctor, threshold)
+DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, SoftShrinkFunctor, lambda)
+DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, ELUFunctor, alpha)
+DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, BReluFunctor, t_min, t_max)
 
 }  // namespace phi
 
 PD_REGISTER_KERNEL(relu, CPU, ALL_LAYOUT, phi::ReluKernel, float, double) {}
 
 #define PD_REGISTER_ACTIVATION_KERNEL(name, func)                            \
-  PD_REGISTER_KERNEL(name, CPU, ALL_LAYOUT, phi::func##Kernel, float, double) {}
+  PD_REGISTER_KERNEL(name, CPU, ALL_LAYOUT, phi::func, float, double) {}
 
-PD_REGISTER_ACTIVATION_KERNEL(sin, Sin)
-PD_REGISTER_ACTIVATION_KERNEL(cos, Cos)
-PD_REGISTER_ACTIVATION_KERNEL(tan, Tan)
-PD_REGISTER_ACTIVATION_KERNEL(acos, Acos)
-PD_REGISTER_ACTIVATION_KERNEL(asin, Asin)
-PD_REGISTER_ACTIVATION_KERNEL(atan, Atan)
-PD_REGISTER_ACTIVATION_KERNEL(sinh, Sinh)
-PD_REGISTER_ACTIVATION_KERNEL(cosh, Cosh)
-PD_REGISTER_ACTIVATION_KERNEL(asinh, Asinh)
-PD_REGISTER_ACTIVATION_KERNEL(acosh, Acosh)
-PD_REGISTER_ACTIVATION_KERNEL(atanh, Atanh)
-PD_REGISTER_ACTIVATION_KERNEL(tanh, Tanh)
-PD_REGISTER_ACTIVATION_KERNEL(brelu, BRelu)
-PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyRelu)
-PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedRelu)
+PD_REGISTER_ACTIVATION_KERNEL(sin, SinKernel)
+PD_REGISTER_ACTIVATION_KERNEL(cos, CosKernel)
+PD_REGISTER_ACTIVATION_KERNEL(tan, TanKernel)
+PD_REGISTER_ACTIVATION_KERNEL(acos, AcosKernel)
+PD_REGISTER_ACTIVATION_KERNEL(asin, AsinKernel)
+PD_REGISTER_ACTIVATION_KERNEL(atan, AtanKernel)
+PD_REGISTER_ACTIVATION_KERNEL(sinh, SinhKernel)
+PD_REGISTER_ACTIVATION_KERNEL(cosh, CoshKernel)
+PD_REGISTER_ACTIVATION_KERNEL(asinh, AsinhKernel)
+PD_REGISTER_ACTIVATION_KERNEL(acosh, AcoshKernel)
+PD_REGISTER_ACTIVATION_KERNEL(atanh, AtanhKernel)
+PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel)
+PD_REGISTER_ACTIVATION_KERNEL(brelu, BReluKernel)
+PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel)
+PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel)
+PD_REGISTER_ACTIVATION_KERNEL(hard_shrink, HardShrinkKernel)
+PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel)
+PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel)
+PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel)
+PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel)
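Note the registration helper no longer pastes a Kernel suffix onto func; callers now pass the full kernel symbol, which keeps each registration line greppable for the actual function name. For example, PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel) expands to:

    PD_REGISTER_KERNEL(silu, CPU, ALL_LAYOUT, phi::SiluKernel, float, double) {}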
@@ -13,6 +13,8 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/lgamma_kernel.h"
 
+#include <unsupported/Eigen/SpecialFunctions>
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/for_range.h"
......
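The new include pulls in Eigen's special-functions module. The functor body falls outside this hunk; presumably (given the for_range.h include) the CPU kernel maps each element through Eigen's log-gamma with an index functor driven by funcs::ForRange, along these lines:

    // Hypothetical sketch of the CPU lgamma functor; the real body is not
    // shown in this diff. Eigen::numext::lgamma is provided via
    // <unsupported/Eigen/SpecialFunctions>.
    template <typename T>
    struct LgammaFunctor {
      LgammaFunctor(const T* input, T* output)
          : input_(input), output_(output) {}
      HOSTDEVICE void operator()(int64_t idx) const {
        output_[idx] = Eigen::numext::lgamma(input_[idx]);
      }
      const T* input_;
      T* output_;
    };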
@@ -23,7 +23,7 @@
 #include "paddle/phi/kernels/funcs/elementwise_base.h"
 #include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h"
 #include "paddle/phi/kernels/math_kernel.h"
-#include "paddle/phi/kernels/reduce_max_kernel.h"
+#include "paddle/phi/kernels/reduce_kernel.h"
 
 namespace phi {
......