"paddle/phi/kernels/tril_grad_kernel.h" does not exist at "896a37b6e3953f2093d7608100539c5c1c50fc36"
Commit c1c6b869 authored by phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_some_yaml_config

@@ -1256,7 +1256,7 @@ if __name__ == "__main__":
        # Node Definition Generation
        definition_declaration_pair = GenerateForwardDefinition(
            fwd_api_name, bwd_api_name, forward_inputs_position_map,
-           forward_outputs_position_map, forward_attrs_list,
+           forward_outputs_position_map, orig_forward_attrs_list,
            backward_fwd_input_map, backward_grad_input_map,
            backward_grad_output_map, backward_attrs_list, optional_inputs,
            intermediate_outputs)
@@ -1268,7 +1268,7 @@ if __name__ == "__main__":
        # For python-level API dispatch
        CollectCoreOpsInformation(fwd_api_name, forward_inputs_position_map,
                                  forward_outputs_position_map,
-                                 forward_attrs_list)
+                                 orig_forward_attrs_list)
        if len(namespace) > 0:
            forward_definition_str += f"""namespace {namespace} {{
...
@@ -34,6 +34,14 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place,
    return;
  }
// NOTE(hqp): Special case for CPU->MLU, avoid stream sync.
if (platform::is_cpu_place(in.place()) && platform::is_mlu_place(dst_place)) {
paddle::framework::TensorCopy(
in, dst_place, *platform::DeviceContextPool::Instance().Get(dst_place),
out);
return;
}
  // NOTE(yy): TransDataDevice should wait for computation of input.
  if (!platform::is_cuda_pinned_place(in.place())) {
    platform::DeviceContextPool::Instance().Get(in.place())->Wait();
...
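The hunk above adds a CPU->MLU fast path that copies directly instead of first synchronizing on the source place. A minimal standalone sketch of that control flow, assuming made-up Place/copy/wait helpers rather than Paddle's real DeviceContext machinery:

#include <iostream>

// Hypothetical stand-ins for Paddle's Place / DeviceContext machinery.
enum class Place { kCPU, kGPU, kMLU };

static void wait_for_device(Place p) {
  std::cout << "sync stream on source place " << static_cast<int>(p) << "\n";
}

static void fake_copy(Place src, Place dst) {
  std::cout << "copy " << static_cast<int>(src) << " -> "
            << static_cast<int>(dst) << "\n";
}

// Mirrors the shape of the patched TransDataDevice: a special case that
// skips the source-stream sync for CPU -> MLU, and the generic path that
// waits on the source device before copying.
void trans_data_device(Place src, Place dst) {
  if (src == dst) return;
  if (src == Place::kCPU && dst == Place::kMLU) {  // fast path from the diff
    fake_copy(src, dst);
    return;
  }
  wait_for_device(src);  // generic path: wait for the producer, then copy
  fake_copy(src, dst);
}

int main() {
  trans_data_device(Place::kCPU, Place::kMLU);  // no sync
  trans_data_device(Place::kGPU, Place::kCPU);  // sync, then copy
}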
@@ -95,6 +95,7 @@ std::map<std::string, std::vector<ir::Node *>> Graph::InitFromBlock(
  std::unordered_map<std::string, std::pair<VarDesc *, int>>
      name_to_desc_block_id;
block_id_ = block.ID();
  const BlockDesc *block_var_visible = &block;
  while (block_var_visible != nullptr) {
    for (auto *var : block_var_visible->AllVars()) {
...
@@ -230,6 +230,7 @@ class Graph {
    auto *x =
        AddNode(new ir::Node(var_desc, block_id == -1 ? block_id_ : block_id));
    x->SetId(num_node_created_++);
x->SetGraphId(block_id_);
    return x;
  }
@@ -245,6 +246,7 @@ class Graph {
                          "The OpDesc used to create operator node is null."));
    auto *x = AddNode(new ir::Node(op_desc));
    x->SetId(num_node_created_++);
x->SetGraphId(block_id_);
    return x;
  }
@@ -263,6 +265,7 @@ class Graph {
                        num_node_created_);
    auto *x = AddNode(new ir::Node(name, ir::Node::Type::kVariable, block_id_));
    x->SetId(num_node_created_++);
x->SetGraphId(block_id_);
    return x;
  }
@@ -276,6 +279,7 @@ class Graph {
    }
    auto *x = AddNode(new ir::Node(name, type, block_id_));
    x->SetId(num_node_created_++);
x->SetGraphId(block_id_);
    return x;
  }
...
@@ -125,6 +125,7 @@ class Node {
  // Only use this for auto parallel.
  // A node does not have original desc if the return is zero.
  uint64_t OriginalDescId() const { return original_desc_id_; }
int GraphId() const { return graph_id_; }
  bool IsOp() const { return type_ == Type::kOperation; }
  bool IsVar() const { return type_ == Type::kVariable; }
@@ -246,10 +247,12 @@ class Node {
  // Store the original id of var desc or op desc.
  // Only use this for auto parallel.
  uint64_t original_desc_id_{0};
int graph_id_{-1};
 private:
  // ID can only set by a Graph.
  void SetId(int id) { id_ = id; }
void SetGraphId(int graph_id) { graph_id_ = graph_id; }
  // desc_order can only set by a Graph when constructing a Graph from a
  // BlockDesc.
...
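The node.h part of this change stores the owning block/graph id on every Node and, like SetId, keeps the setter private so only a Graph can assign it. A toy sketch of that ownership pattern with illustrative names (not Paddle's actual classes):

#include <cassert>
#include <memory>
#include <vector>

class Graph;  // forward declaration so Node can befriend it

class Node {
 public:
  int Id() const { return id_; }
  int GraphId() const { return graph_id_; }  // -1 until a Graph adopts the node

 private:
  // Only a Graph may assign ids, mirroring SetId/SetGraphId in the diff.
  void SetId(int id) { id_ = id; }
  void SetGraphId(int graph_id) { graph_id_ = graph_id; }

  int id_{-1};
  int graph_id_{-1};
  friend class Graph;
};

class Graph {
 public:
  explicit Graph(int block_id) : block_id_(block_id) {}

  Node* AddNode() {
    nodes_.push_back(std::make_unique<Node>());
    Node* n = nodes_.back().get();
    n->SetId(num_node_created_++);
    n->SetGraphId(block_id_);  // every node remembers which graph owns it
    return n;
  }

 private:
  int block_id_;
  int num_node_created_{0};
  std::vector<std::unique_ptr<Node>> nodes_;
};

int main() {
  Graph g(/*block_id=*/3);
  Node* n = g.AddNode();
  assert(n->Id() == 0 && n->GraphId() == 3);
}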
@@ -1456,7 +1456,8 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
    kernel_iter = kernels.find(expected_kernel_key);
  }
#endif
-#ifdef PADDLE_WITH_XPU
+#if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP)
  if (platform::is_xpu_place(expected_kernel_key.place_) &&
      (kernel_iter == kernels.end() ||
       !paddle::platform::is_xpu_support_op(type_, expected_kernel_key) ||
@@ -1470,18 +1471,37 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
#endif
#ifdef PADDLE_WITH_XPU_KP
if (paddle::platform::is_xpu_place(expected_kernel_key.place_)) {
    bool use_xpu_kp_kernel_rt =
        FLAGS_run_kp_kernel &&
        paddle::platform::is_xpu_kp_support_op(type_, expected_kernel_key);
    bool use_xpu_kp_kernel_debug =
        paddle::platform::is_in_xpu_kpwhite_list(type_);
-  if (platform::is_xpu_place(expected_kernel_key.place_) &&
-      (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug)) {
+    if (use_xpu_kp_kernel_rt) {
+      VLOG(3) << "xpu_kp using rt mode ";
}
if (use_xpu_kp_kernel_debug) {
VLOG(3) << "xpu_kp using debug mode ";
}
bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug);
if (is_xpu_kp_support) {
      expected_kernel_key.library_type_ = LibraryType::kKP;
      kernel_iter = kernels.find(expected_kernel_key);
      VLOG(3) << "using XPU KP kernel: " << type_
              << ", using_kernel_key:" << expected_kernel_key;
    }
bool is_xpu_unsupport =
(!paddle::platform::is_xpu_support_op(type_, expected_kernel_key) ||
paddle::platform::is_in_xpu_black_list(type_));
if (!is_xpu_kp_support &&
(kernel_iter == kernels.end() || is_xpu_unsupport)) {
VLOG(3) << "missing XPU kernel: " << type_
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
}
#endif
#ifdef PADDLE_WITH_IPU
...
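The ChooseKernel hunk reduces to a three-way decision: use a KP kernel when the runtime flag or the KP whitelist allows it, otherwise keep the plain XPU kernel, and fall back to CPU when the op is blacklisted, unsupported, or simply not registered for XPU. A compact sketch of that decision table in plain C++, with booleans standing in for the flag, whitelist, blacklist, and registry lookups:

#include <iostream>

enum class KernelChoice { kXPU_KP, kXPU, kCPUFallback };

// Inputs mirror the conditions in the diff; in Paddle they come from
// FLAGS_run_kp_kernel, is_xpu_kp_support_op, is_in_xpu_kpwhite_list,
// is_xpu_support_op / the black list, and the kernel registry lookup.
KernelChoice choose_kernel(bool run_kp_flag, bool kp_supported,
                           bool kp_whitelisted, bool xpu_supported,
                           bool xpu_blacklisted, bool xpu_kernel_registered) {
  bool use_kp_rt = run_kp_flag && kp_supported;
  bool use_kp_debug = kp_whitelisted;
  bool is_xpu_kp_support = use_kp_rt || use_kp_debug;
  if (is_xpu_kp_support) return KernelChoice::kXPU_KP;

  bool is_xpu_unsupport = !xpu_supported || xpu_blacklisted;
  if (!xpu_kernel_registered || is_xpu_unsupport)
    return KernelChoice::kCPUFallback;  // "fallbacking to CPU one!"
  return KernelChoice::kXPU;
}

int main() {
  // 0: the KP kernel wins when the flag and KP support line up.
  std::cout << static_cast<int>(
                   choose_kernel(true, true, false, true, false, true))
            << "\n";
  // 2: an op unsupported on XPU falls back to the CPU kernel.
  std::cout << static_cast<int>(
                   choose_kernel(false, false, false, false, false, true))
            << "\n";
}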
@@ -1224,8 +1224,12 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
  proto::VarType::TensorDesc desc;
  {  // int32_t size
    // proto buffer
-    int32_t size;
+    int32_t size = -1;
    is.read(reinterpret_cast<char*>(&size), sizeof(size));
PADDLE_ENFORCE_EQ(is.good(), true, platform::errors::Unavailable(
"Cannot read tensor desc size"));
PADDLE_ENFORCE_GE(size, 0, platform::errors::InvalidArgument(
"Tensor desc size should >= 0"));
    std::unique_ptr<char[]> buf(new char[size]);
    is.read(reinterpret_cast<char*>(buf.get()), size);
    PADDLE_ENFORCE_EQ(
...
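The TensorFromStream change initializes the size field and checks both the stream state and the value before allocating, so a truncated or corrupted stream produces a clear error instead of an oversized or undefined allocation. A standalone sketch of the same defensive pattern for a length-prefixed blob, using plain exceptions in place of PADDLE_ENFORCE:

#include <cstdint>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

// Read a length-prefixed blob: a native-endian int32 size followed by
// `size` bytes. Mirrors the validation added in the diff: check the stream
// after reading the prefix and reject negative sizes before allocating.
std::vector<char> ReadSizedBlob(std::istream& is) {
  int32_t size = -1;  // initialized, like `int32_t size = -1;` in the patch
  is.read(reinterpret_cast<char*>(&size), sizeof(size));
  if (!is.good()) throw std::runtime_error("Cannot read blob size");
  if (size < 0) throw std::runtime_error("Blob size should be >= 0");

  std::vector<char> buf(static_cast<size_t>(size));
  is.read(buf.data(), size);
  if (is.gcount() != size) throw std::runtime_error("Blob truncated");
  return buf;
}

int main() {
  std::string payload = "hello";
  int32_t n = static_cast<int32_t>(payload.size());
  std::ostringstream os;
  os.write(reinterpret_cast<const char*>(&n), sizeof(n));
  os << payload;

  std::istringstream is(os.str());
  std::vector<char> blob = ReadSizedBlob(is);
  return blob.size() == payload.size() ? 0 : 1;
}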
@@ -124,7 +124,7 @@ AmpOperators::AmpOperators()
      OpSupportedInfos("GPU", paddle::framework::proto::VarType::BF16));
  unsupported_bf16_ops_->insert(unsupported_ops_gpu_bf16.begin(),
                                unsupported_ops_gpu_bf16.end());
-// NOTE: GPU/NPU/XPU is compiled seperatly.
+// NOTE: GPU/NPU/XPU/MLU is compiled seperatly.
#elif defined(PADDLE_WITH_ASCEND_CL)
  auto unsupported_ops_npu_fp16 = std::get<2>(
      OpSupportedInfos("NPU", paddle::framework::proto::VarType::FP16));
@@ -143,6 +143,15 @@ AmpOperators::AmpOperators()
      OpSupportedInfos("XPU", paddle::framework::proto::VarType::BF16));
  unsupported_bf16_ops_->insert(unsupported_ops_xpu_bf16.begin(),
                                unsupported_ops_xpu_bf16.end());
#elif defined(PADDLE_WITH_MLU)
auto unsupported_ops_mlu_fp16 = std::get<2>(
OpSupportedInfos("MLU", paddle::framework::proto::VarType::FP16));
unsupported_fp16_ops_->insert(unsupported_ops_mlu_fp16.begin(),
unsupported_ops_mlu_fp16.end());
auto unsupported_ops_mlu_bf16 = std::get<2>(
OpSupportedInfos("MLU", paddle::framework::proto::VarType::BF16));
unsupported_bf16_ops_->insert(unsupported_ops_mlu_bf16.begin(),
unsupported_ops_mlu_bf16.end());
#endif
  VLOG(4) << allow_ops_->size() << " " << block_ops_->size() << " "
          << unsupported_fp16_ops_->size() << " "
@@ -210,6 +219,7 @@ inline bool NeedCast(const std::shared_ptr<VarType>& var) {
  if (paddle::platform::is_gpu_place(place) ||
      paddle::platform::is_cuda_pinned_place(place) ||
      paddle::platform::is_xpu_place(place) ||
paddle::platform::is_mlu_place(place) ||
      paddle::platform::is_npu_place(place) ||
      paddle::platform::is_npu_pinned_place(place)) {
    // CudaPinndePlace is added for varbase created by dataloader
...
@@ -234,7 +234,7 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
  auto& kernels = kernels_iter->second;
  auto kernel_iter = kernels.find(expected_kernel_key);
-#ifdef PADDLE_WITH_XPU
+#if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP)
  if (paddle::platform::is_xpu_place(expected_kernel_key.place_) &&
      (kernel_iter == kernels.end() || is_xpu_unsupport)) {
    VLOG(3) << "missing XPU kernel: " << op.Type()
@@ -243,11 +243,10 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
    expected_kernel_key.place_ = platform::CPUPlace();
    kernel_iter = kernels.find(expected_kernel_key);
  }
#endif
#ifdef PADDLE_WITH_XPU_KP
-  expected_kernel_key.place_ = platform::XPUPlace();
+  if (paddle::platform::is_xpu_place(expected_kernel_key.place_)) {
    bool use_xpu_kp_kernel_rt =
        FLAGS_run_kp_kernel &&
        paddle::platform::is_xpu_kp_support_op(op.Type(), expected_kernel_key);
@@ -259,14 +258,22 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
    if (use_xpu_kp_kernel_debug) {
      VLOG(3) << "xpu_kp using debug mode ";
    }
-  if (paddle::platform::is_xpu_place(expected_kernel_key.place_) &&
-      (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug)) {
+    bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug);
+    if (is_xpu_kp_support) {
expected_kernel_key.place_ = platform::XPUPlace();
      expected_kernel_key.library_type_ = paddle::framework::LibraryType::kKP;
      kernel_iter = kernels.find(expected_kernel_key);
      VLOG(3) << "using XPU KP kernel: " << op.Type()
              << ", using_kernel_key:" << expected_kernel_key;
    }
if (!is_xpu_kp_support &&
(kernel_iter == kernels.end() || is_xpu_unsupport)) {
VLOG(3) << "missing XPU kernel: " << op.Type()
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
...
@@ -341,7 +341,6 @@ void BuildDygraphPhiKernelContext(
  }
  for (size_t i = 0; i < attr_names.size(); ++i) {
VLOG(1) << "############## attr_name: " << i << " : " << attr_names[i];
    if (attr_defs[i].type_index == std::type_index(typeid(phi::ScalarArray))) {
      if (attrs.find(attr_names[i]) !=
          attrs.end()) {  // shape is in the attribute
...
@@ -1485,6 +1485,13 @@ REGISTER_ACTIVATION_OP(atanh, Atanh, AtanhFunctor, AtanhGradFunctor);
REGISTER_ACTIVATION_OP(brelu, BRelu, BReluFunctor, BReluGradFunctor);
REGISTER_ACTIVATION_OP(thresholded_relu, ThresholdedRelu,
                       ThresholdedReluFunctor, ThresholdedReluGradFunctor);
REGISTER_ACTIVATION_OP(hard_shrink, HardShrink, HardShrinkFunctor,
HardShrinkGradFunctor);
REGISTER_ACTIVATION_OP(softshrink, SoftShrink, SoftShrinkFunctor,
SoftShrinkGradFunctor);
REGISTER_ACTIVATION_OP(tanh_shrink, TanhShrink, TanhShrinkFunctor,
TanhShrinkGradFunctor);
REGISTER_ACTIVATION_OP(silu, Silu, SiluFunctor, SiluGradFunctor);
/* ==========================  sigmoid register  =============================
 */
@@ -1626,22 +1633,6 @@ REGISTER_OPERATOR(
    ops::ActivationOpDoubleGrad<ops::ELUGradFunctor<float>::FwdDeps()>,
    ops::ActivationDoubleGradOpInplaceInferer);
REGISTER_OP_CPU_KERNEL(elu,
ops::ActivationKernel<paddle::platform::CPUDeviceContext,
ops::ELUFunctor<float>>,
ops::ActivationKernel<paddle::platform::CPUDeviceContext,
ops::ELUFunctor<double>>);
REGISTER_OP_CPU_KERNEL(
elu_grad, ops::ELUGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::ELUGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
elu_grad_grad, ops::ELUDoubleGradKernel<plat::CPUDeviceContext,
ops::ELUGradGradFunctor<float>>,
ops::ELUDoubleGradKernel<plat::CPUDeviceContext,
ops::ELUGradGradFunctor<double>>,
ops::ELUDoubleGradKernel<plat::CPUDeviceContext,
ops::ELUGradGradFunctor<plat::float16>>);
/* ========================================================================== */
/* ========================    logit register     ============================
...
@@ -279,6 +279,15 @@ USE_PHI_FUNCTOR(BRelu)
USE_PHI_FUNCTOR(ThresholdedRelu)
USE_PHI_FUNCTOR(LeakyRelu)
USE_PHI_DOUBLE_GRAD_FUNCTOR(LeakyRelu)
USE_PHI_FUNCTOR(HardShrink)
USE_PHI_FUNCTOR(SoftShrink)
USE_PHI_FUNCTOR(TanhShrink)
USE_PHI_FUNCTOR(Silu)
USE_PHI_FUNCTOR(ELU)
USE_PHI_DOUBLE_GRAD_FUNCTOR(ELU)
template <typename T>
using ELUGradNegativeAlphaFunctor = phi::funcs::ELUGradNegativeAlphaFunctor<T>;
template <typename T>
struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
@@ -392,31 +401,6 @@ struct SigmoidTripleGradFunctor : public BaseActivationFunctor<T> {
  }
};
// silu(x) = x / (1 + exp(-x))
template <typename T>
struct SiluFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
auto temp = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp());
out.device(d) = x * temp;
}
};
// silu'(x) = (1 / (1 + e^{-x})) * (1 + out * e^{-x}))
template <typename T>
struct SiluGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto temp1 = static_cast<T>(1) + (-x).exp(); // 1+e^(-x)
auto temp2 = x * (-x).exp(); // x*e^(-x)
dx.device(d) = dout * ((static_cast<T>(1) / temp1) *
(static_cast<T>(1) + (temp2 / temp1)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// Originally: logsigmoid(x) = -log (1 + exp(-x))
// For numerical stability, we can use the log-sum-exp trick:
// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/
@@ -512,99 +496,6 @@ using ReluGradGradFunctor = phi::funcs::ReluGradGradFunctor<T>;
template <typename T>
using ReluCUDAFunctor = phi::funcs::ReluCUDAFunctor<T>;
// tanhshrink(x) = x - tanh(x)
// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
template <typename T>
struct TanhShrinkFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x - x.tanh();
}
};
template <typename T>
struct TanhShrinkGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * (x.tanh() * x.tanh());
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// tanhshrink(x) = x - tanh(x)
// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
template <typename T>
struct HardShrinkFunctor : public BaseActivationFunctor<T> {
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
auto temp1 = x < static_cast<T>(threshold * -1.f);
auto temp2 = x > static_cast<T>(threshold);
out.device(d) = x * (temp1 || temp2).template cast<T>();
}
};
template <typename T>
struct HardShrinkGradFunctor : public BaseActivationFunctor<T> {
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto temp1 = x < static_cast<T>(threshold * -1.f);
auto temp2 = x > static_cast<T>(threshold);
dx.device(d) = dout * (temp1 || temp2).template cast<T>();
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0
// otherwise
template <typename T>
struct SoftShrinkFunctor : public BaseActivationFunctor<T> {
float lambda;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"lambda", &lambda}};
}
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
auto lambdaT = static_cast<T>(lambda);
auto temp1 = (x > lambdaT).template cast<T>();
auto temp2 = (x < -lambdaT).template cast<T>();
out.device(d) = temp1 * (x - lambdaT) + temp2 * (x + lambdaT);
}
};
template <typename T>
struct SoftShrinkGradFunctor : public BaseActivationFunctor<T> {
float lambda;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"lambda", &lambda}};
}
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto lambdaT = static_cast<T>(lambda);
auto temp1 = (x > lambdaT).template cast<T>();
auto temp2 = (x < -lambdaT).template cast<T>();
dx.device(d) = dout * (temp1 + temp2).template cast<T>();
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// sqrt(x) = x^(1/2)
template <typename T>
struct SqrtFunctor : public BaseActivationFunctor<T> {
@@ -1036,59 +927,6 @@ struct SoftReluGradFunctor : public BaseActivationFunctor<T> {
  }
};
template <typename T>
struct ELUFunctor : public BaseActivationFunctor<T> {
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) =
(x < static_cast<T>(0))
.select(static_cast<T>(alpha) * (x.exp() - static_cast<T>(1)), x);
}
};
template <typename T>
struct ELUGradFunctor : public BaseActivationFunctor<T> {
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
// case 1: alpha >= 0
// dx = dout, if out > 0
// dx = dout * (out + alpha), if out <= 0
dx.device(d) = (out > static_cast<T>(0))
.select(dout, dout * (out + static_cast<T>(alpha)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct ELUGradNegativeAlphaFunctor : public BaseActivationFunctor<T> {
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
// case 2: alpha < 0
// dx = dout, if x > 0
// dx = dout * (out + alpha), if x <=0
dx.device(d) = (x > static_cast<T>(0))
.select(dout, dout * static_cast<T>(alpha) * x.exp());
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename DeviceContext, typename T>
class ELUGradKernel : public framework::OpKernel<T> {
 public:
@@ -1354,44 +1192,6 @@ struct AbsGradGradFunctor : public BaseActivationFunctor<T> {
  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct ELUGradGradFunctor : public BaseActivationFunctor<T> {
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
template <typename Device>
void operator()(const Device& dev, const framework::Tensor* X,
const framework::Tensor* ddX, framework::Tensor* ddOut,
const framework::Tensor* dOut, framework::Tensor* dX) const {
auto* d = dev.eigen_device();
auto ddx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddX, "Input", "DDX", "ELUGradGrad"));
auto x = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(X, "Input", "X", "ELUGradGrad"));
if (dX) {
auto dx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(dX, "Output", "DX", "ELUGradGrad"));
auto dout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(dOut, "Output", "DOut", "ELUGradGrad"));
dx.device(*d) = ddx * dout * static_cast<T>(alpha) * x.exp() *
(x <= static_cast<T>(0)).template cast<T>();
}
if (ddOut) {
auto ddout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ELUGradGrad"));
ddout.device(*d) = ddx *
((x > static_cast<T>(0)).template cast<T>() +
static_cast<T>(alpha) * x.exp() *
(x <= static_cast<T>(0)).template cast<T>())
.template cast<T>();
}
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CELUGradGradFunctor : public BaseActivationFunctor<T> {
  float alpha;
@@ -2152,9 +1952,7 @@ struct LogGradGradFunctor : public BaseActivationFunctor<T> {
}  // namespace paddle
#define FOR_EACH_ACTIVATION_OP(__macro)                                       \
__macro(silu, Silu, SiluFunctor, SiluGradFunctor); \
  __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor);  \
__macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \
  __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor);                          \
  __macro(floor, Floor, FloorFunctor, ZeroGradFunctor);                       \
  __macro(round, Round, RoundFunctor, ZeroGradFunctor);                       \
@@ -2167,8 +1965,6 @@ struct LogGradGradFunctor : public BaseActivationFunctor<T> {
  __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor);          \
  __macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor);          \
  __macro(relu6, Relu6, Relu6Functor, Relu6GradFunctor);                      \
__macro(tanh_shrink, TanhShrink, TanhShrinkFunctor, TanhShrinkGradFunctor); \
__macro(hard_shrink, HardShrink, HardShrinkFunctor, HardShrinkGradFunctor); \
  __macro(hard_sigmoid, HardSigmoid, HardSigmoidFunctor,                      \
          HardSigmoidGradFunctor);                                            \
  __macro(swish, Swish, SwishFunctor, SwishGradFunctor);                      \
...
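For reference while reading the removals above: the deleted functors (now provided by phi) implement simple elementwise formulas. hardshrink zeroes values inside [-threshold, threshold], softshrink additionally shifts the surviving values toward zero by lambda, and tanhshrink(x) = x - tanh(x). A small host-side sketch of those forward formulas, matching the math of the removed functors rather than the phi implementation itself:

#include <cmath>
#include <cstdio>

// hardshrink(x) = x if |x| > threshold, else 0
double hard_shrink(double x, double threshold) {
  return (x < -threshold || x > threshold) ? x : 0.0;
}

// softshrink(x) = x - lambda if x > lambda; x + lambda if x < -lambda; else 0
double soft_shrink(double x, double lambda) {
  if (x > lambda) return x - lambda;
  if (x < -lambda) return x + lambda;
  return 0.0;
}

// tanhshrink(x) = x - tanh(x)
double tanh_shrink(double x) { return x - std::tanh(x); }

int main() {
  for (double x : {-2.0, -0.3, 0.0, 0.3, 2.0}) {
    std::printf("x=% .1f hard=% .3f soft=% .3f tanhshrink=% .3f\n", x,
                hard_shrink(x, 0.5), soft_shrink(x, 0.5), tanh_shrink(x));
  }
}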
@@ -44,35 +44,6 @@ struct CudaSigmoidGradFunctor : public BaseActivationFunctor<T> {
  }
};
template <typename T>
struct CudaSiluFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
MPType one = static_cast<MPType>(1.0f);
// silu(x) = x / (1 + exp(-x))
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(x / (one + exp(-x)));
}
};
template <typename T>
struct CudaSiluGradFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
MPType one = static_cast<MPType>(1.0f);
// dx = dout * (1 + exp(-x) + x * exp(-x) / (1 + exp(-x))^2)
__device__ __forceinline__ T operator()(const T arg_dout,
const T arg_x) const {
MPType dout = static_cast<MPType>(arg_dout);
MPType x = static_cast<MPType>(arg_x);
MPType temp = one / (one + exp(-x));
return static_cast<T>(dout * (temp * (one + x * (one - temp))));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaLogSigmoidFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
@@ -110,43 +81,6 @@ struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor<T> {
  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaSoftShrinkFunctor : public BaseActivationFunctor<T> {
float lambda;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"lambda", &lambda}};
}
// softshrink(x) = x - lambda, if x > lambda;
// x + lambda, if x < -lambda;
// 0, otherwise.
__device__ __forceinline__ T operator()(const T x) const {
T l = static_cast<T>(lambda);
T temp1 = static_cast<T>(x > l);
T temp2 = static_cast<T>(x < -l);
return temp1 * (x - l) + temp2 * (x + l);
}
};
template <typename T>
struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float lambda;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"lambda", &lambda}};
}
// dx = dout, if x > lambda or x < -lambda else 0
__device__ __forceinline__ T operator()(const T dout, const T x) const {
T l = static_cast<T>(lambda);
return (x >= -l && x <= l) ? zero : dout;
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaCeilFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
@@ -615,66 +549,6 @@ struct CudaRelu6GradFunctor : public BaseActivationFunctor<T> {
  }
};
template <typename T>
struct CudaTanhShrinkFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
// tanhshrink(x) = x - tanh(x)
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(x - tanh(x));
}
};
template <typename T>
struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
// dx = dout * tanh(x)^2
__device__ __forceinline__ T operator()(const T arg_dout,
const T arg_x) const {
MPType dout = static_cast<MPType>(arg_dout);
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(dout * tanh(x) * tanh(x));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaHardShrinkFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
// hadrshrink(x) = (x > -threshold && x < threshold) ? 0 : x
__device__ __forceinline__ T operator()(const T x) const {
T t = static_cast<T>(threshold);
return (x > -t && x < t) ? zero : x;
}
};
template <typename T>
struct CudaHardShrinkGradFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
// dx = (x > -threshold && x < threshold) ? 0 : dout
__device__ __forceinline__ T operator()(const T dout, const T x) const {
T t = static_cast<T>(threshold);
return (x > -t && x < t) ? zero : dout;
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaHardSigmoidFunctor : public BaseActivationFunctor<T> {
  T zero = static_cast<T>(0.0f);
@@ -863,110 +737,6 @@ struct CudaHardSwishGradFunctor : public BaseActivationFunctor<T> {
  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaELUFunctor : public BaseActivationFunctor<T> {
using CT = typename details::MPTypeTrait<T>::Type;
CT zero = static_cast<CT>(0.0f);
CT one = static_cast<CT>(1.0f);
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
// elu(x) = x, if x > 0
// elu(x) = alpha * (e^x - 1), if x <= 0
__device__ __forceinline__ T operator()(const T arg_x) const {
CT x = static_cast<CT>(arg_x);
CT temp = static_cast<CT>(alpha) * (exp(x) - one);
CT res = x > zero ? x : temp;
return static_cast<T>(res);
}
};
template <typename T>
struct CudaELUGradFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
MPType zero = static_cast<MPType>(0.0f);
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
// case 1: alpha >= 0
// dx = dout, if out > 0
// dx = dout * (out + alpha), if out <= 0
__device__ __forceinline__ T operator()(T arg_dout, T arg_out) const {
MPType dout = static_cast<MPType>(arg_dout);
MPType out = static_cast<MPType>(arg_out);
MPType a = static_cast<MPType>(alpha);
MPType out_pos = static_cast<MPType>(out > zero);
MPType out_neg = static_cast<MPType>(out <= zero);
return static_cast<T>(dout * (out_pos + out_neg * (out + a)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
template <typename T>
struct CudaELUGradNegativeAlphaFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
MPType zero = static_cast<MPType>(0.0f);
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
// case 2: alpha < 0
// dx = dout, if x > 0
// dx = dout * (out + alpha), if x <=0
__device__ __forceinline__ T operator()(const T arg_dout, const T arg_out,
const T arg_x) const {
MPType dout = static_cast<MPType>(arg_dout);
MPType out = static_cast<MPType>(arg_out);
MPType x = static_cast<MPType>(arg_x);
MPType a = static_cast<MPType>(alpha);
MPType x_pos = static_cast<MPType>(x > zero);
MPType x_neg = static_cast<MPType>(x <= zero);
return static_cast<T>(dout * (x_pos + x_neg * (out + a)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename DeviceContext, typename T>
class ELUGradCudaKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* out = ctx.Input<framework::Tensor>("Out");
auto* x = ctx.Input<framework::Tensor>("X");
auto* d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
d_x->mutable_data<T>(ctx.GetPlace());
const float alpha = ctx.Attr<float>("alpha");
auto& dev_ctx = ctx.device_context<DeviceContext>();
std::vector<const framework::Tensor*> ins = {d_out, out};
std::vector<framework::Tensor*> outs = {d_x};
if (alpha > 0) {
CudaELUGradFunctor<T> functor;
functor.alpha = alpha;
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
&outs, functor);
} else {
CudaELUGradNegativeAlphaFunctor<T> functor;
functor.alpha = alpha;
ins.push_back(x);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
&outs, functor);
}
}
};
template <typename T>
struct CudaCELUFunctor : public BaseActivationFunctor<T> {
  using CT = typename details::MPTypeTrait<T>::Type;
@@ -1099,6 +869,15 @@ USE_PHI_FUNCTOR(CudaTanh)
USE_PHI_FUNCTOR(CudaBRelu)
USE_PHI_FUNCTOR(CudaLeakyRelu)
USE_PHI_FUNCTOR(CudaThresholdedRelu)
USE_PHI_FUNCTOR(CudaHardShrink)
USE_PHI_FUNCTOR(CudaSoftShrink)
USE_PHI_FUNCTOR(CudaTanhShrink)
USE_PHI_FUNCTOR(CudaSilu)
USE_PHI_FUNCTOR(CudaELU)
template <typename T>
using CudaELUGradNegativeAlphaFunctor =
phi::funcs::CudaELUGradNegativeAlphaFunctor<T>;
}  // namespace operators
}  // namespace paddle
@@ -1158,26 +937,6 @@ namespace plat = paddle::platform;
      ops::ActivationGradCudaKernel<plat::CUDADeviceContext,                  \
                                    ops::grad_functor<plat::bfloat16>>);
/* ======================== elu register ============================ */
REGISTER_OP_CUDA_KERNEL(
elu, ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,
ops::CudaELUFunctor<float>>,
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,
ops::CudaELUFunctor<double>>,
ops::ActivationCudaKernel<plat::CUDADeviceContext,
ops::CudaELUFunctor<plat::float16>>);
REGISTER_OP_CUDA_KERNEL(
elu_grad, ops::ELUGradCudaKernel<plat::CUDADeviceContext, float>,
ops::ELUGradCudaKernel<plat::CUDADeviceContext, double>,
ops::ELUGradCudaKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
elu_grad_grad, ops::ELUDoubleGradKernel<plat::CUDADeviceContext,
ops::ELUGradGradFunctor<float>>,
ops::ELUDoubleGradKernel<plat::CUDADeviceContext,
ops::ELUGradGradFunctor<double>>,
ops::ELUDoubleGradKernel<plat::CUDADeviceContext,
ops::ELUGradGradFunctor<plat::float16>>);
/* ========================================================================== */
/* ========================    celu register  ============================ */
@@ -1359,7 +1118,6 @@ REGISTER_OP_CUDA_KERNEL(
/* ========================================================================== */
#define FOR_EACH_ACTIVATION_CUDA_OP(__macro)                                  \
__macro(silu, Silu, CudaSiluFunctor, CudaSiluGradFunctor); \
  __macro(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor,                      \
          CudaLogSigmoidGradFunctor);                                         \
  __macro(softshrink, SoftShrink, CudaSoftShrinkFunctor,                      \
...
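The removed CUDA functors encode silu(x) = x * sigmoid(x) and its derivative sigmoid(x) * (1 + x * (1 - sigmoid(x))), which is the factor CudaSiluGradFunctor multiplies dout by. A quick host-side numerical check of that derivative formula (plain C++, independent of the CUDA code):

#include <cmath>
#include <cstdio>

double sigmoid(double x) { return 1.0 / (1.0 + std::exp(-x)); }

// silu(x) = x / (1 + exp(-x)) = x * sigmoid(x)
double silu(double x) { return x * sigmoid(x); }

// Analytic derivative used by the (removed) grad functor:
// d/dx silu(x) = sigmoid(x) * (1 + x * (1 - sigmoid(x)))
double silu_grad(double x) {
  double s = sigmoid(x);
  return s * (1.0 + x * (1.0 - s));
}

int main() {
  const double eps = 1e-6;
  for (double x : {-3.0, -0.5, 0.0, 0.5, 3.0}) {
    double numeric = (silu(x + eps) - silu(x - eps)) / (2 * eps);
    std::printf("x=% .1f analytic=% .6f numeric=% .6f\n", x, silu_grad(x),
                numeric);
  }
}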
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle { namespace paddle {
...@@ -20,6 +21,8 @@ namespace operators { ...@@ -20,6 +21,8 @@ namespace operators {
template <typename T> template <typename T>
class MLUBatchNormOpKernel : public framework::OpKernel<T> { class MLUBatchNormOpKernel : public framework::OpKernel<T> {
using MPDType = typename details::MPTypeTrait<T>::Type;
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    const auto &place = ctx.GetPlace();
@@ -68,10 +71,10 @@ class MLUBatchNormOpKernel : public framework::OpKernel<T> {
    // alloc memory
    y->mutable_data<T>(place);
-    mean_out->mutable_data<T>(place);
-    variance_out->mutable_data<T>(place);
-    saved_mean->mutable_data<T>(place);
-    saved_variance->mutable_data<T>(place);
+    mean_out->mutable_data<MPDType>(place);
+    variance_out->mutable_data<MPDType>(place);
+    saved_mean->mutable_data<MPDType>(place);
+    saved_variance->mutable_data<MPDType>(place);
    Tensor transformed_x;
    Tensor transformed_y;
@@ -132,6 +135,8 @@ class MLUBatchNormOpKernel : public framework::OpKernel<T> {
template <typename T>
class MLUBatchNormGradOpKernel : public framework::OpKernel<T> {
using MPDType = typename details::MPTypeTrait<T>::Type;
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    const auto *x = ctx.Input<Tensor>("X");
@@ -154,10 +159,10 @@ class MLUBatchNormGradOpKernel : public framework::OpKernel<T> {
    auto &dev_ctx = ctx.template device_context<MLUDeviceContext>();
    auto d_x_tmp =
        ctx.AllocateTmpTensor<T, MLUDeviceContext>(x->dims(), dev_ctx);
-    auto scale_grad_tmp =
-        ctx.AllocateTmpTensor<T, MLUDeviceContext>(scale->dims(), dev_ctx);
-    auto bias_grad_tmp =
-        ctx.AllocateTmpTensor<T, MLUDeviceContext>(bias->dims(), dev_ctx);
+    auto scale_grad_tmp = ctx.AllocateTmpTensor<MPDType, MLUDeviceContext>(
+        scale->dims(), dev_ctx);
+    auto bias_grad_tmp =
+        ctx.AllocateTmpTensor<MPDType, MLUDeviceContext>(bias->dims(), dev_ctx);
    if (d_x == nullptr) {
      d_x = &d_x_tmp;
@@ -171,8 +176,8 @@ class MLUBatchNormGradOpKernel : public framework::OpKernel<T> {
    const auto &place = ctx.GetPlace();
    d_x->mutable_data<T>(place);
-    d_scale->mutable_data<T>(place);
-    d_bias->mutable_data<T>(place);
+    d_scale->mutable_data<MPDType>(place);
+    d_bias->mutable_data<MPDType>(place);
    use_global_stats = is_test || use_global_stats;
...
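The batch-norm kernels now allocate the statistics (mean/variance/saved_*) and the scale/bias gradients as MPDType, the "master precision" type that MPTypeTrait maps float16 to float, so accumulation stays in fp32 even when the data is fp16. A minimal sketch of such a trait, for illustration only (Paddle's real trait lives in fp16_type_traits.h):

#include <cstdint>
#include <type_traits>

// Stand-in for a 16-bit float payload; only the storage width matters here.
struct float16 { uint16_t bits; };

// Map a kernel's data type T to the type used for accumulations/statistics:
// everything maps to itself except float16, which is widened to float.
template <typename T>
struct MPTypeTrait {
  using Type = T;
};
template <>
struct MPTypeTrait<float16> {
  using Type = float;
};

template <typename T>
void allocate_batch_norm_buffers() {
  using MPDType = typename MPTypeTrait<T>::Type;
  static_assert(sizeof(MPDType) >= sizeof(T), "stats never lose precision");
  // In the kernel: y stays T, but mean/variance/saved_* and d_scale/d_bias
  // would be allocated as MPDType, as in the MLU diff above.
}

int main() {
  allocate_batch_norm_buffers<float>();    // MPDType == float
  allocate_batch_norm_buffers<float16>();  // MPDType == float (widened)
  static_assert(std::is_same<MPTypeTrait<float16>::Type, float>::value, "");
}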
@@ -12,8 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
namespace operators {
@@ -21,14 +23,6 @@ namespace operators {
class CumprodOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Cumprod");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Cumprod");
ctx->ShareDim("X", "Out");
ctx->ShareLoD("X", "Out");
}
};
class CumprodOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -82,9 +76,12 @@ class CumprodGradOp : public framework::OperatorWithKernel {
}  // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(cumprod, CumprodInferShapeFunctor,
PD_INFER_META(phi::UnchangedInferMeta));
REGISTER_OPERATOR(cumprod, ops::CumprodOp, ops::CumprodOpMaker,
                  ops::CumprodGradOpMaker<paddle::framework::OpDesc>,
-                 ops::CumprodGradOpMaker<paddle::imperative::OpBase>);
+                 ops::CumprodGradOpMaker<paddle::imperative::OpBase>,
CumprodInferShapeFunctor);
REGISTER_OPERATOR(cumprod_grad, ops::CumprodGradOp);
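The cumprod change drops the hand-written InferShape, which only shared X's dims and LoD with Out, and registers phi::UnchangedInferMeta through DECLARE_INFER_SHAPE_FUNCTOR instead. Conceptually an "unchanged" infer-meta just copies the input's metadata to the output; a toy sketch with made-up MetaTensor fields:

#include <cassert>
#include <cstdint>
#include <vector>

// Minimal stand-ins for the metadata carried by a MetaTensor.
struct MetaTensor {
  std::vector<int64_t> dims;
  int dtype = 0;  // encoded dtype id; irrelevant to the shapes here
};

// The essence of an "unchanged" infer meta: output metadata == input metadata.
// cumprod keeps the full input shape because it scans along one axis.
void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out) {
  out->dims = x.dims;
  out->dtype = x.dtype;
}

int main() {
  MetaTensor x{{4, 5, 6}, /*dtype=*/1};
  MetaTensor out;
  UnchangedInferMeta(x, &out);
  assert(out.dims == x.dims && out.dtype == x.dtype);
}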
@@ -15,9 +15,14 @@ limitations under the License. */
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/backward.h"
#include "paddle/phi/infermeta/binary.h"
namespace paddle {
namespace operators {
@@ -26,58 +31,6 @@ class GatherOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
platform::errors::InvalidArgument(
"Input(X) of GatherOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true,
platform::errors::InvalidArgument(
"Input(Index) of GatherOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
platform::errors::InvalidArgument(
"Output(Out) of GatherOp should not be null."));
auto index_dims = ctx->GetInputDim("Index");
if (index_dims.size() == 2) {
PADDLE_ENFORCE_EQ(
index_dims[1], 1,
platform::errors::InvalidArgument(
"The last dim of index should be 1 when it is 2D, but we get %d",
index_dims[1]));
} else {
PADDLE_ENFORCE_EQ(
index_dims.size(), 1,
platform::errors::InvalidArgument(
"The index should be 1D, when it is not 2D, but we get %d",
index_dims.size()));
}
auto axis = ctx->Attrs().Get<int>("axis");
auto input_dim = ctx->GetInputDim("X");
if (ctx->HasInput("Axis") || axis == 0) {
// if HasInput("Axis"), we can not obtain correct shape of output
int batch_size = index_dims[0];
framework::DDim output_dims(input_dim);
output_dims[0] = batch_size;
ctx->SetOutputDim("Out", output_dims);
ctx->ShareLoD("X", /*->*/ "Out");
} else {
int index_size = index_dims[0];
std::vector<int> out_dim_vec;
for (int i = 0; i < axis; i++) {
out_dim_vec.push_back(input_dim[i]);
}
out_dim_vec.push_back(index_size);
for (int i = axis + 1; i < input_dim.size(); i++) {
out_dim_vec.push_back(input_dim[i]);
}
auto output_dims = phi::make_ddim(out_dim_vec);
ctx->SetOutputDim("Out", output_dims);
ctx->ShareLoD("X", /*->*/ "Out");
}
}
 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
@@ -100,11 +53,6 @@ class GatherGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*-->*/ framework::GradVarName("X"));
}
 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
@@ -193,11 +141,17 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(GatherGradNoNeedBufferVarInferer, "X");
}  // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(gather, GatherInferShapeFunctor,
PD_INFER_META(phi::GatherInferMeta));
REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker,
                  ops::GatherGradOpMaker<paddle::framework::OpDesc>,
-                 ops::GatherGradOpMaker<paddle::imperative::OpBase>);
+                 ops::GatherGradOpMaker<paddle::imperative::OpBase>,
GatherInferShapeFunctor);
DECLARE_INFER_SHAPE_FUNCTOR(gather_grad, GatherGradInferShapeFunctor,
PD_INFER_META(phi::GeneralUnaryGradInferMeta));
REGISTER_OPERATOR(gather_grad, ops::GatherGradOp,
-                 ops::GatherGradNoNeedBufferVarInferer);
+                 ops::GatherGradNoNeedBufferVarInferer,
GatherGradInferShapeFunctor);
REGISTER_OP_VERSION(gather)
    .AddCheckpoint(R"ROC(upgrad gather, add a new input [Axis])ROC",
...
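The InferShape body removed from GatherOp derived the output shape from X, Index, and axis: when Axis is a runtime input (or axis == 0) it keeps X's shape with dim 0 replaced by the number of indices, otherwise it replaces dim `axis`. That rule now lives in phi::GatherInferMeta; restated as a small hypothetical helper:

#include <cassert>
#include <cstdint>
#include <vector>

// Shape rule from the removed GatherOp::InferShape (illustrative helper,
// not phi::GatherInferMeta): when the axis is only known at runtime
// (has_axis_input) or axis == 0, replace dim 0 of x with the number of
// indices; otherwise replace dim `axis`.
std::vector<int64_t> GatherOutShape(const std::vector<int64_t>& x_dims,
                                    int64_t index_size, int axis,
                                    bool has_axis_input) {
  std::vector<int64_t> out = x_dims;
  if (has_axis_input || axis == 0) {
    out[0] = index_size;
  } else {
    out[axis] = index_size;
  }
  return out;
}

int main() {
  // x: [8, 16, 32], gather 5 rows along axis 0 -> [5, 16, 32]
  assert((GatherOutShape({8, 16, 32}, 5, 0, false) ==
          std::vector<int64_t>{5, 16, 32}));
  // x: [8, 16, 32], gather 5 slices along axis 1 -> [8, 5, 32]
  assert((GatherOutShape({8, 16, 32}, 5, 1, false) ==
          std::vector<int64_t>{8, 5, 32}));
}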
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/grid_sampler_op.h"
#include <memory>
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
@@ -229,15 +229,6 @@ REGISTER_OPERATOR(grid_sampler, ops::GridSampleOp, ops::GridSampleOpMaker,
                  ops::GridSampleGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(grid_sampler_grad, ops::GridSampleOpGrad);
REGISTER_OP_CPU_KERNEL(
grid_sampler,
ops::GridSampleOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::GridSampleOpKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
grid_sampler_grad,
ops::GridSampleGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::GridSampleGradOpKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_VERSION(grid_sampler)
    .AddCheckpoint(
        R"ROC(
...
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/grid_sampler_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
static __forceinline__ __device__ bool in_bounds(int h, int w, int H, int W) {
return h >= 0 && h < H && w >= 0 && w < W;
}
template <typename T>
static __forceinline__ __device__ void atomic_add(T* data, int h, int w, int sH,
int sW, int H, int W,
T delta) {
if (in_bounds(h, w, H, W)) {
platform::CudaAtomicAdd(data + h * sH + w * sW, delta);
}
}
template <typename T>
static __forceinline__ __device__ T _unnormalize(T coord, int size,
bool align_corners) {
if (align_corners) {
return ((coord + 1.f) / 2) * (size - 1);
} else {
return ((coord + 1.f) * size - 1) / 2;
}
}
template <typename T>
static __forceinline__ __device__ T clip_indexes(T in, int max_value) {
return min(static_cast<T>(max_value), max(in, static_cast<T>(0)));
}
template <typename T>
static __forceinline__ __device__ T reflect_indexes(T in, int twice_low,
int twice_high) {
if (twice_low == twice_high) {
return static_cast<T>(0);
}
T min = static_cast<T>(twice_low) / 2;
T span = static_cast<T>(twice_high - twice_low) / 2;
in = fabs(in - min);
T extra = fmod(in, span);
int flips = static_cast<int>(floor(in / span));
if (flips % 2 == 0) {
return extra + min;
} else {
return span - extra + min;
}
}
template <typename T>
static __forceinline__ __device__ T compute_positions(T coord, int size,
PaddingMode padding_mode,
bool align_corners) {
coord = _unnormalize<T>(coord, size, align_corners);
if (padding_mode == PaddingMode::border) {
coord = clip_indexes(coord, size - 1);
} else if (padding_mode == PaddingMode::reflect) {
if (align_corners) {
coord = reflect_indexes(coord, 0, 2 * (size - 1));
} else {
coord = reflect_indexes(coord, -1, 2 * size - 1);
}
coord = clip_indexes(coord, size - 1);
}
return coord;
}
template <typename T>
static __forceinline__ __device__ T _unnormalize_with_mask(T coord, int size,
bool align_corners,
T* grad_in) {
if (align_corners) {
*grad_in = static_cast<T>(size - 1) / 2;
return ((coord + 1.f) / 2) * (size - 1);
} else {
*grad_in = static_cast<T>(size) / 2;
return ((coord + 1.f) * size - 1) / 2;
}
}
template <typename T>
static __forceinline__ __device__ T clip_indexes_with_mask(T in, int clip_limit,
T* grad_in) {
if (in <= static_cast<T>(0)) {
*grad_in = static_cast<T>(0);
return static_cast<T>(0);
} else {
T max = static_cast<T>(clip_limit - 1);
if (in >= max) {
*grad_in = static_cast<T>(0);
return max;
} else {
*grad_in = static_cast<T>(1);
return in;
}
}
}
template <typename T>
static __forceinline__ __device__ T
reflect_indexes_with_mask(T in, int twice_low, int twice_high, T* grad_in) {
if (twice_low == twice_high) {
*grad_in = static_cast<T>(0);
return static_cast<T>(0);
}
int grad_in_mult_;
T min = static_cast<T>(twice_low) / 2;
T span = static_cast<T>(twice_high - twice_low) / 2;
in = in - min;
if (in < static_cast<T>(0)) {
grad_in_mult_ = -1;
in = -in;
} else {
grad_in_mult_ = 1;
}
T extra = fmod(in, span);
int flips = static_cast<int>(floor(in / span));
if (flips % 2 == 0) {
*grad_in = static_cast<T>(grad_in_mult_);
return extra + min;
} else {
*grad_in = static_cast<T>(-grad_in_mult_);
return span - extra + min;
}
}
template <typename T>
static __forceinline__ __device__ T
compute_positions_with_mask(T coord, int size, PaddingMode padding_mode,
bool align_corners, T* grad_in) {
T grad_clip, grad_refl;
coord = _unnormalize_with_mask<T>(coord, size, align_corners, grad_in);
if (padding_mode == PaddingMode::border) {
coord = clip_indexes_with_mask(coord, size, &grad_clip);
*grad_in = (*grad_in) * grad_clip;
} else if (padding_mode == PaddingMode::reflect) {
if (align_corners) {
coord = reflect_indexes_with_mask(coord, 0, 2 * (size - 1), &grad_refl);
} else {
coord = reflect_indexes_with_mask(coord, -1, 2 * size - 1, &grad_refl);
}
coord = clip_indexes_with_mask(coord, size, &grad_clip);
*grad_in = (*grad_in) * grad_refl * grad_clip;
}
return coord;
}
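// Forward sampling kernel: one thread handles one (n, h_out, w_out) location
// across all channels. Bilinear mode blends the four neighbouring input
// pixels (nw/ne/sw/se) with weights equal to the opposite corner's area;
// nearest mode rounds to the closest pixel. Taps that fall outside the input
// contribute zero.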
template <typename T>
__global__ void grid_sample_cuda_kernel(const int nthreads, int n, int out_c,
int out_h, int out_w, int in_h,
int in_w, const T* input, const T* grid,
T* output, const Mode mode,
const PaddingMode padding_mode,
bool align_corners) {
int inp_sN = out_c * in_h * in_w;
int inp_sC = in_h * in_w;
int inp_sH = in_w;
int inp_sW = 1;
int grid_sN = out_h * out_w * 2;
int grid_sH = out_w * 2;
int grid_sW = 2;
int grid_sCoor = 1;
int out_sN = out_c * out_h * out_w;
int out_sC = out_h * out_w;
int out_sH = out_w;
int out_sW = 1;
CUDA_KERNEL_LOOP(index, nthreads) {
const int w = index % out_w;
const int h = (index / out_w) % out_h;
const int n = index / (out_h * out_w);
const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
T ix = grid[grid_offset];
T iy = grid[grid_offset + grid_sCoor];
ix = compute_positions(ix, in_w, padding_mode, align_corners);
iy = compute_positions(iy, in_h, padding_mode, align_corners);
if (mode == Mode::bilinear) {
int ix_nw = static_cast<int>(floor(ix));
int iy_nw = static_cast<int>(floor(iy));
int ix_ne = ix_nw + 1;
int iy_ne = iy_nw;
int ix_sw = ix_nw;
int iy_sw = iy_nw + 1;
int ix_se = ix_nw + 1;
int iy_se = iy_nw + 1;
T nw = (ix_se - ix) * (iy_se - iy);
T ne = (ix - ix_sw) * (iy_sw - iy);
T sw = (ix_ne - ix) * (iy - iy_ne);
T se = (ix - ix_nw) * (iy - iy_nw);
auto inp_offset_NC = n * inp_sN;
auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
for (int c = 0; c < out_c;
++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) {
*out_ptr_NCHW = static_cast<T>(0);
if (in_bounds(iy_nw, ix_nw, in_h, in_w)) {
*out_ptr_NCHW +=
input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW] * nw;
}
if (in_bounds(iy_ne, ix_ne, in_h, in_w)) {
*out_ptr_NCHW +=
input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW] * ne;
}
if (in_bounds(iy_sw, ix_sw, in_h, in_w)) {
*out_ptr_NCHW +=
input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW] * sw;
}
if (in_bounds(iy_se, ix_se, in_h, in_w)) {
*out_ptr_NCHW +=
input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW] * se;
}
}
} else if (mode == Mode::nearest) {
int ix_nearest = static_cast<int>(std::nearbyint(ix));
int iy_nearest = static_cast<int>(std::nearbyint(iy));
auto inp_offset_NC = n * inp_sN;
auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
for (int c = 0; c < out_c;
++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) {
if (in_bounds(iy_nearest, ix_nearest, in_h, in_w)) {
*out_ptr_NCHW =
input[inp_offset_NC + iy_nearest * inp_sH + ix_nearest * inp_sW];
} else {
*out_ptr_NCHW = static_cast<T>(0);
}
}
}
}
}
template <typename T>
class GridSampleOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.cuda_device_context();
auto align_corners = ctx.Attr<bool>("align_corners");
auto padding_mode_s = ctx.Attr<std::string>("padding_mode");
auto mode_s = ctx.Attr<std::string>("mode");
PaddingMode padding_mode;
Mode mode;
if (padding_mode_s == "border") {
padding_mode = PaddingMode::border;
} else if (padding_mode_s == "reflection") {
padding_mode = PaddingMode::reflect;
} else {
padding_mode = PaddingMode::zeros;
}
if (mode_s == "nearest") {
mode = Mode::nearest;
} else {
mode = Mode::bilinear;
}
auto* input = ctx.Input<Tensor>("X");
auto* grid = ctx.Input<Tensor>("Grid");
const int n = grid->dims()[0];
const int out_h = grid->dims()[1];
const int out_w = grid->dims()[2];
const int c = input->dims()[1];
const int in_h = input->dims()[2];
const int in_w = input->dims()[3];
VLOG(3) << "n: " << n << "; c: " << c << "; out_h: " << out_h
<< "; out_w: " << out_w;
auto* output = ctx.Output<Tensor>("Output");
auto* output_data = output->mutable_data<T>(ctx.GetPlace());
VLOG(3) << "out dims: " << output->dims()[0] << "; " << output->dims()[1]
<< "; " << output->dims()[2] << "; " << output->dims()[3];
int count = static_cast<int>(n * out_h * out_w);
auto cu_stream = dev_ctx.stream();
platform::GpuLaunchConfig config =
platform::GetGpuLaunchConfig1D(dev_ctx, count);
grid_sample_cuda_kernel<
T><<<config.block_per_grid, config.thread_per_block, 0, cu_stream>>>(
count, n, c, out_h, out_w, in_h, in_w, input->data<T>(),
grid->data<T>(), output_data, mode, padding_mode, align_corners);
}
};
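// Backward kernel: grad_input is accumulated with CudaAtomicAdd because
// threads from different output locations may scatter into the same input
// pixel; grad_grid (when requested) chains the bilinear-weight derivatives
// through the gix_mult/giy_mult factors produced by
// compute_positions_with_mask. grad_grid may be nullptr when no gradient for
// Grid is needed.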
template <typename T>
__global__ void grid_sampler_cuda_backward_kernel(
const int nthreads, const T* grad_output, const T* input, const T* grid,
int n, int out_c, int out_h, int out_w, int in_h, int in_w, T* grad_input,
T* grad_grid, const Mode mode, const PaddingMode padding_mode,
bool align_corners) {
int inp_sN = out_c * in_h * in_w;
int inp_sC = in_h * in_w;
int inp_sH = in_w;
int inp_sW = 1;
int grid_sN = out_h * out_w * 2;
int grid_sH = out_w * 2;
int grid_sW = 2;
int grid_sCoor = 1;
int gOut_sN = out_c * out_h * out_w;
int gOut_sC = out_h * out_w;
int gOut_sH = out_w;
int gOut_sW = 1;
CUDA_KERNEL_LOOP(index, nthreads) {
const int w = index % out_w;
const int h = (index / out_w) % out_h;
const int n = index / (out_h * out_w);
const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
T ix = grid[grid_offset];
T iy = grid[grid_offset + grid_sCoor];
T gix_mult, giy_mult;
ix = compute_positions_with_mask(ix, in_w, padding_mode, align_corners,
&gix_mult);
iy = compute_positions_with_mask(iy, in_h, padding_mode, align_corners,
&giy_mult);
if (mode == Mode::bilinear) {
int ix_nw = static_cast<int>(floor(ix));
int iy_nw = static_cast<int>(floor(iy));
int ix_ne = ix_nw + 1;
int iy_ne = iy_nw;
int ix_sw = ix_nw;
int iy_sw = iy_nw + 1;
int ix_se = ix_nw + 1;
int iy_se = iy_nw + 1;
T nw = (ix_se - ix) * (iy_se - iy);
T ne = (ix - ix_sw) * (iy_sw - iy);
T sw = (ix_ne - ix) * (iy - iy_ne);
T se = (ix - ix_nw) * (iy - iy_nw);
T gix = static_cast<T>(0), giy = static_cast<T>(0);
int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW;
T* gInp_ptr_NC = grad_input + n * inp_sN;
int inp_offset_NC = n * inp_sN;
for (int c = 0; c < out_c; ++c, inp_offset_NC += inp_sC,
gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) {
T gOut = grad_output[gOut_offset];
atomic_add(gInp_ptr_NC, iy_nw, ix_nw, inp_sH, inp_sW, in_h, in_w,
nw * gOut);
atomic_add(gInp_ptr_NC, iy_ne, ix_ne, inp_sH, inp_sW, in_h, in_w,
ne * gOut);
atomic_add(gInp_ptr_NC, iy_sw, ix_sw, inp_sH, inp_sW, in_h, in_w,
sw * gOut);
atomic_add(gInp_ptr_NC, iy_se, ix_se, inp_sH, inp_sW, in_h, in_w,
se * gOut);
if (in_bounds(iy_nw, ix_nw, in_h, in_w)) {
T nw_val = input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW];
gix -= nw_val * (iy_se - iy) * gOut;
giy -= nw_val * (ix_se - ix) * gOut;
}
if (in_bounds(iy_ne, ix_ne, in_h, in_w)) {
T ne_val = input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW];
gix += ne_val * (iy_sw - iy) * gOut;
giy -= ne_val * (ix - ix_sw) * gOut;
}
if (in_bounds(iy_sw, ix_sw, in_h, in_w)) {
T sw_val = input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW];
gix -= sw_val * (iy - iy_ne) * gOut;
giy += sw_val * (ix_ne - ix) * gOut;
}
if (in_bounds(iy_se, ix_se, in_h, in_w)) {
T se_val = input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW];
gix += se_val * (iy - iy_nw) * gOut;
giy += se_val * (ix - ix_nw) * gOut;
}
}
if (grad_grid != nullptr) {
T* gGrid_ptr_NHW = grad_grid + index * grid_sW;
gGrid_ptr_NHW[0] = gix_mult * gix;
gGrid_ptr_NHW[1] = giy_mult * giy;
}
} else if (mode == Mode::nearest) {
int ix_nearest = static_cast<int>(std::nearbyint(ix));
int iy_nearest = static_cast<int>(std::nearbyint(iy));
int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW;
T* gInp_ptr_NC = grad_input + n * inp_sN;
for (int c = 0; c < out_c;
++c, gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) {
atomic_add(gInp_ptr_NC, iy_nearest, ix_nearest, inp_sH, inp_sW, in_h,
in_w, grad_output[gOut_offset]);
}
if (grad_grid != nullptr) {
T* gGrid_ptr_NHW = grad_grid + index * grid_sW;
gGrid_ptr_NHW[0] = static_cast<T>(0);
gGrid_ptr_NHW[1] = static_cast<T>(0);
}
}
}
}
template <typename T>
class GridSampleGradOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.cuda_device_context();
auto align_corners = ctx.Attr<bool>("align_corners");
auto padding_mode_s = ctx.Attr<std::string>("padding_mode");
auto mode_s = ctx.Attr<std::string>("mode");
PaddingMode padding_mode;
Mode mode;
if (padding_mode_s == "border") {
padding_mode = PaddingMode::border;
} else if (padding_mode_s == "reflection") {
padding_mode = PaddingMode::reflect;
} else {
padding_mode = PaddingMode::zeros;
}
if (mode_s == "nearest") {
mode = Mode::nearest;
} else {
mode = Mode::bilinear;
}
auto* input = ctx.Input<Tensor>("X");
auto* grid = ctx.Input<Tensor>("Grid");
auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
const int n = grid->dims()[0];
const int out_h = grid->dims()[1];
const int out_w = grid->dims()[2];
const int c = input->dims()[1];
const int in_h = input->dims()[2];
const int in_w = input->dims()[3];
auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
input_grad->mutable_data<T>(ctx.GetPlace());
phi::funcs::SetConstant<paddle::platform::CUDADeviceContext, T>()(
ctx.template device_context<paddle::platform::CUDADeviceContext>(),
input_grad, static_cast<T>(0));
T* grid_grad_data = nullptr;
if (ctx.HasOutput(framework::GradVarName("Grid"))) {
auto* grid_grad = ctx.Output<Tensor>(framework::GradVarName("Grid"));
grid_grad_data = grid_grad->mutable_data<T>(ctx.GetPlace());
}
int count = static_cast<int>(n * out_h * out_w);
auto cu_stream = dev_ctx.stream();
platform::GpuLaunchConfig config =
platform::GetGpuLaunchConfig1D(dev_ctx, count);
grid_sampler_cuda_backward_kernel<
T><<<config.block_per_grid, config.thread_per_block, 0, cu_stream>>>(
count, output_grad->data<T>(), input->data<T>(), grid->data<T>(), n, c,
out_h, out_w, in_h, in_w, input_grad->data<T>(), grid_grad_data, mode,
padding_mode, align_corners);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(grid_sampler, ops::GridSampleOpCUDAKernel<float>,
ops::GridSampleOpCUDAKernel<double>);
REGISTER_OP_CUDA_KERNEL(grid_sampler_grad,
ops::GridSampleGradOpCUDAKernel<float>,
ops::GridSampleGradOpCUDAKernel<double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <iostream>
#include <string>
#include <utility>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/hostdevice.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
enum class Mode {
bilinear,
nearest,
};
enum class PaddingMode { zeros, border, reflect };
using Tensor = framework::Tensor;
template <typename T, size_t D, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
using Array3 = Eigen::DSizes<int64_t, 3>;
using Array4 = Eigen::DSizes<int64_t, 4>;
template <typename T>
static inline bool isInBound(T x, T y, T x_max, T y_max) {
if (x < 0 || x > x_max || y < 0 || y > y_max) {
return false;
}
return true;
}
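// The CPU path below mirrors the CUDA helpers above, but operates on whole
// grid slices at once through Eigen expressions instead of per-element device
// functions.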
template <typename T>
static inline void unnormalize(const platform::CPUDeviceContext& ctx,
Tensor* grid_slice,
const int max_val, // height-1 or width-1
bool align_corners) {
auto& place = *ctx.eigen_device();
auto grid_slice_t = EigenTensor<T, 3>::From(*grid_slice);
if (!align_corners) {
auto factor = static_cast<T>((max_val + 1) * 0.5);
grid_slice_t.device(place) =
(grid_slice_t + static_cast<T>(1)) * factor - static_cast<T>(0.5);
} else {
auto factor = static_cast<T>(max_val * 0.5);
grid_slice_t.device(place) = (grid_slice_t + static_cast<T>(1)) * factor;
}
}
template <typename T>
static inline void clip(const platform::CPUDeviceContext& ctx,
Tensor* grid_slice,
const int max_val, // height-1 or width-1
bool align_corners, std::string padding_mode) {
auto& place = *ctx.eigen_device();
auto grid_slice_t = EigenTensor<T, 3>::From(*grid_slice);
if (padding_mode == "border") {
grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast<T>(0))
.cwiseMin(static_cast<T>(max_val));
} else if (padding_mode == "reflection") {
if (align_corners) {
auto double_range = static_cast<T>(max_val * 2);
auto grid_abs = grid_slice_t.abs();
auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
grid_slice_t.device(place) = extra.cwiseMin(double_range - extra);
if (max_val == 0) {
grid_slice_t.device(place) = grid_slice_t.constant(static_cast<T>(0));
}
} else {
auto double_range = static_cast<T>((max_val + 1) * 2);
auto grid_abs = (grid_slice_t + static_cast<T>(0.5)).abs();
auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
grid_slice_t.device(place) =
extra.cwiseMin(double_range - extra) - static_cast<T>(0.5);
grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast<T>(0))
.cwiseMin(static_cast<T>(max_val));
}
}
}
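// clipWithMask applies the same border/reflection handling as clip, and also
// fills grid_scale with the derivative of the clipped pixel coordinate with
// respect to the normalized grid value: the unnormalization factor, negated
// where a reflection flips direction and zeroed where the coordinate was
// clamped. gatherBilinearGrad later multiplies this into the grid gradient.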
template <typename T>
static inline void clipWithMask(const platform::CPUDeviceContext& ctx,
const int max_val, // height-1 or width-1
bool align_corners, std::string padding_mode,
Tensor* grid_slice, Tensor* grid_scale) {
auto& place = *ctx.eigen_device();
grid_scale->mutable_data<T>(grid_slice->dims(), ctx.GetPlace());
auto grid_slice_t = EigenTensor<T, 3>::From(*grid_slice);
auto factor = static_cast<T>(max_val * 0.5);
if (!align_corners) {
factor = static_cast<T>((max_val + 1) * 0.5);
}
auto grid_scale_t = EigenTensor<T, 3>::From(*grid_scale).setConstant(factor);
if (padding_mode == "border") {
// auto bounded_lo = grid_slice_t.cwiseMax(static_cast<T>(0));
auto res = grid_slice_t.cwiseMax(static_cast<T>(0))
.cwiseMin(static_cast<T>(max_val));
auto in_bound = (res == grid_slice_t);
grid_scale_t.device(place) = grid_scale_t * in_bound.template cast<T>();
grid_slice_t.device(place) = res;
} else if (padding_mode == "reflection") {
if (align_corners) {
auto double_range = static_cast<T>(max_val * 2);
auto is_neg = (grid_slice_t < static_cast<T>(0));
auto grid_abs = grid_slice_t.abs();
auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
auto one_more_flip = (extra > (double_range - extra));
grid_scale_t.device(place) =
grid_scale_t * ((is_neg == one_more_flip).template cast<T>() -
(is_neg != one_more_flip).template cast<T>());
grid_slice_t.device(place) = extra.cwiseMin(double_range - extra);
if (max_val == 0) {
grid_slice_t.device(place) = grid_slice_t.constant(static_cast<T>(0));
}
} else {
auto double_range = static_cast<T>((max_val + 1) * 2);
auto grid_abs = (grid_slice_t + static_cast<T>(0.5)).abs();
auto is_neg = ((grid_slice_t + static_cast<T>(0.5)) < static_cast<T>(0));
auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
auto one_more_flip = (extra > (double_range - extra));
auto reflected =
extra.cwiseMin(double_range - extra) - static_cast<T>(0.5);
auto clipped = reflected.cwiseMax(static_cast<T>(0))
.cwiseMin(static_cast<T>(max_val));
auto in_bound = (clipped == reflected).template cast<T>();
grid_scale_t.device(place) =
grid_scale_t * ((is_neg == one_more_flip).template cast<T>() -
(is_neg != one_more_flip).template cast<T>()) *
in_bound;
grid_slice_t.device(place) = clipped;
}
}
}
template <typename T>
static void calcGridLocations(const platform::CPUDeviceContext& ctx,
const Tensor& grid, const int in_h,
const int in_w, bool align_corners,
std::string padding_mode, Tensor* grid_x,
Tensor* grid_y) {
const int n = grid.dims()[0];
const int out_h = grid.dims()[1];
const int out_w = grid.dims()[2];
// split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim
T* grid_x_data = grid_x->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
T* grid_y_data = grid_y->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
const T* grid_data = grid.data<T>();
for (int i = 0; i < n * out_h * out_w; i++) {
grid_x_data[i] = grid_data[2 * i];
grid_y_data[i] = grid_data[(2 * i) + 1];
}
unnormalize<T>(ctx, grid_x, in_w - 1, align_corners);
unnormalize<T>(ctx, grid_y, in_h - 1, align_corners);
clip<T>(ctx, grid_x, in_w - 1, align_corners, padding_mode);
clip<T>(ctx, grid_y, in_h - 1, align_corners, padding_mode);
}
template <typename T>
static void calcGridLocationsWithGrad(const platform::CPUDeviceContext& ctx,
const Tensor& grid, const int in_h,
const int in_w, bool align_corners,
std::string padding_mode, Tensor* grid_x,
Tensor* grid_y, Tensor* grid_x_scale,
Tensor* grid_y_scale) {
const int n = grid.dims()[0];
const int out_h = grid.dims()[1];
const int out_w = grid.dims()[2];
// split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim
T* grid_x_data = grid_x->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
T* grid_y_data = grid_y->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
const T* grid_data = grid.data<T>();
for (int i = 0; i < n * out_h * out_w; i++) {
grid_x_data[i] = grid_data[2 * i];
grid_y_data[i] = grid_data[(2 * i) + 1];
}
unnormalize<T>(ctx, grid_x, in_w - 1, align_corners);
unnormalize<T>(ctx, grid_y, in_h - 1, align_corners);
clipWithMask<T>(ctx, in_w - 1, align_corners, padding_mode, grid_x,
grid_x_scale);
clipWithMask<T>(ctx, in_h - 1, align_corners, padding_mode, grid_y,
grid_y_scale);
}
template <typename T>
static void getGridPointValue(const Tensor& input, Tensor* output,
const Tensor& x, const Tensor& y) {
const int n = input.dims()[0];
const int c = input.dims()[1];
const int in_h = input.dims()[2];
const int in_w = input.dims()[3];
const int out_h = x.dims()[1];
const int out_w = x.dims()[2];
auto x_t = EigenTensor<T, 3>::From(x);
auto y_t = EigenTensor<T, 3>::From(y);
auto output_t = EigenTensor<T, 4>::From(*output).setConstant((T)0);
auto input_t = EigenTensor<T, 4>::From(input);
for (int i = 0; i < n; i++) {
for (int k = 0; k < out_h; k++) {
for (int l = 0; l < out_w; l++) {
if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1),
(T)(in_h - 1))) {
for (int j = 0; j < c; j++) {
output_t(i, j, k, l) =
input_t(i, j, static_cast<int>(round(y_t(i, k, l))),
static_cast<int>(round(x_t(i, k, l))));
}
}
}
}
}
}
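// allNeigbors collects, for every sampling point, the coordinates of its four
// neighbouring corners (x_w/x_e along width, y_n/y_s along height), the
// distances to them, and the input values at those corners. Both the bilinear
// forward pass and the bilinear gradient reuse this data.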
template <typename T>
static void allNeigbors(const platform::CPUDeviceContext& ctx,
const Tensor& input, Tensor* grid_x, Tensor* grid_y,
Tensor* x_w, Tensor* x_e, Tensor* y_n,
Tensor* y_s, // positions
Tensor* d_w, Tensor* d_e, Tensor* d_n,
Tensor* d_s, // distance
Tensor* v_wn, Tensor* v_en, Tensor* v_ws,
Tensor* v_es) { // values
auto& place = *ctx.eigen_device();
const int c = input.dims()[1];
const int n = grid_x->dims()[0];
const int out_h = grid_x->dims()[1];
const int out_w = grid_x->dims()[2];
// calculate coords of 4 corner points
x_w->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
x_e->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
y_n->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
y_s->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
auto x_w_t = EigenTensor<T, 3>::From(*x_w);
auto x_e_t = EigenTensor<T, 3>::From(*x_e);
auto y_n_t = EigenTensor<T, 3>::From(*y_n);
auto y_s_t = EigenTensor<T, 3>::From(*y_s);
auto grid_x_t = EigenTensor<T, 3>::From(*grid_x);
auto grid_y_t = EigenTensor<T, 3>::From(*grid_y);
x_w_t.device(place) = grid_x_t.floor();
x_e_t.device(place) = x_w_t + static_cast<T>(1);
y_n_t.device(place) = grid_y_t.floor();
y_s_t.device(place) = y_n_t + static_cast<T>(1);
// calculate distances to 4 sides
d_w->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
d_e->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
d_n->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
d_s->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
auto d_w_t = EigenTensor<T, 3>::From(*d_w);
auto d_e_t = EigenTensor<T, 3>::From(*d_e);
auto d_n_t = EigenTensor<T, 3>::From(*d_n);
auto d_s_t = EigenTensor<T, 3>::From(*d_s);
d_w_t.device(place) = grid_x_t - x_w_t;
d_e_t.device(place) = x_e_t - grid_x_t;
d_n_t.device(place) = grid_y_t - y_n_t;
d_s_t.device(place) = y_s_t - grid_y_t;
// calc 4 corner points value
v_wn->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
v_en->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
v_ws->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
v_es->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
getGridPointValue<T>(input, v_wn, *x_w, *y_n);
getGridPointValue<T>(input, v_en, *x_e, *y_n);
getGridPointValue<T>(input, v_ws, *x_w, *y_s);
getGridPointValue<T>(input, v_es, *x_e, *y_s);
}
template <typename T>
static void bilinearInter(const platform::CPUDeviceContext& ctx,
const Tensor& input, Tensor* grid_x, Tensor* grid_y,
Tensor* out) {
auto& place = *ctx.eigen_device();
const int n = grid_x->dims()[0];
const int out_h = grid_x->dims()[1];
const int out_w = grid_x->dims()[2];
const int c = input.dims()[1];
Tensor x_w, x_e, y_n, y_s;
Tensor d_w, d_e, d_n, d_s;
Tensor v_wn, v_en, v_ws, v_es;
allNeigbors<T>(ctx, input, grid_x, grid_y, &x_w, &x_e, &y_n, &y_s, &d_w, &d_e,
&d_n, &d_s, &v_wn, &v_en, &v_ws, &v_es);
auto d_w_t = EigenTensor<T, 3>::From(d_w);
auto d_e_t = EigenTensor<T, 3>::From(d_e);
auto d_n_t = EigenTensor<T, 3>::From(d_n);
auto d_s_t = EigenTensor<T, 3>::From(d_s);
auto d_w_scaled_t =
d_w_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
auto d_e_scaled_t =
d_e_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
auto d_n_scaled_t =
d_n_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
auto d_s_scaled_t =
d_s_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
auto v_wn_t = EigenTensor<T, 4>::From(v_wn);
auto v_en_t = EigenTensor<T, 4>::From(v_en);
auto v_ws_t = EigenTensor<T, 4>::From(v_ws);
auto v_es_t = EigenTensor<T, 4>::From(v_es);
auto output_t = EigenTensor<T, 4>::From(*out);
  // bilinear interpolation by 4 corner points
output_t.device(place) = v_wn_t * d_e_scaled_t * d_s_scaled_t +
v_en_t * d_w_scaled_t * d_s_scaled_t +
v_ws_t * d_e_scaled_t * d_n_scaled_t +
v_es_t * d_w_scaled_t * d_n_scaled_t;
}
template <typename T>
static void nearestInter(const platform::CPUDeviceContext& ctx,
const Tensor& input, Tensor* grid_x, Tensor* grid_y,
Tensor* out) {
auto& place = *ctx.eigen_device();
auto grid_x_t = EigenTensor<T, 3>::From(*grid_x);
auto grid_y_t = EigenTensor<T, 3>::From(*grid_y);
grid_x_t = grid_x_t.round();
grid_y_t = grid_y_t.round();
getGridPointValue<T>(input, out, *grid_x, *grid_y);
}
template <typename T>
static void gatherOutputGradToInputGrad(const Tensor& output_grad,
Tensor* input_grad, const Tensor& x,
const Tensor& y, const Tensor& d1,
const Tensor& d2) {
const int n = output_grad.dims()[0];
const int c = output_grad.dims()[1];
const int out_h = output_grad.dims()[2];
const int out_w = output_grad.dims()[3];
const int in_h = input_grad->dims()[2];
const int in_w = input_grad->dims()[3];
auto x_t = EigenTensor<T, 3>::From(x);
auto y_t = EigenTensor<T, 3>::From(y);
auto d1_t = EigenTensor<T, 3>::From(d1);
auto d2_t = EigenTensor<T, 3>::From(d2);
auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
for (int i = 0; i < n; i++) {
for (int k = 0; k < out_h; k++) {
for (int l = 0; l < out_w; l++) {
if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1),
(T)(in_h - 1))) {
for (int j = 0; j < c; j++) {
input_grad_t(i, j, static_cast<int>(round(y_t(i, k, l))),
static_cast<int>(round(x_t(i, k, l)))) +=
output_grad_t(i, j, k, l) * d1_t(i, k, l) * d2_t(i, k, l);
}
}
}
}
}
}
template <typename T>
static void gatherOutputGradToInputGrad(const Tensor& output_grad,
Tensor* input_grad, const Tensor& x,
const Tensor& y) {
const int n = output_grad.dims()[0];
const int c = output_grad.dims()[1];
const int out_h = output_grad.dims()[2];
const int out_w = output_grad.dims()[3];
const int in_h = input_grad->dims()[2];
const int in_w = input_grad->dims()[3];
auto x_t = EigenTensor<T, 3>::From(x);
auto y_t = EigenTensor<T, 3>::From(y);
auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
for (int i = 0; i < n; i++) {
for (int k = 0; k < out_h; k++) {
for (int l = 0; l < out_w; l++) {
if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1),
(T)(in_h - 1))) {
for (int j = 0; j < c; j++) {
input_grad_t(i, j, static_cast<int>(round(y_t(i, k, l))),
static_cast<int>(round(x_t(i, k, l)))) +=
output_grad_t(i, j, k, l);
}
}
}
}
}
}
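// gatherBilinearGrad back-propagates through bilinear sampling on the CPU: it
// scatters output_grad into input_grad through the four corner taps and, when
// grid_grad is requested, accumulates the analytic derivative of the bilinear
// weights with respect to x and y, scaled by the masks from clipWithMask,
// before interleaving the x/y results into the (n, h, w, 2) grid gradient.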
template <typename T>
static void gatherBilinearGrad(const platform::CPUDeviceContext& ctx,
const Tensor& input, const Tensor& output_grad,
Tensor* grid_x, Tensor* grid_y,
Tensor* grid_x_scale, Tensor* grid_y_scale,
Tensor* input_grad, Tensor* grid_grad) {
const int n = grid_x->dims()[0];
const int out_h = grid_x->dims()[1];
const int out_w = grid_x->dims()[2];
const int c = input.dims()[1];
Tensor x_w, x_e, y_n, y_s;
Tensor d_w, d_e, d_n, d_s;
Tensor v_wn, v_en, v_ws, v_es;
allNeigbors<T>(ctx, input,
grid_x, // grid_x
grid_y, // grid_y
&x_w, &x_e, &y_n, &y_s, &d_w, &d_e, &d_n, &d_s, &v_wn, &v_en,
&v_ws, &v_es);
// gather output grad value to input grad by corner point coords and weight
gatherOutputGradToInputGrad<T>(output_grad, input_grad, x_w, y_n, d_e, d_s);
gatherOutputGradToInputGrad<T>(output_grad, input_grad, x_w, y_s, d_e, d_n);
gatherOutputGradToInputGrad<T>(output_grad, input_grad, x_e, y_n, d_w, d_s);
gatherOutputGradToInputGrad<T>(output_grad, input_grad, x_e, y_s, d_w, d_n);
auto v_wn_t = EigenTensor<T, 4>::From(v_wn);
auto v_en_t = EigenTensor<T, 4>::From(v_en);
auto v_ws_t = EigenTensor<T, 4>::From(v_ws);
auto v_es_t = EigenTensor<T, 4>::From(v_es);
auto d_w_t = EigenTensor<T, 3>::From(d_w);
auto d_e_t = EigenTensor<T, 3>::From(d_e);
auto d_n_t = EigenTensor<T, 3>::From(d_n);
auto d_s_t = EigenTensor<T, 3>::From(d_s);
auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
if (grid_grad != nullptr) {
Tensor grid_grad_x, grid_grad_y;
grid_grad_x.mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
grid_grad_y.mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
auto grid_grad_x_t =
EigenTensor<T, 3>::From(grid_grad_x).setConstant(static_cast<T>(0.0));
auto grid_grad_y_t =
EigenTensor<T, 3>::From(grid_grad_y).setConstant(static_cast<T>(0.0));
for (int i = 0; i < n; i++) {
for (int j = 0; j < c; j++) {
for (int k = 0; k < out_h; k++) {
for (int l = 0; l < out_w; l++) {
grid_grad_x_t(i, k, l) +=
((v_en_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_s_t(i, k, l) +
(v_es_t(i, j, k, l) - v_ws_t(i, j, k, l)) * d_n_t(i, k, l)) *
output_grad_t(i, j, k, l);
grid_grad_y_t(i, k, l) +=
((v_ws_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_e_t(i, k, l) +
(v_es_t(i, j, k, l) - v_en_t(i, j, k, l)) * d_w_t(i, k, l)) *
output_grad_t(i, j, k, l);
}
}
}
}
// const T x_max = static_cast<T>(in_w - 1);
// const T y_max = static_cast<T>(in_h - 1);
auto grid_x_scale_t = EigenTensor<T, 3>::From(*grid_x_scale);
auto grid_y_scale_t = EigenTensor<T, 3>::From(*grid_y_scale);
grid_grad_x_t = grid_grad_x_t * grid_x_scale_t;
grid_grad_y_t = grid_grad_y_t * grid_y_scale_t;
// gather grid_grad [x, y] in 3rd Dim
T* grid_grad_data = grid_grad->data<T>();
T* grid_grad_x_data = grid_grad_x.data<T>();
T* grid_grad_y_data = grid_grad_y.data<T>();
for (int i = 0; i < n * out_h * out_w; i++) {
grid_grad_data[2 * i] = grid_grad_x_data[i];
grid_grad_data[2 * i + 1] = grid_grad_y_data[i];
}
}
}
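// CPU forward kernel: computes pixel-space sampling locations with
// calcGridLocations, then either bilinearly interpolates or, in nearest mode,
// rounds the locations and gathers the values directly.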
template <typename DeviceContext, typename T>
class GridSampleOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto align_corners = ctx.Attr<bool>("align_corners");
auto padding_mode = ctx.Attr<std::string>("padding_mode");
auto mode = ctx.Attr<std::string>("mode");
auto* input = ctx.Input<Tensor>("X");
auto* grid = ctx.Input<Tensor>("Grid");
const int n = grid->dims()[0];
const int out_h = grid->dims()[1];
const int out_w = grid->dims()[2];
const int c = input->dims()[1];
const int in_h = input->dims()[2];
const int in_w = input->dims()[3];
auto* output = ctx.Output<Tensor>("Output");
output->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
phi::funcs::SetConstant<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), output,
static_cast<T>(0));
Tensor grid_x, grid_y;
calcGridLocations<T>(
ctx.template device_context<platform::CPUDeviceContext>(), *grid, in_h,
in_w, align_corners, padding_mode, &grid_x, &grid_y);
if (mode == "bilinear") {
bilinearInter<T>(
ctx.template device_context<platform::CPUDeviceContext>(), *input,
&grid_x, &grid_y, output);
} else if (mode == "nearest") {
auto grid_x_t = EigenTensor<T, 3>::From(grid_x);
auto grid_y_t = EigenTensor<T, 3>::From(grid_y);
grid_x_t = grid_x_t.round();
grid_y_t = grid_y_t.round();
getGridPointValue<T>(*input, output, grid_x, grid_y);
}
}
};
template <typename DeviceContext, typename T>
class GridSampleGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto align_corners = ctx.Attr<bool>("align_corners");
auto padding_mode = ctx.Attr<std::string>("padding_mode");
auto mode = ctx.Attr<std::string>("mode");
auto* input = ctx.Input<Tensor>("X");
auto* grid = ctx.Input<Tensor>("Grid");
auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
const int n = grid->dims()[0];
const int out_h = grid->dims()[1];
const int out_w = grid->dims()[2];
const int c = input->dims()[1];
const int in_h = input->dims()[2];
const int in_w = input->dims()[3];
auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
input_grad->mutable_data<T>({n, c, in_h, in_w}, ctx.GetPlace());
phi::funcs::SetConstant<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), input_grad,
static_cast<T>(0));
Tensor* grid_grad = nullptr;
if (ctx.HasOutput(framework::GradVarName("Grid"))) {
grid_grad = ctx.Output<Tensor>(framework::GradVarName("Grid"));
grid_grad->mutable_data<T>({n, out_h, out_w, 2}, ctx.GetPlace());
phi::funcs::SetConstant<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), grid_grad,
static_cast<T>(0));
}
Tensor grid_x, grid_y;
Tensor grid_x_scale, grid_y_scale;
calcGridLocationsWithGrad<T>(
ctx.template device_context<platform::CPUDeviceContext>(), *grid, in_h,
in_w, align_corners, padding_mode, &grid_x, &grid_y, &grid_x_scale,
&grid_y_scale);
if (mode == "bilinear") {
gatherBilinearGrad<T>(ctx.template device_context<DeviceContext>(),
*input, *output_grad, &grid_x, &grid_y,
&grid_x_scale, &grid_y_scale, input_grad,
grid_grad);
} else {
auto grid_x_t = EigenTensor<T, 3>::From(grid_x);
auto grid_y_t = EigenTensor<T, 3>::From(grid_y);
grid_x_t = grid_x_t.round();
grid_y_t = grid_y_t.round();
gatherOutputGradToInputGrad<T>(*output_grad, input_grad, grid_x, grid_y);
}
}
};
} // namespace operators
} // namespace paddle
...@@ -13,8 +13,13 @@
// limitations under the License.
#include "paddle/fluid/operators/index_select_op.h"
#include <memory>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/infermeta/binary.h"
namespace paddle {
namespace operators {
...@@ -24,52 +29,6 @@ class IndexSelectOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
platform::errors::InvalidArgument(
"Input(X) of IndexSelectOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true,
platform::errors::InvalidArgument(
"Input(Index) of IndexSelectOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
platform::errors::InvalidArgument(
"Output(Out) of IndexSelectOp should not be null."));
auto input_dim = ctx->GetInputDim("X");
auto index_dim = ctx->GetInputDim("Index");
auto dim = ctx->Attrs().Get<int>("dim");
PADDLE_ENFORCE_EQ(
dim < input_dim.size() && dim >= (0 - input_dim.size()), true,
platform::errors::OutOfRange(
"Attr(dim) is out of range, It's expected "
"to be in range of [-%d, %d]. But received Attr(dim) = %d.",
input_dim.size(), input_dim.size() - 1, dim));
PADDLE_ENFORCE_EQ(
index_dim.size() == 1 || (index_dim.size() == 2 && index_dim[1] == 1),
true, platform::errors::InvalidArgument(
"The 'shape' of Input(Index) must be 1-D tensor. "
"But received: the 'shape' of Input(Index) is [%s], "
"the dimension of Input(Index) is [%d].",
index_dim, index_dim.size()));
PADDLE_ENFORCE_EQ(index_dim[0] != 0, true,
platform::errors::InvalidArgument(
"The length of Input(Index) can't be 0."));
auto output_dim = phi::vectorize(input_dim);
if (dim < 0) {
dim += input_dim.size();
}
output_dim[dim] = index_dim[0];
ctx->SetOutputDim("Out", phi::make_ddim(output_dim));
auto type = ctx->GetInputsVarType("X")[0];
if (type == framework::proto::VarType::LOD_TENSOR) {
ctx->ShareLoD("X", /*->*/ "Out");
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
...@@ -148,20 +107,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(IndexSelectGradNoNeedBufferVarsInferer,
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(index_select, IndexSelectInferShapeFunctor,
PD_INFER_META(phi::IndexSelectInferMeta));
REGISTER_OPERATOR(index_select, ops::IndexSelectOp, ops::IndexSelectOpMaker, REGISTER_OPERATOR(index_select, ops::IndexSelectOp, ops::IndexSelectOpMaker,
ops::IndexSelectGradMaker<paddle::framework::OpDesc>,
ops::IndexSelectGradMaker<paddle::imperative::OpBase>); ops::IndexSelectGradMaker<paddle::imperative::OpBase>,
IndexSelectInferShapeFunctor);
REGISTER_OPERATOR(index_select_grad, ops::IndexSelectGradOp,
ops::IndexSelectGradNoNeedBufferVarsInferer);
REGISTER_OP_CPU_KERNEL(
index_select,
ops::IndexSelectKernel<paddle::platform::CPUDeviceContext, float>,
ops::IndexSelectKernel<paddle::platform::CPUDeviceContext, double>,
ops::IndexSelectKernel<paddle::platform::CPUDeviceContext, int>,
ops::IndexSelectKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
index_select_grad,
ops::IndexSelectGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::IndexSelectGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::IndexSelectGradKernel<paddle::platform::CPUDeviceContext, int>,
ops::IndexSelectGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/index_select_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
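// Forward kernel of index_select. For a flattened output element `idx`,
// `stride` is the product of the dimensions after `dim`, `size` is the output
// extent of `dim`, and `delta` = input_dim[dim] - size. Decomposing idx into
// (pre_idx, dim_idx, inner offset) and replacing dim_idx with the selected
// row index[dim_idx] gives the source element:
//   input_idx = idx + (delta * pre_idx + index[dim_idx] - dim_idx) * stride.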
template <typename T, typename IndexT>
__global__ void index_select_cuda_kernel(const T* input, T* output,
const IndexT* index, int64_t N,
int64_t stride, int64_t size,
int64_t delta) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N) {
return;
}
int64_t pre_idx = idx / (stride * size);
int64_t dim_idx = idx % (stride * size) / stride;
IndexT src_dim_idx = index[dim_idx];
int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride;
output[idx] = input[input_idx];
}
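// Backward kernel: walks output_grad elements and scatters them into
// input_grad with CudaAtomicAdd, since several entries of Index may select
// the same input row. input_grad is zeroed beforehand by
// index_select_grad_init.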
template <typename T, typename IndexT>
__global__ void index_select_grad_cuda_kernel(const T* output_grad,
T* input_grad,
const IndexT* index, int64_t nums,
int64_t N, int64_t stride,
int64_t size, int64_t delta) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N) {
return;
}
int64_t pre_idx = idx / (stride * size);
int64_t dim_idx = idx % (stride * size) / stride;
IndexT src_dim_idx = index[dim_idx];
int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride;
paddle::platform::CudaAtomicAdd(&input_grad[input_idx], output_grad[idx]);
}
template <typename T>
__global__ void index_select_grad_init(T* input_grad, int64_t N) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N) {
return;
}
input_grad[idx] = 0.0;
}
template <typename DeviceContext, typename T>
class IndexSelectCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<LoDTensor>("X");
auto* index = context.Input<LoDTensor>("Index");
auto* out = context.Output<LoDTensor>("Out");
int dim = context.Attr<int>("dim");
auto input_dim = in->dims();
auto output_dim = out->dims();
dim = dim >= 0 ? dim : dim + input_dim.size();
auto stride_dim = phi::stride(input_dim);
int64_t stride = stride_dim[dim];
int64_t size = output_dim[dim];
int64_t delta = input_dim[dim] - size;
const auto& index_type = framework::TransToProtoVarType(index->dtype());
bool index_type_match = index_type == framework::proto::VarType::INT64 ||
index_type == framework::proto::VarType::INT32;
PADDLE_ENFORCE_EQ(index_type_match, true,
platform::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(index_type),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
auto* in_data = in->data<T>();
auto* out_data = out->mutable_data<T>(context.GetPlace());
int64_t numel = out->numel();
auto stream =
context.template device_context<platform::CUDADeviceContext>().stream();
if (index_type == framework::proto::VarType::INT64) {
const int64_t* index_data = index->data<int64_t>();
index_select_cuda_kernel<T, int64_t><<<
(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, index_data,
numel, stride, size, delta);
platform::GpuStreamSync(stream);
} else {
const int* index_data = index->data<int>();
index_select_cuda_kernel<T, int><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) /
PADDLE_CUDA_NUM_THREADS,
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
in_data, out_data, index_data, numel, stride, size, delta);
platform::GpuStreamSync(stream);
}
}
};
template <typename DeviceContext, typename T>
class IndexSelectGradCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* output_grad = context.Input<LoDTensor>(framework::GradVarName("Out"));
auto* in_grad = context.Output<LoDTensor>(framework::GradVarName("X"));
auto* index = context.Input<LoDTensor>("Index");
auto* output_grad_data = output_grad->data<T>();
auto* in_grad_data = in_grad->mutable_data<T>(context.GetPlace());
int dim = context.Attr<int>("dim");
auto input_dim = in_grad->dims();
auto output_dim = output_grad->dims();
dim = dim >= 0 ? dim : dim + input_dim.size();
auto stride_dim = phi::stride(input_dim);
int64_t stride = stride_dim[dim];
int64_t size = output_dim[dim];
int64_t delta = input_dim[dim] - size;
const auto& index_type = framework::TransToProtoVarType(index->dtype());
bool index_type_match = index_type == framework::proto::VarType::INT64 ||
index_type == framework::proto::VarType::INT32;
PADDLE_ENFORCE_EQ(index_type_match, true,
platform::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(index_type),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
int64_t numel = in_grad->numel();
int64_t index_nums = index->numel();
int64_t out_nums = output_grad->numel();
auto stream =
context.template device_context<platform::CUDADeviceContext>().stream();
index_select_grad_init<
T><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_grad_data, numel);
if (index_type == framework::proto::VarType::INT64) {
const int64_t* index_data = index->data<int64_t>();
index_select_grad_cuda_kernel<T, int64_t><<<
(out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(output_grad_data, in_grad_data,
index_data, index_nums,
out_nums, stride, size, delta);
platform::GpuStreamSync(stream);
} else {
const int* index_data = index->data<int>();
index_select_grad_cuda_kernel<T, int><<<
(out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(output_grad_data, in_grad_data,
index_data, index_nums,
out_nums, stride, size, delta);
platform::GpuStreamSync(stream);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
index_select,
ops::IndexSelectCUDAKernel<paddle::platform::CUDADeviceContext, float>,
ops::IndexSelectCUDAKernel<paddle::platform::CUDADeviceContext, double>,
ops::IndexSelectCUDAKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::IndexSelectCUDAKernel<paddle::platform::CUDADeviceContext, int>,
ops::IndexSelectCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
index_select_grad,
ops::IndexSelectGradCUDAKernel<paddle::platform::CUDADeviceContext, float>,
ops::IndexSelectGradCUDAKernel<paddle::platform::CUDADeviceContext, double>,
ops::IndexSelectGradCUDAKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::IndexSelectGradCUDAKernel<paddle::platform::CUDADeviceContext, int>,
ops::IndexSelectGradCUDAKernel<paddle::platform::CUDADeviceContext,
int64_t>);
...@@ -91,41 +91,6 @@ void IndexSelectInner(const framework::ExecutionContext& context,
output->Resize(output_dim);
}
template <typename DeviceContext, typename T>
class IndexSelectKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto inputs = *context.Input<framework::LoDTensor>("X");
auto* index = context.Input<framework::LoDTensor>("Index");
auto* output = context.Output<framework::LoDTensor>("Out");
int dim = context.Attr<int>("dim");
if (dim < 0) {
dim += inputs.dims().size();
}
const auto& index_type = framework::TransToProtoVarType(index->dtype());
bool index_type_match = index_type == framework::proto::VarType::INT32 ||
index_type == framework::proto::VarType::INT64;
PADDLE_ENFORCE_EQ(index_type_match, true,
platform::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(index_type),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
if (index_type == framework::proto::VarType::INT32) {
IndexSelectInner<DeviceContext, T, int>(context, &inputs, *index, output,
dim);
} else if (index_type == framework::proto::VarType::INT64) {
IndexSelectInner<DeviceContext, T, int64_t>(context, &inputs, *index,
output, dim);
}
}
};
template <typename DeviceContext, typename T, class Enable = void>
struct IndexSelectAdd {
void operator()(const framework::ExecutionContext& ctx, int slice_size,
...@@ -197,43 +162,5 @@ void IndexSelectGradInner(const framework::ExecutionContext& context,
x_grad->Resize(output_dim);
}
template <typename DeviceContext, typename T>
class IndexSelectGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x_grad =
context.Output<framework::LoDTensor>(framework::GradVarName("X"));
auto* index = context.Input<framework::LoDTensor>("Index");
auto* out_grad =
context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
int dim = context.Attr<int>("dim");
if (dim < 0) {
dim += out_grad->dims().size();
}
const auto& index_type = framework::TransToProtoVarType(index->dtype());
bool index_type_match = index_type == framework::proto::VarType::INT32 ||
index_type == framework::proto::VarType::INT64;
PADDLE_ENFORCE_EQ(index_type_match, true,
platform::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(index_type),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
if (index_type == framework::proto::VarType::INT32) {
IndexSelectGradInner<DeviceContext, T, int>(context, *out_grad, *index,
x_grad, dim);
} else if (index_type == framework::proto::VarType::INT64) {
IndexSelectGradInner<DeviceContext, T, int64_t>(context, *out_grad,
*index, x_grad, dim);
}
}
};
} // namespace operators
} // namespace paddle
...@@ -12,12 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/index_select_op.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename DeviceContext, typename T>
class IndexSelectNPUKernel : public framework::OpKernel<T> {
public:
...
...@@ -14,10 +14,13 @@
#include <cmath>
#include <string>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/infermeta/binary.h"
namespace paddle {
namespace operators {
...@@ -60,40 +63,6 @@ class IscloseOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Isclose");
OP_INOUT_CHECK(ctx->HasInput("Other"), "Input", "Other", "Isclose");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Isclose");
auto input_dim = ctx->GetInputDim("Input");
auto other_dim = ctx->GetInputDim("Other");
PADDLE_ENFORCE_EQ(input_dim.size(), other_dim.size(),
platform::errors::PreconditionNotMet(
"Input(Input) and Input(Other) must have the same "
"dimension size."));
int n = input_dim.size();
bool is_runtime = ctx->IsRuntime();
for (int i = 0; i < n; i++) {
if (is_runtime) {
PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i],
platform::errors::PreconditionNotMet(
"The value at dim %d of Input(Input) is not "
"equal to the Input(Other): %ld != %ld.",
i, input_dim[i], other_dim[i]));
} else {
if (!(input_dim[i] < 0 || other_dim[i] < 0)) {
PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i],
platform::errors::PreconditionNotMet(
"The value at dim %d of Input(Input) is not "
"equal to the Input(Other): %ld != %ld.",
i, input_dim[i], other_dim[i]));
}
}
}
ctx->SetOutputDim("Out", input_dim);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
...@@ -115,8 +84,10 @@ class IscloseOpVarTypeInference : public framework::VarTypeInference {
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(isclose, IscloseInferShapeFunctor,
PD_INFER_META(phi::ValueCompareInferMeta));
REGISTER_OPERATOR(
isclose, ops::IscloseOp, ops::IscloseOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
ops::IscloseOpVarTypeInference); ops::IscloseOpVarTypeInference, IscloseInferShapeFunctor);
...@@ -11,7 +11,9 @@
#include <memory>
#include <string>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/infermeta/binary.h"
namespace paddle {
namespace operators {
...@@ -21,44 +23,6 @@ using framework::Tensor;
class KLDivLossOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "KLDivLoss");
OP_INOUT_CHECK(ctx->HasInput("Target"), "Input", "Target", "KLDivLoss");
OP_INOUT_CHECK(ctx->HasOutput("Loss"), "Output", "Loss", "KLDivLoss");
auto dim_x = ctx->GetInputDim("X");
auto dim_target = ctx->GetInputDim("Target");
PADDLE_ENFORCE_EQ(dim_x.size(), dim_target.size(),
platform::errors::InvalidArgument(
"Input(X) rank and Input(Target) rank should be "
"same, but received X rank(%d) != Target rank(%d)",
dim_x.size(), dim_target.size()));
for (int i = 0; i < dim_x.size(); i++) {
if (ctx->IsRuntime() || (dim_x[i] > 0 && dim_target[i] > 0)) {
PADDLE_ENFORCE_EQ(
dim_x[i], dim_target[i],
platform::errors::InvalidArgument(
"Input(X) and Input(Target) should in same shape. but received "
"X dimension[%d](%d) != Target dimension[%d](%d)",
i, dim_x[i], i, dim_target[i]));
}
}
auto reduction = ctx->Attrs().Get<std::string>("reduction");
auto reduction_valid = "mean" == reduction || "sum" == reduction ||
"batchmean" == reduction || "none" == reduction;
PADDLE_ENFORCE_EQ(
reduction_valid, true,
platform::errors::InvalidArgument(
"Attr(reduction) can only be 'none'|'batchmean'|'sum'|'mean'."));
if ("none" == reduction) {
ctx->SetOutputDim("Loss", dim_x);
} else {
ctx->SetOutputDim("Loss", {1});
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
...@@ -171,8 +135,12 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(KLDivLossGradNoNeedBufferVarInferer, "X");
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(kldiv_loss, KLDivInferShapeFunctor,
PD_INFER_META(phi::KLDivInferMeta));
REGISTER_OPERATOR(kldiv_loss, ops::KLDivLossOp, ops::KLDivLossOpMaker,
ops::KLDivLossOpGradMaker<paddle::framework::OpDesc>,
ops::KLDivLossOpGradMaker<paddle::imperative::OpBase>); ops::KLDivLossOpGradMaker<paddle::imperative::OpBase>,
KLDivInferShapeFunctor);
REGISTER_OPERATOR(kldiv_loss_grad, ops::KLDivLossOpGrad,
ops::KLDivLossGradNoNeedBufferVarInferer);
...@@ -18,8 +18,8 @@ limitations under the License. */
#include "paddle/fluid/framework/phi_utils.h"
#include "paddle/fluid/operators/set_value_op.h"
#include "paddle/fluid/operators/svd_helper.h"
#include "paddle/fluid/operators/tril_triu_op.h"
#include "paddle/phi/kernels/funcs/lapack/lapack_function.h"
#include "paddle/phi/kernels/funcs/tril_triu_compute.h"
#include "paddle/phi/kernels/math_kernel.h"
#include "paddle/phi/kernels/triangular_solve_kernel.h"
...@@ -404,11 +404,12 @@ void LU_Unpack(const DeviceContext& dev_ctx, const framework::Tensor* LU,
const auto W = udims[udims.size() - 1];
auto L_dataptr = L->mutable_data<T>(dev_ctx.GetPlace());
platform::ForRange<DeviceContext> x_for_range(dev_ctx, LU->numel());
TrilTriuCompute<T> tril_computer(LU->data<T>(), -1, true, H, W, L_dataptr); phi::funcs::TrilTriuCompute<T> tril_computer(LU->data<T>(), -1, true, H, W,
L_dataptr);
x_for_range(tril_computer);
TrilTriuCompute<T> triu_computer(LU->data<T>(), 0, false, H, W, phi::funcs::TrilTriuCompute<T> triu_computer(
U->mutable_data<T>(dev_ctx.GetPlace())); LU->data<T>(), 0, false, H, W, U->mutable_data<T>(dev_ctx.GetPlace()));
x_for_range(triu_computer);
// set L's diagonal 1
...@@ -532,14 +533,14 @@ class LUGradKernel : public framework::OpKernel<T> {
auto phil_rank = LmHdims.size();
auto phiu_rank = UmHdims.size();
platform::ForRange<DeviceContext> l_for_range(dev_ctx, phi_L.numel());
TrilTriuCompute<T> tril_computer(phi_L.data<T>(), -1, true, phi::funcs::TrilTriuCompute<T> tril_computer(
LmHdims[phil_rank - 2], phi_L.data<T>(), -1, true, LmHdims[phil_rank - 2],
LmHdims[phil_rank - 1], phi_L.data<T>());
l_for_range(tril_computer);
platform::ForRange<DeviceContext> u_for_range(dev_ctx, phi_U.numel());
TrilTriuCompute<T> triu_computer(phi_U.data<T>(), 0, false, phi::funcs::TrilTriuCompute<T> triu_computer(
UmHdims[phiu_rank - 2], phi_U.data<T>(), 0, false, UmHdims[phiu_rank - 2],
UmHdims[phiu_rank - 1], phi_U.data<T>());
u_for_range(triu_computer);
...@@ -591,8 +592,9 @@ class LUGradKernel : public framework::OpKernel<T> {
const auto W = phidims[phidims.size() - 1];
platform::ForRange<DeviceContext> x_for_range(dev_ctx,
phi_complement.numel());
TrilTriuCompute<T> tril_computer(phi_complement.data<T>(), -1, true, H, phi::funcs::TrilTriuCompute<T> tril_computer(
W, phi_complement_l.data<T>()); phi_complement.data<T>(), -1, true, H, W,
phi_complement_l.data<T>());
x_for_range(tril_computer);
Tensor_Sub<DeviceContext, T>(dev_ctx, phi, phi_complement_l, &phi);
...@@ -664,8 +666,8 @@ class LUGradKernel : public framework::OpKernel<T> {
const auto W = phidims[phidims.size() - 1];
platform::ForRange<DeviceContext> x_for_range(dev_ctx,
phi_complement.numel());
TrilTriuCompute<T> triu_computer(phi_complement.data<T>(), 0, false, H, W, phi::funcs::TrilTriuCompute<T> triu_computer(
phi_complement_u.data<T>()); phi_complement.data<T>(), 0, false, H, W, phi_complement_u.data<T>());
x_for_range(triu_computer);
Tensor_Sub<DeviceContext, T>(dev_ctx, phi, phi_complement_u, &phi);
...
...@@ -16,7 +16,8 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/lu_op.h" #include "paddle/fluid/operators/lu_op.h"
#include "paddle/fluid/operators/tril_triu_op.h" #include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/kernels/funcs/tril_triu_compute.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -87,7 +88,8 @@ class LU_UnpackGradKernel : public framework::OpKernel<T> { ...@@ -87,7 +88,8 @@ class LU_UnpackGradKernel : public framework::OpKernel<T> {
auto W = ldims[ldims.size() - 1]; auto W = ldims[ldims.size() - 1];
auto L_dataptr = dl_tril.mutable_data<T>(dev_ctx.GetPlace()); auto L_dataptr = dl_tril.mutable_data<T>(dev_ctx.GetPlace());
platform::ForRange<DeviceContext> l_for_range(dev_ctx, dl->numel()); platform::ForRange<DeviceContext> l_for_range(dev_ctx, dl->numel());
TrilTriuCompute<T> tril_computer(dl->data<T>(), -1, true, H, W, L_dataptr); phi::funcs::TrilTriuCompute<T> tril_computer(dl->data<T>(), -1, true, H, W,
L_dataptr);
l_for_range(tril_computer); l_for_range(tril_computer);
const auto udims = du->dims(); const auto udims = du->dims();
...@@ -96,7 +98,8 @@ class LU_UnpackGradKernel : public framework::OpKernel<T> { ...@@ -96,7 +98,8 @@ class LU_UnpackGradKernel : public framework::OpKernel<T> {
W = udims[udims.size() - 1]; W = udims[udims.size() - 1];
auto U_dataptr = du_triu.mutable_data<T>(dev_ctx.GetPlace()); auto U_dataptr = du_triu.mutable_data<T>(dev_ctx.GetPlace());
platform::ForRange<DeviceContext> u_for_range(dev_ctx, du->numel()); platform::ForRange<DeviceContext> u_for_range(dev_ctx, du->numel());
TrilTriuCompute<T> triu_computer(du->data<T>(), 0, false, H, W, U_dataptr); phi::funcs::TrilTriuCompute<T> triu_computer(du->data<T>(), 0, false, H, W,
U_dataptr);
u_for_range(triu_computer); u_for_range(triu_computer);
auto xdims = dx->dims(); auto xdims = dx->dims();
......
...@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/multiplex_op.h"
#include <memory> #include <memory>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -169,15 +169,3 @@ REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, ...@@ -169,15 +169,3 @@ REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker,
ops::MultiplexGradMaker<paddle::framework::OpDesc>, ops::MultiplexGradMaker<paddle::framework::OpDesc>,
ops::MultiplexGradMaker<paddle::imperative::OpBase>); ops::MultiplexGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp); REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp);
REGISTER_OP_CPU_KERNEL(
multiplex,
ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, float>,
ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, double>,
ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, int>,
ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
multiplex_grad,
ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, float>,
ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, double>,
ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, int>,
ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, int64_t>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/multiplex_op.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename Place, typename T>
class MultiplexGPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto ins = ctx.MultiInput<Tensor>("X");
auto* ids = ctx.Input<Tensor>("Ids");
auto* out = ctx.Output<Tensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
for (size_t i = 0; i < ins.size(); ++i) {
PADDLE_ENFORCE_GT(
ins[i]->numel(), 0,
platform::errors::OutOfRange(
"indexing will be out of bounds with size 0 for the %d-th input.",
i));
}
auto rows = ins[0]->dims()[0];
auto cols = ins[0]->numel() / rows;
// copy index to cpu
Tensor index_t_cpu;
paddle::framework::TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu);
auto* index = index_t_cpu.data<int32_t>();
auto stream = ctx.cuda_device_context().stream();
platform::CUDAPlace place = ctx.GetPlace();
for (auto i = 0; i < rows; i++) {
int32_t k = index[i];
PADDLE_ENFORCE_GE(k, 0, platform::errors::PreconditionNotMet(
"index must be nonnegative."));
PADDLE_ENFORCE_LT(static_cast<size_t>(k), ins.size(),
platform::errors::PreconditionNotMet(
"index exceeds the number of candidate tensors."));
memory::Copy(place, out->data<T>() + i * cols, place,
ins[k]->data<T>() + i * cols, cols * sizeof(T), stream);
}
}
};
template <typename Place, typename T>
class MultiplexGradGPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto* d_out = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* ids = ctx.Input<Tensor>("Ids");
auto d_ins = ctx.MultiOutput<Tensor>(framework::GradVarName("X"));
size_t idx = -1UL;
for (size_t i = 0; i < d_ins.size(); i++) {
if (d_ins[i]) {
d_ins[i]->mutable_data<T>(ctx.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*d_ins[i]);
t.device(*ctx.template device_context<Place>().eigen_device()) =
t.constant(static_cast<T>(0));
idx = i;
}
}
if (idx == -1UL) return;
auto rows = d_ins[idx]->dims()[0];
auto cols = d_ins[idx]->numel() / rows;
// copy index to cpu
Tensor index_t_cpu;
paddle::framework::TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu);
auto* index = index_t_cpu.data<int32_t>();
auto stream = ctx.cuda_device_context().stream();
platform::CUDAPlace place = ctx.GetPlace();
for (auto i = 0; i < rows; i++) {
size_t k = static_cast<size_t>(index[i]);
if (d_ins[k]) {
memory::Copy(place, d_ins[k]->data<T>() + i * cols, place,
d_out->data<T>() + i * cols, cols * sizeof(T), stream);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
multiplex,
ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, float>,
ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, double>,
ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, int>,
ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
multiplex_grad,
ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, float>,
ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, double>,
ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, int>,
ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, int64_t>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memcpy.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class MultiplexCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto ins = ctx.MultiInput<framework::Tensor>("X");
auto ids = ctx.Input<framework::Tensor>("Ids");
auto* out = ctx.Output<framework::Tensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
for (size_t i = 0; i < ins.size(); ++i) {
PADDLE_ENFORCE_GT(
ins[i]->numel(), 0,
platform::errors::OutOfRange(
"indexing will be out of bounds with size 0 for the %d-th input.",
i));
}
auto rows = ins[0]->dims()[0];
auto cols = ins[0]->numel() / rows;
auto index = ids->data<int32_t>();
platform::CPUPlace place = ctx.GetPlace();
for (auto i = 0; i < rows; i++) {
int32_t k = index[i];
PADDLE_ENFORCE_GE(k, 0, platform::errors::PreconditionNotMet(
"index must be nonnegative."));
PADDLE_ENFORCE_LT(static_cast<size_t>(k), ins.size(),
platform::errors::PreconditionNotMet(
"index exceeds the number of candidate tensors."));
memory::Copy(place, out->data<T>() + i * cols, place,
ins[k]->data<T>() + i * cols, cols * sizeof(T));
}
}
};
template <typename DeviceContext, typename T>
class MultiplexGradCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* ids = ctx.Input<framework::Tensor>("Ids");
auto d_ins =
ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
size_t idx = -1UL;
for (size_t i = 0; i < d_ins.size(); i++) {
if (d_ins[i]) {
d_ins[i]->mutable_data<T>(ctx.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*d_ins[i]);
t.device(*ctx.template device_context<DeviceContext>().eigen_device()) =
t.constant(static_cast<T>(0));
idx = i;
}
}
if (idx == -1UL) return;
auto rows = d_ins[idx]->dims()[0];
auto cols = d_ins[idx]->numel() / rows;
auto* index = ids->data<int32_t>();
platform::CPUPlace place = ctx.GetPlace();
for (auto i = 0; i < rows; i++) {
size_t k = static_cast<size_t>(index[i]);
if (d_ins[k]) {
memory::Copy(place, d_ins[k]->data<T>() + i * cols, place,
d_out->data<T>() + i * cols, cols * sizeof(T));
}
}
}
};
} // namespace operators
} // namespace paddle
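For reference, the multiplex kernels above gather whole rows: for each row i, the output row is copied from candidate tensor ins[index[i]]. A small self-contained sketch of that row-selection logic with plain arrays (hypothetical helper, not the Paddle API):
#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Row-wise multiplex: out[i, :] = ins[index[i]][i, :].
void MultiplexRows(const std::vector<const float*>& ins, const int32_t* index,
                   int rows, int cols, float* out) {
  for (int i = 0; i < rows; ++i) {
    const int32_t k = index[i];  // which candidate supplies row i
    std::memcpy(out + i * cols, ins[k] + i * cols, cols * sizeof(float));
  }
}

int main() {
  const float a[4] = {1, 1, 1, 1}, b[4] = {2, 2, 2, 2};  // two 2x2 candidates
  const int32_t ids[2] = {1, 0};
  float out[4];
  MultiplexRows({a, b}, ids, 2, 2, out);
  assert(out[0] == 2 && out[2] == 1);  // row 0 taken from b, row 1 from a
  return 0;
}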
...@@ -145,8 +145,6 @@ REGISTER_OPERATOR(qr, ops::QrOp, ops::QrOpMaker, ...@@ -145,8 +145,6 @@ REGISTER_OPERATOR(qr, ops::QrOp, ops::QrOpMaker,
REGISTER_OPERATOR(qr_grad, ops::QrGradOp); REGISTER_OPERATOR(qr_grad, ops::QrGradOp);
REGISTER_OP_CPU_KERNEL(qr, ops::QrCPUKernel<float>, ops::QrCPUKernel<double>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
qr_grad, ops::QrGradKernel<paddle::platform::CPUDeviceContext, float>, qr_grad, ops::QrGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::QrGradKernel<paddle::platform::CPUDeviceContext, double>); ops::QrGradKernel<paddle::platform::CPUDeviceContext, double>);
...@@ -48,85 +48,6 @@ static inline std::tuple<bool, bool> _parse_qr_mode(std::string mode) { ...@@ -48,85 +48,6 @@ static inline std::tuple<bool, bool> _parse_qr_mode(std::string mode) {
return std::make_tuple(compute_q, reduced); return std::make_tuple(compute_q, reduced);
} }
template <typename T>
class QrCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
bool compute_q;
bool reduced_mode;
const Tensor& x = *context.Input<Tensor>("X");
Tensor& q = *context.Output<Tensor>("Q");
Tensor& r = *context.Output<Tensor>("R");
std::string mode = context.Attr<std::string>("mode");
std::tie(compute_q, reduced_mode) = _parse_qr_mode(mode);
auto numel = x.numel();
PADDLE_ENFORCE_GT(numel, 0, platform::errors::PreconditionNotMet(
"The input of QR is empty."));
auto x_dims = x.dims();
int x_rank = x_dims.size();
int m = x_dims[x_rank - 2];
int n = x_dims[x_rank - 1];
int min_mn = std::min(m, n);
int k = reduced_mode ? min_mn : m;
int batch_size = numel / (m * n);
int x_stride = m * n;
int q_stride = m * k;
int r_stride = k * n;
auto* x_data = x.data<phi::dtype::Real<T>>();
T* q_data = nullptr;
if (compute_q) {
q_data = q.mutable_data<phi::dtype::Real<T>>(
context.GetPlace(),
size_t(batch_size * m * k * sizeof(phi::dtype::Real<T>)));
memset(q_data, 0,
size_t(batch_size * m * k * sizeof(phi::dtype::Real<T>)));
}
auto* r_data = r.mutable_data<phi::dtype::Real<T>>(
context.GetPlace(),
size_t(batch_size * k * n * sizeof(phi::dtype::Real<T>)));
memset(r_data, 0, size_t(batch_size * k * n * sizeof(phi::dtype::Real<T>)));
// Implement QR by calling Eigen
for (int i = 0; i < batch_size; ++i) {
const T* x_matrix_ptr = x_data + i * x_stride;
T* r_matrix_ptr = r_data + i * r_stride;
using EigenDynamicMatrix =
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
auto x_matrix = Eigen::Map<const EigenDynamicMatrix>(x_matrix_ptr, m, n);
Eigen::HouseholderQR<EigenDynamicMatrix> qr(x_matrix);
if (reduced_mode) {
auto qr_top_matrix = qr.matrixQR().block(0, 0, min_mn, n);
auto r_matrix_view =
qr_top_matrix.template triangularView<Eigen::Upper>();
auto r_matrix = EigenDynamicMatrix(r_matrix_view);
memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T));
} else {
auto r_matrix_view =
qr.matrixQR().template triangularView<Eigen::Upper>();
auto r_matrix = EigenDynamicMatrix(r_matrix_view);
memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T));
}
if (compute_q) {
T* q_matrix_ptr = q_data + i * q_stride;
if (reduced_mode) {
auto q_matrix =
qr.householderQ() * EigenDynamicMatrix::Identity(m, min_mn);
q_matrix.transposeInPlace();
memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T));
} else {
auto q_matrix =
qr.householderQ() * EigenDynamicMatrix::Identity(m, m);
q_matrix.transposeInPlace();
memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T));
}
}
}
}
};
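The deleted QrCPUKernel wraps Eigen::HouseholderQR; the core of its reduced-mode computation can be reproduced in a few lines of plain Eigen, sketched below for illustration (assumes Eigen3 is available; this is not the kernel itself, which additionally transposes Q and R before the row-major memcpy):
#include <Eigen/Dense>
#include <iostream>

int main() {
  using Mat = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
  Mat x = Mat::Random(5, 3);  // m = 5, n = 3, so min_mn = 3
  Eigen::HouseholderQR<Mat> qr(x);
  // Reduced mode: thin Q is m x min_mn, R is the upper-triangular top block (min_mn x n).
  Mat q = qr.householderQ() * Mat::Identity(5, 3);
  Mat r = qr.matrixQR().block(0, 0, 3, 3).triangularView<Eigen::Upper>();
  std::cout << (q * r - x).norm() << std::endl;  // reconstruction error, ~0
  return 0;
}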
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class QrGradKernel : public framework::OpKernel<T> { class QrGradKernel : public framework::OpKernel<T> {
public: public:
......
...@@ -9,9 +9,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -9,9 +9,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/roi_align_op.h"
#include <memory> #include <memory>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/ternary.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -23,79 +26,6 @@ class ROIAlignOp : public framework::OperatorWithKernel { ...@@ -23,79 +26,6 @@ class ROIAlignOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
platform::errors::NotFound("Input(X) of ROIAlignOp "
"is not found."));
PADDLE_ENFORCE_EQ(ctx->HasInput("ROIs"), true,
platform::errors::NotFound("Input(ROIs) of ROIAlignOp "
"is not found."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
platform::errors::NotFound("Output(Out) of ROIAlignOp "
"is not found."));
auto input_dims = ctx->GetInputDim("X");
auto rois_dims = ctx->GetInputDim("ROIs");
if (ctx->HasInput("RoisNum")) {
auto rois_num_dims = ctx->GetInputDim("RoisNum");
PADDLE_ENFORCE_EQ(
rois_num_dims.size(), 1,
platform::errors::InvalidArgument("The size of RoisNum should be 1"
", but received size = %d",
rois_num_dims.size()));
}
PADDLE_ENFORCE_EQ(
input_dims.size(), 4,
platform::errors::InvalidArgument(
"The format of Input(X) in"
"RoIAlignOp is NCHW. And the rank of input must be 4. "
"But received rank = %d",
input_dims.size()));
PADDLE_ENFORCE_EQ(rois_dims.size(), 2, platform::errors::InvalidArgument(
"The rank of Input(ROIs) "
"in RoIAlignOp should be 2. "
"But the rank of RoIs is %d",
rois_dims.size()));
if (ctx->IsRuntime()) {
PADDLE_ENFORCE_EQ(rois_dims[1], 4,
platform::errors::InvalidArgument(
"The second dimension "
"of Input(ROIs) should be 4. But received the "
"dimension = %d",
rois_dims[1]));
}
int pooled_height = ctx->Attrs().Get<int>("pooled_height");
int pooled_width = ctx->Attrs().Get<int>("pooled_width");
float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
PADDLE_ENFORCE_GT(pooled_height, 0,
platform::errors::InvalidArgument(
"The 'pooled_height' attribute in RoIAlignOp is "
"invalid. The height must be greater than 0. But "
"received 'pooled_height' = %d",
pooled_height));
PADDLE_ENFORCE_GT(pooled_width, 0,
platform::errors::InvalidArgument(
"The 'pooled_width' attribute in RoIAlignOp is "
"invalid. The width must be greater than 0. But "
"received 'pooled_width' = %d",
pooled_width));
PADDLE_ENFORCE_GT(spatial_scale, 0.0f,
platform::errors::InvalidArgument(
"The 'spatial_scale' attribute in RoIAlignOp is "
"invalid. The scale must be greater than 0. But "
"received 'spatial_scale' = %f",
spatial_scale));
auto out_dims = input_dims;
out_dims[0] = rois_dims[0];
out_dims[1] = input_dims[1];
out_dims[2] = pooled_height;
out_dims[3] = pooled_width;
ctx->SetOutputDim("Out", out_dims);
}
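The shape logic removed here reduces to one rule: Out is {num_rois, channels, pooled_height, pooled_width}, e.g. X of shape {2, 256, 64, 64} with 100 RoIs and a 7x7 pool gives {100, 256, 7, 7}. A one-line hypothetical helper for that rule:
#include <array>
#include <cstdint>

// ROIAlign output shape rule from the removed InferShape (X is NCHW).
std::array<int64_t, 4> RoiAlignOutDims(const std::array<int64_t, 4>& x_dims,
                                       int64_t num_rois, int pooled_h, int pooled_w) {
  return {num_rois, x_dims[1], pooled_h, pooled_w};
}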
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
...@@ -221,17 +151,16 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(RoiAlignGradNoNeedBufVarsInferer, "X"); ...@@ -221,17 +151,16 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(RoiAlignGradNoNeedBufVarsInferer, "X");
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(roi_align, RoiAlignInferShapeFunctor,
PD_INFER_META(phi::RoiAlignInferMeta));
REGISTER_OPERATOR(roi_align, ops::ROIAlignOp, ops::ROIAlignOpMaker, REGISTER_OPERATOR(roi_align, ops::ROIAlignOp, ops::ROIAlignOpMaker,
ops::ROIAlignGradMaker<paddle::framework::OpDesc>, ops::ROIAlignGradMaker<paddle::framework::OpDesc>,
ops::ROIAlignGradMaker<paddle::imperative::OpBase>); ops::ROIAlignGradMaker<paddle::imperative::OpBase>,
RoiAlignInferShapeFunctor);
REGISTER_OPERATOR(roi_align_grad, ops::ROIAlignGradOp, REGISTER_OPERATOR(roi_align_grad, ops::ROIAlignGradOp,
ops::RoiAlignGradNoNeedBufVarsInferer); ops::RoiAlignGradNoNeedBufVarsInferer);
REGISTER_OP_CPU_KERNEL(
roi_align_grad,
ops::CPUROIAlignGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::CPUROIAlignGradOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::CPUROIAlignGradOpKernel<paddle::platform::CPUDeviceContext, int>);
REGISTER_OP_VERSION(roi_align) REGISTER_OP_VERSION(roi_align)
.AddCheckpoint( .AddCheckpoint(
R"ROC( R"ROC(
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <limits>
#include <numeric>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
template <class T>
void bilinear_interpolate_gradient(const int height, const int width, T y, T x,
const T out_grad_this_bin, const T count,
T* batch_grad_data) {
int x_low, y_low, x_high, y_high;
T w1, w2, w3, w4;
if (y < -1.0 || y > height || x < -1.0 || x > width) {
w1 = w2 = w3 = w4 = 0;
x_low = x_high = y_low = y_high = -1;
return;
}
y = y <= 0 ? 0 : y;
x = x <= 0 ? 0 : x;
y_low = static_cast<int>(y);
x_low = static_cast<int>(x);
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = static_cast<T>(y_low);
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = static_cast<T>(x_low);
} else {
x_high = x_low + 1;
}
T ly = y - y_low, lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
T diff1 = out_grad_this_bin * w1 / count;
T diff2 = out_grad_this_bin * w2 / count;
T diff3 = out_grad_this_bin * w3 / count;
T diff4 = out_grad_this_bin * w4 / count;
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
*(batch_grad_data + y_low * width + x_low) += diff1;
*(batch_grad_data + y_low * width + x_high) += diff2;
*(batch_grad_data + y_high * width + x_low) += diff3;
*(batch_grad_data + y_high * width + x_high) += diff4;
}
}
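As a quick cross-check of the weights above: for an interior sample point (y, x), the four surrounding pixels receive fractions w1 = hy*hx, w2 = hy*lx, w3 = ly*hx, w4 = ly*lx of out_grad_this_bin / count, and the four weights always sum to 1. A standalone sketch (hypothetical helper, interior case only, ignoring the boundary clamping done in the kernel):
#include <cassert>
#include <cmath>

// Bilinear weights of a point (y, x) w.r.t. its surrounding integer grid cell.
void BilinearWeights(double y, double x, double* w /* w[0..3] */) {
  const double ly = y - std::floor(y), lx = x - std::floor(x);
  const double hy = 1.0 - ly, hx = 1.0 - lx;
  w[0] = hy * hx;  // (y_low, x_low)
  w[1] = hy * lx;  // (y_low, x_high)
  w[2] = ly * hx;  // (y_high, x_low)
  w[3] = ly * lx;  // (y_high, x_high)
}

int main() {
  double w[4];
  BilinearWeights(2.25, 3.5, w);
  assert(std::fabs(w[0] + w[1] + w[2] + w[3] - 1.0) < 1e-12);
  return 0;
}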
template <typename DeviceContext, typename T>
class CPUROIAlignGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<framework::Tensor>("X");
auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
auto* out_grad =
ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
auto pooled_height = ctx.Attr<int>("pooled_height");
auto pooled_width = ctx.Attr<int>("pooled_width");
auto spatial_scale = ctx.Attr<float>("spatial_scale");
auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
auto in_dims = in->dims();
auto aligned = ctx.Attr<bool>("aligned");
int channels = in_dims[1];
int height = in_dims[2];
int width = in_dims[3];
int rois_num = rois->dims()[0];
if (!in_grad) {
return;
}
Tensor roi_batch_id_list;
roi_batch_id_list.Resize({rois_num});
int* roi_batch_id_data =
roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
int rois_batch_size;
if (ctx.HasInput("RoisNum")) {
auto* rois_num_t = ctx.Input<framework::Tensor>("RoisNum");
rois_batch_size = rois_num_t->numel();
auto* rois_num_data = rois_num_t->data<int>();
int start = 0;
for (int n = 0; n < rois_batch_size; ++n) {
for (int i = start; i < start + rois_num_data[n]; ++i) {
roi_batch_id_data[i] = n;
}
start += rois_num_data[n];
}
} else {
auto rois_lod = rois->lod().back();
rois_batch_size = rois_lod.size() - 1;
for (int n = 0; n < rois_batch_size; ++n) {
for (std::size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
roi_batch_id_data[i] = n;
}
}
}
in_grad->mutable_data<T>(ctx.GetPlace());
auto& dev_ctx = ctx.template device_context<DeviceContext>();
phi::funcs::SetConstant<DeviceContext, T> set_zero;
set_zero(dev_ctx, in_grad, static_cast<T>(0));
int output_grad_size = out_grad->numel();
if ((!out_grad->IsInitialized()) || (output_grad_size <= 0)) {
return;
}
const T* rois_data = rois->data<T>();
const T* out_grad_data = out_grad->data<T>();
T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace());
auto in_stride = phi::stride(in->dims());
auto roi_stride = phi::stride(rois->dims());
auto out_stride = phi::stride(out_grad->dims());
T roi_offset = aligned ? T(0.5) : 0;
for (int n = 0; n < rois_num; ++n) {
int roi_batch_idx = roi_batch_id_data[n];
T roi_xmin = rois_data[0] * spatial_scale - roi_offset;
T roi_ymin = rois_data[1] * spatial_scale - roi_offset;
T roi_xmax = rois_data[2] * spatial_scale - roi_offset;
T roi_ymax = rois_data[3] * spatial_scale - roi_offset;
T roi_width = roi_xmax - roi_xmin;
T roi_height = roi_ymax - roi_ymin;
roi_width = std::max(roi_width, static_cast<T>(1.));
roi_height = std::max(roi_height, static_cast<T>(1.));
if (!aligned) {
roi_width = std::max(roi_width, static_cast<T>(1.));
roi_height = std::max(roi_height, static_cast<T>(1.));
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
for (int c = 0; c < channels; ++c) {
T* batch_grad_data =
in_grad_data + roi_batch_idx * in_stride[0] + c * in_stride[1];
const T* batch_out_grad_data =
out_grad_data + n * out_stride[0] + c * out_stride[1];
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int pool_index = ph * pooled_width + pw;
T out_grad_this_bin = batch_out_grad_data[pool_index];
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height);
int roi_bin_grid_w = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_width / pooled_width);
T count = roi_bin_grid_h * roi_bin_grid_w;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
const T y = roi_ymin + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h);
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T x = roi_xmin + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
bilinear_interpolate_gradient(height, width, y, x,
out_grad_this_bin, count,
batch_grad_data);
}
}
}
}
}
rois_data += roi_stride[0];
}
}
};
} // namespace operators
} // namespace paddle
...@@ -12,13 +12,16 @@ ...@@ -12,13 +12,16 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/roll_op.h"
#include <memory> #include <memory>
#include <vector> #include <vector>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/operators/utils.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -29,43 +32,6 @@ class RollOp : public framework::OperatorWithKernel { ...@@ -29,43 +32,6 @@ class RollOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
platform::errors::InvalidArgument(
"Input(X) of RollOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
platform::errors::InvalidArgument(
"Output(Out) of RollOp should not be null."));
auto dims = ctx->Attrs().Get<std::vector<int64_t>>("axis");
auto shifts = ctx->Attrs().Get<std::vector<int64_t>>("shifts");
if (!ctx->HasInput("ShiftsTensor")) {
if (dims.size() != 0) {
PADDLE_ENFORCE_EQ(dims.size(), shifts.size(),
platform::errors::InvalidArgument(
"When dims.size() != 0, dims.size() "
"should be equal to "
"shifts.size(). But received "
"dims.size() = %d, shifts.size() = %d",
dims.size(), shifts.size()));
} else {
PADDLE_ENFORCE_EQ(shifts.size(), 1,
platform::errors::InvalidArgument(
"When dims.size() == 0, shifts.size() "
"should be equal to 1, But received "
"shifts.size() = %d",
shifts.size()));
}
}
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
auto type = ctx->GetInputsVarType("X")[0];
if (type == framework::proto::VarType::LOD_TENSOR) {
ctx->ShareLoD("X", /*->*/ "Out");
}
}
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
...@@ -149,29 +115,15 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(RollGradNoNeedBufferVarsInferer, "X"); ...@@ -149,29 +115,15 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(RollGradNoNeedBufferVarsInferer, "X");
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(roll, RollInferShapeFunctor,
PD_INFER_META(phi::RollInferMeta));
REGISTER_OPERATOR(roll, ops::RollOp, ops::RollOpMaker, REGISTER_OPERATOR(roll, ops::RollOp, ops::RollOpMaker,
ops::RollGradMaker<paddle::framework::OpDesc>, ops::RollGradMaker<paddle::framework::OpDesc>,
ops::RollGradMaker<paddle::imperative::OpBase>); ops::RollGradMaker<paddle::imperative::OpBase>,
RollInferShapeFunctor);
REGISTER_OPERATOR(roll_grad, ops::RollGradOp, REGISTER_OPERATOR(roll_grad, ops::RollGradOp,
ops::RollGradNoNeedBufferVarsInferer); ops::RollGradNoNeedBufferVarsInferer);
REGISTER_OP_CPU_KERNEL(
roll, ops::RollKernel<paddle::platform::CPUDeviceContext, float>,
ops::RollKernel<paddle::platform::CPUDeviceContext, double>,
ops::RollKernel<paddle::platform::CPUDeviceContext, int>,
ops::RollKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::RollKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::RollKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_CPU_KERNEL(
roll_grad, ops::RollGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::RollGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::RollGradKernel<paddle::platform::CPUDeviceContext, int>,
ops::RollGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::RollGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::RollGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_VERSION(roll) REGISTER_OP_VERSION(roll)
.AddCheckpoint( .AddCheckpoint(
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/roll_op.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/core/utils/array.h"
namespace paddle {
namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
template <typename T, size_t Rank>
__global__ void RollCudaKernel(const T* input, T* output, int64_t N,
phi::Array<int64_t, Rank> shifts,
phi::Array<int64_t, Rank> strides,
phi::Array<int64_t, Rank> sizes) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N) {
return;
}
int64_t output_idx = idx;
int64_t new_dim_idx = 0;
#pragma unroll
for (size_t i = 0; i < Rank; i++) {
new_dim_idx = (idx / strides[i]) % sizes[i] + shifts[i];
if (new_dim_idx >= sizes[i]) {
output_idx += (shifts[i] - sizes[i]) * strides[i];
} else {
output_idx += shifts[i] * strides[i];
}
}
output[output_idx] = input[idx];
}
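The kernel above remaps each flat input index to its rolled position one axis at a time. The same remapping for a single axis of a contiguous tensor can be sketched on the host as follows (hypothetical helper mirroring the index arithmetic in RollCudaKernel, not the Paddle API):
#include <cstdint>
#include <vector>

// Roll a contiguous tensor by `shift` along one axis, given that axis's stride and size.
std::vector<float> RollOneAxis(const std::vector<float>& in, int64_t stride,
                               int64_t size, int64_t shift) {
  std::vector<float> out(in.size());
  shift = (shift % size + size) % size;  // normalize, as the op does before launch
  for (int64_t idx = 0; idx < static_cast<int64_t>(in.size()); ++idx) {
    const int64_t dim_idx = (idx / stride) % size;
    const int64_t new_dim_idx = dim_idx + shift;
    const int64_t out_idx =
        idx + (new_dim_idx >= size ? shift - size : shift) * stride;
    out[out_idx] = in[idx];  // element moves forward by `shift` along the axis
  }
  return out;
}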
template <typename T>
class RollKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<LoDTensor>("X");
auto* out = context.Output<LoDTensor>("Out");
std::vector<int64_t> shifts = context.Attr<std::vector<int64_t>>("shifts");
if (context.HasInput("ShiftsTensor")) {
const auto* shifts_tensor =
context.Input<framework::Tensor>("ShiftsTensor");
PADDLE_ENFORCE_EQ(
shifts_tensor->dims().size(), 1,
platform::errors::InvalidArgument(
"The rank of ShiftsTensor is expected to be 1, got %s",
shifts_tensor->dims().size()));
shifts = GetDataFromTensor<int64_t>(shifts_tensor);
}
std::vector<int64_t> dims = context.Attr<std::vector<int64_t>>("axis");
auto* in_data = in->data<T>();
auto* out_data = out->mutable_data<T>(context.GetPlace());
int64_t numel = in->numel();
auto stream =
context.template device_context<platform::CUDADeviceContext>().stream();
size_t nums = shifts.size();
auto input_dim = in->dims();
auto stride_dim = phi::stride(input_dim);
std::vector<int64_t> strides(nums), sizes(nums);
if (dims.size() == 0) {
strides[0] = 1;
sizes[0] = numel;
shifts[0] = (shifts[0] % numel + numel) % numel;
} else {
for (size_t i = 0; i < nums; i++) {
int dim = dims[i] >= 0 ? dims[i] : dims[i] + input_dim.size();
int64_t size = input_dim[dim];
if (size != 0) {
shifts[i] = (shifts[i] % size + size) % size;
strides[i] = stride_dim[dim];
sizes[i] = size;
}
}
}
#define CALL_ROLL_CUDA_KERNEL(N) \
case N: { \
phi::Array<int64_t, N> _strides; \
phi::Array<int64_t, N> _shifts; \
phi::Array<int64_t, N> _sizes; \
for (size_t idx = 0; idx < N; ++idx) { \
_strides[idx] = strides[idx]; \
_shifts[idx] = shifts[idx]; \
_sizes[idx] = sizes[idx]; \
} \
RollCudaKernel< \
T, \
N><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, \
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, numel, \
_shifts, _strides, _sizes); \
break; \
}
switch (nums) {
CALL_ROLL_CUDA_KERNEL(1);
CALL_ROLL_CUDA_KERNEL(2);
CALL_ROLL_CUDA_KERNEL(3);
CALL_ROLL_CUDA_KERNEL(4);
CALL_ROLL_CUDA_KERNEL(5);
CALL_ROLL_CUDA_KERNEL(6);
CALL_ROLL_CUDA_KERNEL(7);
CALL_ROLL_CUDA_KERNEL(8);
CALL_ROLL_CUDA_KERNEL(9);
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"shifts.size() should be less than 10, But received shifts.size() "
"= %d",
shifts.size()));
}
}
};
template <typename T>
class RollGradKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<LoDTensor>(framework::GradVarName("Out"));
auto* out = context.Output<LoDTensor>(framework::GradVarName("X"));
std::vector<int64_t> shifts = context.Attr<std::vector<int64_t>>("shifts");
if (context.HasInput("ShiftsTensor")) {
const auto* shifts_tensor =
context.Input<framework::Tensor>("ShiftsTensor");
PADDLE_ENFORCE_EQ(
shifts_tensor->dims().size(), 1,
platform::errors::InvalidArgument(
"The rank of ShiftsTensor is expected to be 1, got %s",
shifts_tensor->dims().size()));
shifts = GetDataFromTensor<int64_t>(shifts_tensor);
}
std::vector<int64_t> dims = context.Attr<std::vector<int64_t>>("axis");
auto* in_data = in->data<T>();
auto* out_data = out->mutable_data<T>(context.GetPlace());
int64_t numel = in->numel();
auto stream =
context.template device_context<platform::CUDADeviceContext>().stream();
size_t nums = shifts.size();
auto input_dim = in->dims();
auto stride_dim = phi::stride(input_dim);
std::vector<int64_t> strides(nums), sizes(nums);
if (dims.size() == 0) {
strides[0] = 1;
sizes[0] = numel;
shifts[0] = ((-shifts[0]) % numel + numel) % numel;
} else {
for (size_t i = 0; i < nums; i++) {
int dim = dims[i] >= 0 ? dims[i] : dims[i] + input_dim.size();
int64_t size = input_dim[dim];
if (size != 0) {
shifts[i] = ((-shifts[i]) % size + size) % size;
strides[i] = stride_dim[dim];
sizes[i] = size;
}
}
}
switch (nums) {
CALL_ROLL_CUDA_KERNEL(1);
CALL_ROLL_CUDA_KERNEL(2);
CALL_ROLL_CUDA_KERNEL(3);
CALL_ROLL_CUDA_KERNEL(4);
CALL_ROLL_CUDA_KERNEL(5);
CALL_ROLL_CUDA_KERNEL(6);
CALL_ROLL_CUDA_KERNEL(7);
CALL_ROLL_CUDA_KERNEL(8);
CALL_ROLL_CUDA_KERNEL(9);
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"shifts.size() should be less than 10, But received shifts.size() "
"= %d",
shifts.size()));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
roll, ops::RollKernel<paddle::platform::CUDADeviceContext, float>,
ops::RollKernel<paddle::platform::CUDADeviceContext, double>,
ops::RollKernel<paddle::platform::CUDADeviceContext, int>,
ops::RollKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::RollKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::RollKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_CUDA_KERNEL(
roll_grad, ops::RollGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::RollGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::RollGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::RollGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::RollGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::RollGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/tril_triu_op.h"
#include <memory> #include <memory>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -104,19 +104,3 @@ REGISTER_OPERATOR(tril_triu, ops::TrilTriuOp, ops::TrilTriuOpMaker, ...@@ -104,19 +104,3 @@ REGISTER_OPERATOR(tril_triu, ops::TrilTriuOp, ops::TrilTriuOpMaker,
ops::TrilTriuGradOpMaker<paddle::framework::OpDesc>, ops::TrilTriuGradOpMaker<paddle::framework::OpDesc>,
ops::TrilTriuGradOpMaker<paddle::imperative::OpBase>); ops::TrilTriuGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(tril_triu_grad, ops::TrilTriuGradOp); REGISTER_OPERATOR(tril_triu_grad, ops::TrilTriuGradOp);
REGISTER_OP_CPU_KERNEL(
tril_triu, ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, bool>,
ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, plat::float16>);
REGISTER_OP_CPU_KERNEL(
tril_triu_grad,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, bool>,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext,
plat::float16>);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/for_range.h"
namespace paddle {
namespace operators {
template <typename T>
class TrilTriuCompute {
public:
HOSTDEVICE TrilTriuCompute(const T* in, const int diagonal, const bool lower,
const int64_t H, const int64_t W, T* out)
: in_(in), diagonal_(diagonal), lower_(lower), H_(H), W_(W), out_(out) {}
HOSTDEVICE void operator()(int64_t idx) {
const int64_t row = (idx / W_) % H_;
const int64_t col = idx % W_;
const bool mask =
lower_ ? (col - row > diagonal_) : (col - row < diagonal_);
out_[idx] = mask ? static_cast<T>(0) : in_[idx];
}
private:
const T* in_;
const int diagonal_;
const bool lower_;
const int64_t H_;
const int64_t W_;
T* out_;
};
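TrilTriuCompute keeps an element when it lies on the wanted side of the chosen diagonal: element (row, col) is zeroed when lower && col - row > diagonal, or when !lower && col - row < diagonal. A minimal host-side sketch of the same mask over a row-major H x W matrix (illustrative only):
#include <cstdint>

// Apply the tril/triu mask, mirroring TrilTriuCompute's operator().
void TrilTriu(const float* in, int diagonal, bool lower, int64_t H, int64_t W,
              float* out) {
  for (int64_t idx = 0; idx < H * W; ++idx) {
    const int64_t row = (idx / W) % H;
    const int64_t col = idx % W;
    const bool mask = lower ? (col - row > diagonal) : (col - row < diagonal);
    out[idx] = mask ? 0.0f : in[idx];
  }
}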
template <typename DeviceContext, typename T>
class TrilTriuOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const auto* x = context.Input<framework::Tensor>("X");
const auto* x_data = x->data<T>();
auto* out = context.Output<framework::Tensor>("Out");
auto* out_data = out->mutable_data<T>(context.GetPlace());
const int diagonal = context.Attr<int>("diagonal");
const bool lower = context.Attr<bool>("lower");
const auto& dims = x->dims();
const auto H = dims[dims.size() - 2];
const auto W = dims[dims.size() - 1];
platform::ForRange<DeviceContext> for_range(
context.template device_context<DeviceContext>(),
static_cast<size_t>(x->numel()));
paddle::operators::TrilTriuCompute<T> tril_triu_computer(
x_data, diagonal, lower, H, W, out_data);
for_range(tril_triu_computer);
}
};
template <typename DeviceContext, typename T>
class TrilTriuGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const auto* d_out =
context.Input<framework::Tensor>(framework::GradVarName("Out"));
const auto* dout_data = d_out->data<T>();
auto* d_x = context.Output<framework::Tensor>(framework::GradVarName("X"));
auto* dx_data = d_x->mutable_data<T>(context.GetPlace());
const int diagonal = context.Attr<int>("diagonal");
const bool lower = context.Attr<bool>("lower");
const auto& dims = d_out->dims();
const auto H = dims[dims.size() - 2];
const auto W = dims[dims.size() - 1];
platform::ForRange<DeviceContext> for_range(
context.template device_context<DeviceContext>(),
static_cast<size_t>(d_out->numel()));
paddle::operators::TrilTriuCompute<T> tril_triu_grad_computer(
dout_data, diagonal, lower, H, W, dx_data);
for_range(tril_triu_grad_computer);
}
};
} // namespace operators
} // namespace paddle
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/tril_triu_op.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle { namespace paddle {
......
...@@ -11,7 +11,7 @@ limitations under the License. */ ...@@ -11,7 +11,7 @@ limitations under the License. */
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/tril_triu_op.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/device_wrapper.h"
namespace paddle { namespace paddle {
......
...@@ -143,6 +143,7 @@ void BindNode(py::module *m) { ...@@ -143,6 +143,7 @@ void BindNode(py::module *m) {
.def("var", &Node::Var, return_value_policy::reference) .def("var", &Node::Var, return_value_policy::reference)
.def("op", &Node::Op, return_value_policy::reference) .def("op", &Node::Op, return_value_policy::reference)
.def("id", &Node::id) .def("id", &Node::id)
.def("graph_id", &Node::GraphId)
.def("original_desc_id", &Node::OriginalDescId) .def("original_desc_id", &Node::OriginalDescId)
.def("is_op", &Node::IsOp) .def("is_op", &Node::IsOp)
.def("is_var", &Node::IsVar) .def("is_var", &Node::IsVar)
......
...@@ -114,6 +114,7 @@ limitations under the License. */ ...@@ -114,6 +114,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/metrics_py.h" #include "paddle/fluid/pybind/metrics_py.h"
#include "paddle/fluid/pybind/ps_gpu_wrapper_py.h" #include "paddle/fluid/pybind/ps_gpu_wrapper_py.h"
#include "paddle/fluid/pybind/pybind_boost_headers.h" #include "paddle/fluid/pybind/pybind_boost_headers.h"
#include "paddle/phi/backends/device_manager.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/pybind/nccl_wrapper_py.h" #include "paddle/fluid/pybind/nccl_wrapper_py.h"
...@@ -742,6 +743,11 @@ PYBIND11_MODULE(core_noavx, m) { ...@@ -742,6 +743,11 @@ PYBIND11_MODULE(core_noavx, m) {
// stored in this static instance to avoid illegal memory access. // stored in this static instance to avoid illegal memory access.
m.def("clear_kernel_factory", m.def("clear_kernel_factory",
[]() { phi::KernelFactory::Instance().kernels().clear(); }); []() { phi::KernelFactory::Instance().kernels().clear(); });
m.def("clear_device_manager", []() {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
phi::DeviceManager::Clear();
#endif
});
// NOTE(zjl): ctest would load environment variables at the beginning even // NOTE(zjl): ctest would load environment variables at the beginning even
// though we have not `import paddle.fluid as fluid`. So we add this API // though we have not `import paddle.fluid as fluid`. So we add this API
......
...@@ -134,6 +134,10 @@ mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const { ...@@ -134,6 +134,10 @@ mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const {
return DenseTensorType::get( return DenseTensorType::get(
parser.getContext(), *targetType, *precisionType, *layoutType); parser.getContext(), *targetType, *precisionType, *layoutType);
} }
if (keyword == "dense_tensor_map") {
return DenseTensorMapType::get(parser.getContext());
}
// Todo: parse other type // Todo: parse other type
return mlir::Type(); return mlir::Type();
} }
...@@ -156,7 +160,7 @@ void InfrtDialect::printType(::mlir::Type type, ...@@ -156,7 +160,7 @@ void InfrtDialect::printType(::mlir::Type type,
} }
// print DenseTensorType, for example: !infrt.dense_tensor<CPU, FP32, NCHW> // print DenseTensorType, for example: !infrt.dense_tensor<CPU, FP32, NCHW>
if (type.isa<infrt::DenseTensorType>()) { if (type.isa<DenseTensorType>()) {
auto dense_tensor_type = type.cast<infrt::DenseTensorType>(); auto dense_tensor_type = type.cast<infrt::DenseTensorType>();
os << "dense_tensor<" << dense_tensor_type.getTarget() << ", " os << "dense_tensor<" << dense_tensor_type.getTarget() << ", "
<< dense_tensor_type.getPrecision() << ", " << dense_tensor_type.getPrecision() << ", "
...@@ -164,6 +168,12 @@ void InfrtDialect::printType(::mlir::Type type, ...@@ -164,6 +168,12 @@ void InfrtDialect::printType(::mlir::Type type,
return; return;
} }
  // print DenseTensorMapType, for example: !infrt.dense_tensor_map
if (type.isa<DenseTensorMapType>()) {
os << "dense_tensor_map";
return;
}
llvm_unreachable("unknown infrt type."); llvm_unreachable("unknown infrt type.");
} }
......
...@@ -18,7 +18,7 @@ ...@@ -18,7 +18,7 @@
#include "paddle/infrt/common/global.h" #include "paddle/infrt/common/global.h"
#include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h" #include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h"
#include "paddle/infrt/dialect/mlir_loader.h" #include "paddle/infrt/dialect/mlir_loader.h"
#include "paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h" #include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h"
int main(int argc, char** argv) { int main(int argc, char** argv) {
static llvm::cl::opt<std::string> input_file( static llvm::cl::opt<std::string> input_file(
......
...@@ -393,6 +393,11 @@ DeviceManager& DeviceManager::Instance() { ...@@ -393,6 +393,11 @@ DeviceManager& DeviceManager::Instance() {
return platform_manager; return platform_manager;
} }
void DeviceManager::Clear() {
Instance().device_map_.clear();
Instance().device_impl_map_.clear();
}
std::vector<std::string> ListAllLibraries(const std::string& library_dir) { std::vector<std::string> ListAllLibraries(const std::string& library_dir) {
std::vector<std::string> libraries; std::vector<std::string> libraries;
std::regex express(".*\\.so"); std::regex express(".*\\.so");
......
...@@ -158,6 +158,8 @@ class DeviceManager { ...@@ -158,6 +158,8 @@ class DeviceManager {
static std::vector<size_t> GetDeviceList(const std::string& device_type); static std::vector<size_t> GetDeviceList(const std::string& device_type);
static void Clear();
private: private:
DISABLE_COPY_AND_ASSIGN(DeviceManager); DISABLE_COPY_AND_ASSIGN(DeviceManager);
DeviceManager() {} DeviceManager() {}
......
...@@ -24,6 +24,10 @@ limitations under the License. */ ...@@ -24,6 +24,10 @@ limitations under the License. */
namespace phi { namespace phi {
// Common InferMeta Functions for backward operators.
//
// NOTE: The InferMeta Functions in this file are arranged in alphabetic order.
void BilinearTensorProductGradInferMeta(const MetaTensor& x, void BilinearTensorProductGradInferMeta(const MetaTensor& x,
const MetaTensor& y, const MetaTensor& y,
const MetaTensor& weight, const MetaTensor& weight,
......
...@@ -73,6 +73,51 @@ void AllValueCompareInferMeta(const MetaTensor& x, ...@@ -73,6 +73,51 @@ void AllValueCompareInferMeta(const MetaTensor& x,
out->set_dtype(DataType::BOOL); out->set_dtype(DataType::BOOL);
} }
void KLDivInferMeta(const MetaTensor& x,
const MetaTensor& label,
const std::string& reduction,
MetaTensor* out,
MetaConfig config) {
auto dim_x = x.dims();
auto dim_target = label.dims();
PADDLE_ENFORCE_EQ(dim_x.size(),
dim_target.size(),
phi::errors::InvalidArgument(
"Input(X) rank and Input(Target) rank should be "
"same, but received X rank(%d) != Target rank(%d)",
dim_x.size(),
dim_target.size()));
for (int i = 0; i < dim_x.size(); i++) {
if (config.is_runtime || (dim_x[i] > 0 && dim_target[i] > 0)) {
PADDLE_ENFORCE_EQ(
dim_x[i],
dim_target[i],
phi::errors::InvalidArgument(
"Input(X) and Input(Target) should in same shape. but received "
"X dimension[%d](%d) != Target dimension[%d](%d)",
i,
dim_x[i],
i,
dim_target[i]));
}
}
auto reduction_valid = "mean" == reduction || "sum" == reduction ||
"batchmean" == reduction || "none" == reduction;
PADDLE_ENFORCE_EQ(
reduction_valid,
true,
phi::errors::InvalidArgument(
"Attr(reduction) can only be 'none'|'batchmean'|'sum'|'mean'."));
if ("none" == reduction) {
out->set_dims(dim_x);
} else {
out->set_dims({1});
}
out->set_dtype(x.dtype());
}
void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) {
out->share_meta(x); out->share_meta(x);
} }
...@@ -431,6 +476,55 @@ void ElementwiseRawInferMeta(const MetaTensor& x, ...@@ -431,6 +476,55 @@ void ElementwiseRawInferMeta(const MetaTensor& x,
out->share_lod(x); out->share_lod(x);
} }
void GatherInferMeta(const MetaTensor& x,
const MetaTensor& index,
const Scalar& axis,
MetaTensor* out) {
auto index_dims = index.dims();
if (index_dims.size() == 2) {
PADDLE_ENFORCE_EQ(
index_dims[1],
1,
phi::errors::InvalidArgument(
"The last dim of index should be 1 when it is 2D, but we get %d",
index_dims[1]));
} else {
PADDLE_ENFORCE_EQ(
index_dims.size(),
1,
phi::errors::InvalidArgument(
"The index should be 1D, when it is not 2D, but we get %d",
index_dims.size()));
}
auto input_dim = x.dims();
auto axis_v = axis.to<int>();
if (axis.FromTensor() || axis_v == 0) {
    // if axis.FromTensor(), we cannot obtain the correct output shape here
int batch_size = index_dims[0];
phi::DDim output_dims(input_dim);
output_dims[0] = batch_size;
out->set_dims(output_dims);
out->set_dtype(x.dtype());
out->share_lod(x);
} else {
int index_size = index_dims[0];
std::vector<int> out_dim_vec;
for (int i = 0; i < axis_v; i++) {
out_dim_vec.push_back(input_dim[i]);
}
out_dim_vec.push_back(index_size);
for (int i = axis_v + 1; i < input_dim.size(); i++) {
out_dim_vec.push_back(input_dim[i]);
}
auto output_dims = phi::make_ddim(out_dim_vec);
out->set_dims(output_dims);
out->set_dtype(x.dtype());
out->share_lod(x);
}
}
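As a worked example of GatherInferMeta: with x of shape [5, 7, 9], a 1-D index of length 3 and axis = 1, the output shape is [5, 3, 9]; with axis = 0 (or an axis coming from a tensor) the leading dimension is simply replaced by index_dims[0]. A small sketch of the axis != 0 branch with std types only (hypothetical helper):
#include <cassert>
#include <cstdint>
#include <vector>

// Output dims of gather along `axis` for a 1-D index (the axis != 0 branch above).
std::vector<int64_t> GatherOutDims(const std::vector<int64_t>& x_dims,
                                   int64_t index_size, int axis) {
  std::vector<int64_t> out;
  for (int i = 0; i < axis; ++i) out.push_back(x_dims[i]);
  out.push_back(index_size);
  for (int i = axis + 1; i < static_cast<int>(x_dims.size()); ++i)
    out.push_back(x_dims[i]);
  return out;
}

int main() {
  assert((GatherOutDims({5, 7, 9}, 3, 1) == std::vector<int64_t>{5, 3, 9}));
  return 0;
}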
void GatherNdInferMeta(const MetaTensor& x, void GatherNdInferMeta(const MetaTensor& x,
const MetaTensor& index, const MetaTensor& index,
MetaTensor* out) { MetaTensor* out) {
...@@ -549,6 +643,49 @@ void IndexSampleInferMeta(const MetaTensor& x, ...@@ -549,6 +643,49 @@ void IndexSampleInferMeta(const MetaTensor& x,
out->share_lod(y); out->share_lod(y);
} }
void IndexSelectInferMeta(const MetaTensor& x,
const MetaTensor& index,
int dim,
MetaTensor* output) {
auto input_dim = x.dims();
auto index_dim = index.dims();
PADDLE_ENFORCE_EQ(
dim < input_dim.size() && dim >= (0 - input_dim.size()),
true,
phi::errors::OutOfRange(
"Attr(dim) is out of range, It's expected "
"to be in range of [-%d, %d]. But received Attr(dim) = %d.",
input_dim.size(),
input_dim.size() - 1,
dim));
PADDLE_ENFORCE_EQ(
index_dim.size() == 1 || (index_dim.size() == 2 && index_dim[1] == 1),
true,
phi::errors::InvalidArgument(
"The 'shape' of Input(Index) must be 1-D tensor. "
"But received: the 'shape' of Input(Index) is [%s], "
"the dimension of Input(Index) is [%d].",
index_dim,
index_dim.size()));
PADDLE_ENFORCE_EQ(
index_dim[0] != 0,
true,
phi::errors::InvalidArgument("The length of Input(Index) can't be 0."));
auto output_dim = phi::vectorize(input_dim);
if (dim < 0) {
dim += input_dim.size();
}
output_dim[dim] = index_dim[0];
output->set_dims(phi::make_ddim(output_dim));
output->set_dtype(x.dtype());
output->set_layout(x.layout());
output->share_lod(x);
}
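// Shape sketch: x of shape [5, 6] with a length-3 index and dim == 1 (or the
// equivalent dim == -1 after wrapping) produces output dims [5, 3]; dtype,
// layout and lod are all inherited from x.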
void LogLossInferMeta(const MetaTensor& input, void LogLossInferMeta(const MetaTensor& input,
const MetaTensor& label, const MetaTensor& label,
float epsilon, float epsilon,
...@@ -813,6 +950,16 @@ void TriangularSolveInferMeta(const MetaTensor& x, ...@@ -813,6 +950,16 @@ void TriangularSolveInferMeta(const MetaTensor& x,
out->share_lod(y); out->share_lod(y);
} }
void ValueCompareInferMeta(const MetaTensor& x,
const MetaTensor& y,
MetaTensor* out,
MetaConfig config) {
detail::BinarySameInputDimsCheck(x, y, config);
out->set_dims(x.dims());
out->set_dtype(DataType::BOOL);
}
} // namespace phi } // namespace phi
PD_REGISTER_INFER_META_FN(add_raw, phi::ElementwiseRawInferMeta); PD_REGISTER_INFER_META_FN(add_raw, phi::ElementwiseRawInferMeta);
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/meta_tensor.h" #include "paddle/phi/core/meta_tensor.h"
namespace phi { namespace phi {
...@@ -28,12 +29,20 @@ namespace phi { ...@@ -28,12 +29,20 @@ namespace phi {
// NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good. // NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good.
// Because functions in this file not only can infer shape, but also need // Because functions in this file not only can infer shape, but also need
// infer lod or other useful data. // infer lod or other useful data.
//
// The InferMeta Functions in this file are arranged in alphabetic order.
void AllValueCompareInferMeta(const MetaTensor& x, void AllValueCompareInferMeta(const MetaTensor& x,
const MetaTensor& y, const MetaTensor& y,
MetaTensor* out, MetaTensor* out,
MetaConfig config = MetaConfig()); MetaConfig config = MetaConfig());
void KLDivInferMeta(const MetaTensor& x,
const MetaTensor& label,
const std::string& reduction,
MetaTensor* out,
MetaConfig config = MetaConfig());
void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out); void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out);
void BCELossInferMeta(const MetaTensor& input, void BCELossInferMeta(const MetaTensor& input,
...@@ -81,6 +90,11 @@ void ElementwiseRawInferMeta(const MetaTensor& x_meta, ...@@ -81,6 +90,11 @@ void ElementwiseRawInferMeta(const MetaTensor& x_meta,
int axis, int axis,
MetaTensor* out); MetaTensor* out);
void GatherInferMeta(const MetaTensor& x,
const MetaTensor& index,
const Scalar& axis,
MetaTensor* out);
void GatherNdInferMeta(const MetaTensor& x, void GatherNdInferMeta(const MetaTensor& x,
const MetaTensor& index, const MetaTensor& index,
MetaTensor* out); MetaTensor* out);
...@@ -101,6 +115,11 @@ void IndexSampleInferMeta(const MetaTensor& x, ...@@ -101,6 +115,11 @@ void IndexSampleInferMeta(const MetaTensor& x,
MetaTensor* out, MetaTensor* out,
MetaConfig config = MetaConfig()); MetaConfig config = MetaConfig());
void IndexSelectInferMeta(const MetaTensor& x,
const MetaTensor& index,
int dim,
MetaTensor* output);
void LogLossInferMeta(const MetaTensor& input, void LogLossInferMeta(const MetaTensor& input,
const MetaTensor& label, const MetaTensor& label,
float epsilon, float epsilon,
...@@ -136,4 +155,9 @@ void TriangularSolveInferMeta(const MetaTensor& x, ...@@ -136,4 +155,9 @@ void TriangularSolveInferMeta(const MetaTensor& x,
bool unitriangular, bool unitriangular,
MetaTensor* out); MetaTensor* out);
void ValueCompareInferMeta(const MetaTensor& x,
const MetaTensor& y,
MetaTensor* out,
MetaConfig config = MetaConfig());
} // namespace phi } // namespace phi
...@@ -18,6 +18,23 @@ limitations under the License. */ ...@@ -18,6 +18,23 @@ limitations under the License. */
#include "paddle/phi/core/meta_tensor.h" #include "paddle/phi/core/meta_tensor.h"
namespace phi { namespace phi {
// Common InferMeta Functions for multiary operators. The formats are like:
//
// 1. The number of input MetaTensor is more than 3:
// void [FunctionDesc|OpName]InferMeta(const MetaTensor& x,
// const MetaTensor& y,
// const MetaTensor& z,
// const MetaTensor& w,
// ...,
// MetaTensor* out) {}
//
// 2. There are `const vector<MetaTensor*>&` in params:
// void [FunctionDesc|OpName]InferMeta(const vector<MetaTensor*>& x,
// ...,
// MetaTensor* out) {}
//
// NOTE: The InferMeta Functions in this file are arranged in alphabetic order.
std::vector<DDim> GetMetaTensorsDim(const std::vector<MetaTensor*>& tensors); std::vector<DDim> GetMetaTensorsDim(const std::vector<MetaTensor*>& tensors);
void AdadeltaInferMeta(const MetaTensor& param, void AdadeltaInferMeta(const MetaTensor& param,
......
...@@ -27,6 +27,8 @@ namespace phi { ...@@ -27,6 +27,8 @@ namespace phi {
// NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good. // NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good.
// Because functions in this file not only can infer shape, but also need // Because functions in this file not only can infer shape, but also need
// infer lod or other useful data. // infer lod or other useful data.
//
// The InferMeta Functions in this file are arranged in alphabetic order.
void CreateInferMeta(const ScalarArray& shape, DataType dtype, MetaTensor* out); void CreateInferMeta(const ScalarArray& shape, DataType dtype, MetaTensor* out);
......
...@@ -322,6 +322,83 @@ void NllLossRawInferMeta(const MetaTensor& input, ...@@ -322,6 +322,83 @@ void NllLossRawInferMeta(const MetaTensor& input,
total_weight->set_dtype(input.dtype()); total_weight->set_dtype(input.dtype());
} }
void RoiAlignInferMeta(const MetaTensor& x,
const MetaTensor& boxes,
paddle::optional<const MetaTensor&> boxes_num,
int pooled_height,
int pooled_width,
float spatial_scale,
int sampling_ratio,
bool aligned,
MetaTensor* out,
MetaConfig config) {
auto input_dims = x.dims();
auto boxes_dims = boxes.dims();
if (boxes_num) {
auto boxes_num_dims = boxes_num->dims();
PADDLE_ENFORCE_EQ(
boxes_num_dims.size(),
1,
phi::errors::InvalidArgument("The size of RoisNum should be 1"
", but received size = %d",
boxes_num_dims.size()));
}
PADDLE_ENFORCE_EQ(input_dims.size(),
4,
phi::errors::InvalidArgument(
"The format of Input(X) in"
"RoIAlignOp is NCHW. And the rank of input must be 4. "
"But received rank = %d",
input_dims.size()));
PADDLE_ENFORCE_EQ(boxes_dims.size(),
2,
phi::errors::InvalidArgument("The rank of Input(ROIs) "
"in RoIAlignOp should be 2. "
"But the rank of RoIs is %d",
boxes_dims.size()));
if (config.is_runtime) {
PADDLE_ENFORCE_EQ(boxes_dims[1],
4,
phi::errors::InvalidArgument(
"The second dimension "
"of Input(ROIs) should be 4. But received the "
"dimension = %d",
boxes_dims[1]));
}
PADDLE_ENFORCE_GT(pooled_height,
0,
phi::errors::InvalidArgument(
"The 'pooled_height' attribute in RoIAlignOp is "
"invalid. The height must be greater than 0. But "
"received 'pooled_height' = %d",
pooled_height));
PADDLE_ENFORCE_GT(pooled_width,
0,
phi::errors::InvalidArgument(
"The 'pooled_width' attribute in RoIAlignOp is "
"invalid. The width must be greater than 0. But "
"received 'pooled_width' = %d",
pooled_width));
PADDLE_ENFORCE_GT(spatial_scale,
0.0f,
phi::errors::InvalidArgument(
"The 'spatial_scale' attribute in RoIAlignOp is "
"invalid. The scale must be greater than 0. But "
"received 'spatial_scale' = %f",
spatial_scale));
auto out_dims = input_dims;
out_dims[0] = boxes_dims[0];
out_dims[1] = input_dims[1];
out_dims[2] = pooled_height;
out_dims[3] = pooled_width;
out->set_dims(out_dims);
out->set_dtype(x.dtype());
}
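// Shape sketch: x of shape [N, C, H, W] with boxes of shape [num_rois, 4]
// produces out [num_rois, C, pooled_height, pooled_width], keeping x's dtype.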
void ScatterInferMeta(const MetaTensor& x, void ScatterInferMeta(const MetaTensor& x,
const MetaTensor& index, const MetaTensor& index,
const MetaTensor& updates, const MetaTensor& updates,
......
...@@ -30,6 +30,8 @@ namespace phi { ...@@ -30,6 +30,8 @@ namespace phi {
// Because functions in this file not only can infer shape, but also need // Because functions in this file not only can infer shape, but also need
// infer lod or other useful data. // infer lod or other useful data.
// //
// The InferMeta Functions in this file are arranged in alphabetic order.
void AccuracyInferMeta(const MetaTensor& out, void AccuracyInferMeta(const MetaTensor& out,
const MetaTensor& indice, const MetaTensor& indice,
const MetaTensor& label, const MetaTensor& label,
...@@ -71,6 +73,17 @@ void NllLossRawInferMeta(const MetaTensor& input, ...@@ -71,6 +73,17 @@ void NllLossRawInferMeta(const MetaTensor& input,
MetaTensor* total_weight, MetaTensor* total_weight,
MetaConfig config = MetaConfig()); MetaConfig config = MetaConfig());
void RoiAlignInferMeta(const MetaTensor& x,
const MetaTensor& boxes,
paddle::optional<const MetaTensor&> boxes_num,
int pooled_height,
int pooled_width,
float spatial_scale,
int sampling_ratio,
bool aligned,
MetaTensor* out,
MetaConfig config = MetaConfig());
void ScatterInferMeta(const MetaTensor& x, void ScatterInferMeta(const MetaTensor& x,
const MetaTensor& index, const MetaTensor& index,
const MetaTensor& updates, const MetaTensor& updates,
......
...@@ -1016,6 +1016,37 @@ void ReshapeWithXShapeInferMeta(const MetaTensor& x, ...@@ -1016,6 +1016,37 @@ void ReshapeWithXShapeInferMeta(const MetaTensor& x,
ReshapeInferMeta(x, shape, out, config); ReshapeInferMeta(x, shape, out, config);
} }
void RollInferMeta(const MetaTensor& x,
const ScalarArray& shifts,
const std::vector<int64_t>& axis,
MetaTensor* out) {
auto shifts_data = shifts.GetData();
if (axis.size() != 0) {
PADDLE_ENFORCE_EQ(
axis.size(),
shifts_data.size(),
phi::errors::InvalidArgument("When dims.size() != 0, dims.size() "
"should be equal to "
"shifts.size(). But received "
"dims.size() = %d, shifts.size() = %d",
axis.size(),
shifts_data.size()));
} else {
PADDLE_ENFORCE_EQ(
shifts_data.size(),
1,
phi::errors::InvalidArgument("When dims.size() == 0, shifts.size() "
"should be equal to 1, But received "
"shifts.size() = %d",
shifts_data.size()));
}
out->set_dims(x.dims());
out->share_lod(x);
out->set_dtype(x.dtype());
}
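// Note that roll never changes the shape: out simply inherits x's dims, lod
// and dtype; the checks above only validate that shifts and axis sizes match
// (or that a single shift is given when axis is empty).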
void ShapeInferMeta(const MetaTensor& input, MetaTensor* out) { void ShapeInferMeta(const MetaTensor& input, MetaTensor* out) {
auto in_dim = input.dims(); auto in_dim = input.dims();
out->set_dims(phi::make_ddim({in_dim.size()})); out->set_dims(phi::make_ddim({in_dim.size()}));
......
...@@ -31,6 +31,8 @@ class MetaConfig; ...@@ -31,6 +31,8 @@ class MetaConfig;
// NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good. // NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good.
// Because functions in this file not only can infer shape, but also need // Because functions in this file not only can infer shape, but also need
// infer lod or other useful data. // infer lod or other useful data.
//
// The InferMeta Functions in this file are arranged in alphabetic order.
void ArgMinMaxInferMeta(const MetaTensor& x, void ArgMinMaxInferMeta(const MetaTensor& x,
int64_t axis, int64_t axis,
...@@ -164,6 +166,11 @@ void ReshapeWithXShapeInferMeta(const MetaTensor& x, ...@@ -164,6 +166,11 @@ void ReshapeWithXShapeInferMeta(const MetaTensor& x,
MetaTensor* out, MetaTensor* out,
MetaConfig config = MetaConfig()); MetaConfig config = MetaConfig());
void RollInferMeta(const MetaTensor& x,
const ScalarArray& shifts,
const std::vector<int64_t>& axis,
MetaTensor* out);
void ShapeInferMeta(const MetaTensor& input, MetaTensor* out); void ShapeInferMeta(const MetaTensor& input, MetaTensor* out);
void ShardIndexInferMeta(const MetaTensor& in, void ShardIndexInferMeta(const MetaTensor& in,
......
...@@ -26,6 +26,23 @@ namespace phi { ...@@ -26,6 +26,23 @@ namespace phi {
const DenseTensor& dout, \ const DenseTensor& dout, \
DenseTensor* dx); DenseTensor* dx);
#define DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(name, attr) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& dout, \
float attr, \
DenseTensor* dx);
#define DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(name, attr1, attr2) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& dout, \
float attr1, \
float attr2, \
DenseTensor* dx);
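// For reference, DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu, alpha)
// expands to:
//   template <typename T, typename Context>
//   void LeakyReluGradKernel(const Context& dev_ctx,
//                            const DenseTensor& x,
//                            const DenseTensor& dout,
//                            float alpha,
//                            DenseTensor* dx);
// which replaces the hand-written LeakyReluGradKernel declaration below.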
#define DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(name) \ #define DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(name) \
template <typename T, typename Context> \ template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \ void name##GradKernel(const Context& dev_ctx, \
...@@ -33,6 +50,14 @@ namespace phi { ...@@ -33,6 +50,14 @@ namespace phi {
const DenseTensor& dout, \ const DenseTensor& dout, \
DenseTensor* dx); DenseTensor* dx);
#define DECLARE_ACTIVATION_GRAD_KERNEL_WITH_ONE_ATTRS_DepOut(name, attr) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& out, \
const DenseTensor& dout, \
float attr, \
DenseTensor* dx);
template <typename T, typename Context> template <typename T, typename Context>
void ReluDoubleGradKernel(const Context& dev_ctx, void ReluDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& out, const DenseTensor& out,
...@@ -59,34 +84,29 @@ void TanhTripleGradKernel(const Context& dev_ctx, ...@@ -59,34 +84,29 @@ void TanhTripleGradKernel(const Context& dev_ctx,
DenseTensor* d_ddx); DenseTensor* d_ddx);
template <typename T, typename Context> template <typename T, typename Context>
void BReluGradKernel(const Context& dev_ctx, void LeakyReluDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& x, const DenseTensor& x,
const DenseTensor& dout, const DenseTensor& ddx,
float t_min, float alpha,
float t_max, DenseTensor* ddout);
DenseTensor* dx);
template <typename T, typename Context> template <typename T, typename Context>
void LeakyReluGradKernel(const Context& dev_ctx, void EluGradKernel(const Context& dev_ctx,
const DenseTensor& x, const DenseTensor& x,
const DenseTensor& out,
const DenseTensor& dout, const DenseTensor& dout,
float alpha, float alpha,
DenseTensor* dx); DenseTensor* dx);
template <typename T, typename Context> template <typename T, typename Context>
void LeakyReluDoubleGradKernel(const Context& dev_ctx, void EluDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& x, const DenseTensor& x,
const DenseTensor& dout,
const DenseTensor& ddx, const DenseTensor& ddx,
float alpha, float alpha,
DenseTensor* dx,
DenseTensor* ddout); DenseTensor* ddout);
template <typename T, typename Context>
void ThresholdedReluGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& dout,
float threshold,
DenseTensor* dx);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cos); DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cos);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Tan); DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Tan);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acos); DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acos);
...@@ -98,7 +118,17 @@ DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cosh); ...@@ -98,7 +118,17 @@ DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cosh);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Asinh); DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Asinh);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acosh); DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acosh);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Atanh); DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Atanh);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(TanhShrink);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Silu);
DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Relu); DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Relu);
DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Tanh); DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Tanh);
DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu, alpha)
DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(ThresholdedRelu, threshold)
DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(SoftShrink, lambda)
DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(HardShrink, threshold)
DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(BRelu, t_min, t_max)
} // namespace phi } // namespace phi
...@@ -24,6 +24,21 @@ namespace phi { ...@@ -24,6 +24,21 @@ namespace phi {
void name##Kernel( \ void name##Kernel( \
const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); const Context& dev_ctx, const DenseTensor& x, DenseTensor* out);
#define DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(name, attr) \
template <typename T, typename Context> \
void name##Kernel(const Context& dev_ctx, \
const DenseTensor& x, \
float attr, \
DenseTensor* out);
#define DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(name, attr1, attr2) \
template <typename T, typename Context> \
void name##Kernel(const Context& dev_ctx, \
const DenseTensor& x, \
float attr1, \
float attr2, \
DenseTensor* out);
DECLARE_ACTIVATION_KERNEL(Cos) DECLARE_ACTIVATION_KERNEL(Cos)
DECLARE_ACTIVATION_KERNEL(Tan) DECLARE_ACTIVATION_KERNEL(Tan)
DECLARE_ACTIVATION_KERNEL(Acos) DECLARE_ACTIVATION_KERNEL(Acos)
...@@ -37,24 +52,15 @@ DECLARE_ACTIVATION_KERNEL(Acosh) ...@@ -37,24 +52,15 @@ DECLARE_ACTIVATION_KERNEL(Acosh)
DECLARE_ACTIVATION_KERNEL(Atanh) DECLARE_ACTIVATION_KERNEL(Atanh)
DECLARE_ACTIVATION_KERNEL(Relu) DECLARE_ACTIVATION_KERNEL(Relu)
DECLARE_ACTIVATION_KERNEL(Tanh) DECLARE_ACTIVATION_KERNEL(Tanh)
DECLARE_ACTIVATION_KERNEL(TanhShrink)
DECLARE_ACTIVATION_KERNEL(Silu)
template <typename T, typename Context> DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(LeakyRelu, alpha)
void BReluKernel(const Context& dev_ctx, DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, threshold)
const DenseTensor& x, DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(SoftShrink, lambda)
float t_min, DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(HardShrink, threshold)
float t_max, DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Elu, alpha)
DenseTensor* out);
template <typename T, typename Context> DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(BRelu, t_min, t_max)
void LeakyReluKernel(const Context& dev_ctx,
const DenseTensor& x,
float alpha,
DenseTensor* out);
template <typename T, typename Context>
void ThresholdedReluKernel(const Context& dev_ctx,
const DenseTensor& x,
float threshold,
DenseTensor* out);
} // namespace phi } // namespace phi
...@@ -21,18 +21,18 @@ limitations under the License. */ ...@@ -21,18 +21,18 @@ limitations under the License. */
namespace phi { namespace phi {
#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(name, functor_class) \ #define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(name, functor_class) \
template <typename T, typename Context> \ template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \ void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& x, \ const DenseTensor& x, \
const DenseTensor& dout, \ const DenseTensor& dout, \
DenseTensor* dx) { \ DenseTensor* dx) { \
functor_class<T> functor; \ funcs::functor_class<T> functor; \
ActivationGradImpl<T, Context, functor_class<T>>( \ ActivationGradImpl<T, Context, funcs::functor_class<T>>( \
dev_ctx, &x, nullptr, &dout, dx, functor); \ dev_ctx, &x, nullptr, &dout, dx, functor); \
} }
#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX( \ #define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX( \
name, functor_class, attr) \ name, functor_class, attr) \
template <typename T, typename Context> \ template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \ void name##GradKernel(const Context& dev_ctx, \
...@@ -40,14 +40,14 @@ namespace phi { ...@@ -40,14 +40,14 @@ namespace phi {
const DenseTensor& dout, \ const DenseTensor& dout, \
float attr, \ float attr, \
DenseTensor* dx) { \ DenseTensor* dx) { \
functor_class<T> functor; \ funcs::functor_class<T> functor; \
auto attrs = functor.GetAttrs(); \ auto attrs = functor.GetAttrs(); \
*(attrs[0].second) = attr; \ *(attrs[0].second) = attr; \
ActivationGradImpl<T, Context, functor_class<T>>( \ ActivationGradImpl<T, Context, funcs::functor_class<T>>( \
dev_ctx, &x, nullptr, &dout, dx, functor); \ dev_ctx, &x, nullptr, &dout, dx, functor); \
} }
#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX( \ #define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX( \
name, functor_class, attr1, attr2) \ name, functor_class, attr1, attr2) \
template <typename T, typename Context> \ template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \ void name##GradKernel(const Context& dev_ctx, \
...@@ -56,26 +56,26 @@ namespace phi { ...@@ -56,26 +56,26 @@ namespace phi {
float attr1, \ float attr1, \
float attr2, \ float attr2, \
DenseTensor* dx) { \ DenseTensor* dx) { \
functor_class<T> functor; \ funcs::functor_class<T> functor; \
auto attrs = functor.GetAttrs(); \ auto attrs = functor.GetAttrs(); \
*(attrs[0].second) = attr1; \ *(attrs[0].second) = attr1; \
*(attrs[1].second) = attr2; \ *(attrs[1].second) = attr2; \
ActivationGradImpl<T, Context, functor_class<T>>( \ ActivationGradImpl<T, Context, funcs::functor_class<T>>( \
dev_ctx, &x, nullptr, &dout, dx, functor); \ dev_ctx, &x, nullptr, &dout, dx, functor); \
} }
#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(name, functor_class) \ #define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(name, functor_class) \
template <typename T, typename Context> \ template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \ void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& out, \ const DenseTensor& out, \
const DenseTensor& dout, \ const DenseTensor& dout, \
DenseTensor* dx) { \ DenseTensor* dx) { \
functor_class<T> functor; \ funcs::functor_class<T> functor; \
ActivationGradImpl<T, Context, functor_class<T>>( \ ActivationGradImpl<T, Context, funcs::functor_class<T>>( \
dev_ctx, nullptr, &out, &dout, dx, functor); \ dev_ctx, nullptr, &out, &dout, dx, functor); \
} }
#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepOut( \ #define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT( \
name, functor_class, attr) \ name, functor_class, attr) \
template <typename T, typename Context> \ template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \ void name##GradKernel(const Context& dev_ctx, \
...@@ -83,39 +83,78 @@ namespace phi { ...@@ -83,39 +83,78 @@ namespace phi {
const DenseTensor& dout, \ const DenseTensor& dout, \
float attr, \ float attr, \
DenseTensor* dx) { \ DenseTensor* dx) { \
functor_class<T> functor; \ funcs::functor_class<T> functor; \
auto attrs = functor.GetAttrs(); \ auto attrs = functor.GetAttrs(); \
*(attrs[0].second) = attr; \ *(attrs[0].second) = attr; \
ActivationGradImpl<T, Context, functor_class<T>>( \ ActivationGradImpl<T, Context, funcs::functor_class<T>>( \
dev_ctx, nullptr, &out, &dout, dx, functor); \ dev_ctx, nullptr, &out, &dout, dx, functor); \
} }
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, funcs::CosGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Cos, CosGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, funcs::TanGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Tan, TanGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, funcs::AcosGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Acos, AcosGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, funcs::SinGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Sin, SinGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, funcs::AsinGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Asin, AsinGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, funcs::AtanGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Atan, AtanGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, funcs::SinhGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Sinh, SinhGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, funcs::CoshGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Cosh, CoshGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, funcs::AsinhGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Asinh, AsinhGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, funcs::AcoshGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Acosh, AcoshGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, funcs::AtanhGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, AtanhGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, TanhShrinkGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, funcs::ReluGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, SiluGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(Tanh, funcs::TanhGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, ReluGradFunctor);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu, DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, TanhGradFunctor);
funcs::LeakyReluGradFunctor,
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
LeakyReluGradFunctor,
alpha); alpha);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX( DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu,
ThresholdedRelu, funcs::ThresholdedReluGradFunctor, threshold); ThresholdedReluGradFunctor,
threshold);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(BRelu, DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink,
funcs::BReluGradFunctor, SoftShrinkGradFunctor,
lambda);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink,
HardShrinkGradFunctor,
threshold);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu,
BReluGradFunctor,
t_min, t_min,
t_max); t_max);
template <typename T, typename Context>
void EluGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& out,
const DenseTensor& dout,
float alpha,
DenseTensor* dx) {
dev_ctx.template Alloc<T>(dx);
auto x_flatten =
EigenVector<T>::Flatten(GET_DATA_SAFELY(&x, "Input", "X", "elu_grad"));
auto out_flatten = EigenVector<T>::Flatten(
GET_DATA_SAFELY(&out, "Input", "Out", "elu_grad"));
auto dout_flatten = EigenVector<T>::Flatten(
GET_DATA_SAFELY(&dout, "Input", "dOut", "elu_grad"));
auto dx_flatten =
EigenVector<T>::Flatten(GET_DATA_SAFELY(dx, "Output", "dX", "elu_grad"));
auto* place = dev_ctx.eigen_device();
if (alpha > 0) {
funcs::ELUGradFunctor<T> functor;
functor.alpha = alpha;
functor(*place, x_flatten, out_flatten, dout_flatten, dx_flatten);
} else {
funcs::ELUGradNegativeAlphaFunctor<T> functor;
functor.alpha = alpha;
functor(*place, x_flatten, out_flatten, dout_flatten, dx_flatten);
}
}
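// The branch on alpha above only switches which functor supplies the gradient
// formula (funcs::ELUGradFunctor for alpha > 0, funcs::ELUGradNegativeAlphaFunctor
// otherwise); both are applied to the same flattened Eigen views of x, out,
// dout and dx.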
} // namespace phi } // namespace phi
PD_REGISTER_KERNEL( PD_REGISTER_KERNEL(
...@@ -144,6 +183,11 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(brelu_grad, BReluGradKernel) ...@@ -144,6 +183,11 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(brelu_grad, BReluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_grad, LeakyReluGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_grad, LeakyReluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad, PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad,
ThresholdedReluGradKernel) ThresholdedReluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(soft_shrink_grad, SoftShrinkGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_shrink_grad, HardShrinkGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_shrink_grad, TanhShrinkGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(silu_grad, SiluGradKernel)
PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(relu_double_grad, PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(relu_double_grad,
ReluDoubleGradKernel) ReluDoubleGradKernel)
...@@ -151,6 +195,7 @@ PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(tanh_double_grad, ...@@ -151,6 +195,7 @@ PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(tanh_double_grad,
TanhDoubleGradKernel) TanhDoubleGradKernel)
PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(leaky_relu_double_grad, PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(leaky_relu_double_grad,
LeakyReluDoubleGradKernel) LeakyReluDoubleGradKernel)
PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(elu_double_grad, EluDoubleGradKernel)
PD_REGISTER_KERNEL(tanh_triple_grad, PD_REGISTER_KERNEL(tanh_triple_grad,
CPU, CPU,
......
...@@ -23,8 +23,9 @@ namespace phi { ...@@ -23,8 +23,9 @@ namespace phi {
template <typename T, typename Context> \ template <typename T, typename Context> \
void name##Kernel( \ void name##Kernel( \
const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { \ const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { \
functor_class functor; \ funcs::functor_class<T> functor; \
ActivationImpl<T, Context, functor_class>(dev_ctx, x, out, functor); \ ActivationImpl<T, Context, funcs::functor_class<T>>( \
dev_ctx, x, out, functor); \
} }
#define DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr) \ #define DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr) \
...@@ -33,10 +34,11 @@ namespace phi { ...@@ -33,10 +34,11 @@ namespace phi {
const DenseTensor& x, \ const DenseTensor& x, \
float attr, \ float attr, \
DenseTensor* out) { \ DenseTensor* out) { \
functor_class<T> functor; \ funcs::functor_class<T> functor; \
auto attrs = functor.GetAttrs(); \ auto attrs = functor.GetAttrs(); \
*(attrs[0].second) = attr; \ *(attrs[0].second) = attr; \
ActivationImpl<T, Context, functor_class<T>>(dev_ctx, x, out, functor); \ ActivationImpl<T, Context, funcs::functor_class<T>>( \
dev_ctx, x, out, functor); \
} }
#define DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS( \ #define DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS( \
...@@ -47,50 +49,63 @@ namespace phi { ...@@ -47,50 +49,63 @@ namespace phi {
float attr1, \ float attr1, \
float attr2, \ float attr2, \
DenseTensor* out) { \ DenseTensor* out) { \
functor_class<T> functor; \ funcs::functor_class<T> functor; \
auto attrs = functor.GetAttrs(); \ auto attrs = functor.GetAttrs(); \
*(attrs[0].second) = attr1; \ *(attrs[0].second) = attr1; \
*(attrs[1].second) = attr2; \ *(attrs[1].second) = attr2; \
ActivationImpl<T, Context, functor_class<T>>(dev_ctx, x, out, functor); \ ActivationImpl<T, Context, funcs::functor_class<T>>( \
dev_ctx, x, out, functor); \
} }
DEFINE_CPU_ACTIVATION_KERNEL(Sin, funcs::SinFunctor<T>) DEFINE_CPU_ACTIVATION_KERNEL(Sin, SinFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Cos, funcs::CosFunctor<T>) DEFINE_CPU_ACTIVATION_KERNEL(Cos, CosFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Tan, funcs::TanFunctor<T>) DEFINE_CPU_ACTIVATION_KERNEL(Tan, TanFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Asin, funcs::AsinFunctor<T>) DEFINE_CPU_ACTIVATION_KERNEL(Asin, AsinFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Atan, funcs::AtanFunctor<T>) DEFINE_CPU_ACTIVATION_KERNEL(Atan, AtanFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Acos, funcs::AcosFunctor<T>) DEFINE_CPU_ACTIVATION_KERNEL(Acos, AcosFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Sinh, funcs::SinhFunctor<T>) DEFINE_CPU_ACTIVATION_KERNEL(Sinh, SinhFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Cosh, funcs::CoshFunctor<T>) DEFINE_CPU_ACTIVATION_KERNEL(Cosh, CoshFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Asinh, funcs::AsinhFunctor<T>) DEFINE_CPU_ACTIVATION_KERNEL(Asinh, AsinhFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Acosh, funcs::AcoshFunctor<T>) DEFINE_CPU_ACTIVATION_KERNEL(Acosh, AcoshFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Atanh, funcs::AtanhFunctor<T>) DEFINE_CPU_ACTIVATION_KERNEL(Atanh, AtanhFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Relu, funcs::ReluCPUFunctor<T>) DEFINE_CPU_ACTIVATION_KERNEL(Relu, ReluCPUFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Tanh, funcs::TanhFunctor<T>) DEFINE_CPU_ACTIVATION_KERNEL(Tanh, TanhFunctor)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, funcs::LeakyReluFunctor, alpha) DEFINE_CPU_ACTIVATION_KERNEL(TanhShrink, TanhShrinkFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Silu, SiluFunctor)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, LeakyReluFunctor, alpha)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu,
funcs::ThresholdedReluFunctor, ThresholdedReluFunctor,
threshold) threshold)
DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, funcs::BReluFunctor, t_min, t_max) DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(HardShrink, HardShrinkFunctor, threshold)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, SoftShrinkFunctor, lambda)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, ELUFunctor, alpha)
DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, BReluFunctor, t_min, t_max)
} // namespace phi } // namespace phi
PD_REGISTER_KERNEL(relu, CPU, ALL_LAYOUT, phi::ReluKernel, float, double) {} PD_REGISTER_KERNEL(relu, CPU, ALL_LAYOUT, phi::ReluKernel, float, double) {}
#define PD_REGISTER_ACTIVATION_KERNEL(name, func) \ #define PD_REGISTER_ACTIVATION_KERNEL(name, func) \
PD_REGISTER_KERNEL(name, CPU, ALL_LAYOUT, phi::func##Kernel, float, double) {} PD_REGISTER_KERNEL(name, CPU, ALL_LAYOUT, phi::func, float, double) {}
PD_REGISTER_ACTIVATION_KERNEL(sin, Sin) PD_REGISTER_ACTIVATION_KERNEL(sin, SinKernel)
PD_REGISTER_ACTIVATION_KERNEL(cos, Cos) PD_REGISTER_ACTIVATION_KERNEL(cos, CosKernel)
PD_REGISTER_ACTIVATION_KERNEL(tan, Tan) PD_REGISTER_ACTIVATION_KERNEL(tan, TanKernel)
PD_REGISTER_ACTIVATION_KERNEL(acos, Acos) PD_REGISTER_ACTIVATION_KERNEL(acos, AcosKernel)
PD_REGISTER_ACTIVATION_KERNEL(asin, Asin) PD_REGISTER_ACTIVATION_KERNEL(asin, AsinKernel)
PD_REGISTER_ACTIVATION_KERNEL(atan, Atan) PD_REGISTER_ACTIVATION_KERNEL(atan, AtanKernel)
PD_REGISTER_ACTIVATION_KERNEL(sinh, Sinh) PD_REGISTER_ACTIVATION_KERNEL(sinh, SinhKernel)
PD_REGISTER_ACTIVATION_KERNEL(cosh, Cosh) PD_REGISTER_ACTIVATION_KERNEL(cosh, CoshKernel)
PD_REGISTER_ACTIVATION_KERNEL(asinh, Asinh) PD_REGISTER_ACTIVATION_KERNEL(asinh, AsinhKernel)
PD_REGISTER_ACTIVATION_KERNEL(acosh, Acosh) PD_REGISTER_ACTIVATION_KERNEL(acosh, AcoshKernel)
PD_REGISTER_ACTIVATION_KERNEL(atanh, Atanh) PD_REGISTER_ACTIVATION_KERNEL(atanh, AtanhKernel)
PD_REGISTER_ACTIVATION_KERNEL(tanh, Tanh) PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel)
PD_REGISTER_ACTIVATION_KERNEL(brelu, BRelu) PD_REGISTER_ACTIVATION_KERNEL(brelu, BReluKernel)
PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyRelu) PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel)
PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedRelu) PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel)
PD_REGISTER_ACTIVATION_KERNEL(hard_shrink, HardShrinkKernel)
PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel)
PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel)
PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel)
PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/grid_sample_grad_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/grid_sample_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
template <typename T>
static inline void ClipWithMask(const CPUContext& ctx,
const int max_val, // height-1 or width-1
bool align_corners,
std::string padding_mode,
DenseTensor* grid_slice,
DenseTensor* grid_scale) {
auto& place = *ctx.eigen_device();
grid_scale->Resize(grid_slice->dims());
ctx.Alloc<T>(grid_scale);
auto grid_slice_t = EigenTensor<T, 3>::From(*grid_slice);
auto factor = static_cast<T>(max_val * 0.5);
if (!align_corners) {
factor = static_cast<T>((max_val + 1) * 0.5);
}
auto grid_scale_t = EigenTensor<T, 3>::From(*grid_scale).setConstant(factor);
if (padding_mode == "border") {
// auto bounded_lo = grid_slice_t.cwiseMax(static_cast<T>(0));
auto res = grid_slice_t.cwiseMax(static_cast<T>(0))
.cwiseMin(static_cast<T>(max_val));
auto in_bound = (res == grid_slice_t);
grid_scale_t.device(place) = grid_scale_t * in_bound.template cast<T>();
grid_slice_t.device(place) = res;
} else if (padding_mode == "reflection") {
if (align_corners) {
auto double_range = static_cast<T>(max_val * 2);
auto is_neg = (grid_slice_t < static_cast<T>(0));
auto grid_abs = grid_slice_t.abs();
auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
auto one_more_flip = (extra > (double_range - extra));
grid_scale_t.device(place) =
grid_scale_t * ((is_neg == one_more_flip).template cast<T>() -
(is_neg != one_more_flip).template cast<T>());
grid_slice_t.device(place) = extra.cwiseMin(double_range - extra);
if (max_val == 0) {
grid_slice_t.device(place) = grid_slice_t.constant(static_cast<T>(0));
}
} else {
auto double_range = static_cast<T>((max_val + 1) * 2);
auto grid_abs = (grid_slice_t + static_cast<T>(0.5)).abs();
auto is_neg = ((grid_slice_t + static_cast<T>(0.5)) < static_cast<T>(0));
auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
auto one_more_flip = (extra > (double_range - extra));
auto reflected =
extra.cwiseMin(double_range - extra) - static_cast<T>(0.5);
auto clipped = reflected.cwiseMax(static_cast<T>(0))
.cwiseMin(static_cast<T>(max_val));
auto in_bound = (clipped == reflected).template cast<T>();
grid_scale_t.device(place) =
grid_scale_t * ((is_neg == one_more_flip).template cast<T>() -
(is_neg != one_more_flip).template cast<T>()) *
in_bound;
grid_slice_t.device(place) = clipped;
}
}
}
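// As a reading aid: grid_scale ends up holding the d(unnormalized)/d(normalized)
// factor from Unnormalize, zeroed where "border" clipping saturates and
// sign-flipped where "reflection" reverses direction, so the backward pass can
// simply multiply it into the grid gradient.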
template <typename T>
static void CalcGridLocationsWithGrad(const CPUContext& ctx,
const DenseTensor& grid,
const int in_h,
const int in_w,
bool align_corners,
std::string padding_mode,
DenseTensor* grid_x,
DenseTensor* grid_y,
DenseTensor* grid_x_scale,
DenseTensor* grid_y_scale) {
const int n = grid.dims()[0];
const int out_h = grid.dims()[1];
const int out_w = grid.dims()[2];
// split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim
grid_x->Resize({n, out_h, out_w});
grid_y->Resize({n, out_h, out_w});
T* grid_x_data = ctx.Alloc<T>(grid_x);
T* grid_y_data = ctx.Alloc<T>(grid_y);
const T* grid_data = grid.data<T>();
for (int i = 0; i < n * out_h * out_w; i++) {
grid_x_data[i] = grid_data[2 * i];
grid_y_data[i] = grid_data[(2 * i) + 1];
}
Unnormalize<T>(ctx, grid_x, in_w - 1, align_corners);
Unnormalize<T>(ctx, grid_y, in_h - 1, align_corners);
ClipWithMask<T>(
ctx, in_w - 1, align_corners, padding_mode, grid_x, grid_x_scale);
ClipWithMask<T>(
ctx, in_h - 1, align_corners, padding_mode, grid_y, grid_y_scale);
}
template <typename T>
static void GatherOutputGradToInputGrad(const DenseTensor& output_grad,
DenseTensor* input_grad,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& d1,
const DenseTensor& d2) {
const int n = output_grad.dims()[0];
const int c = output_grad.dims()[1];
const int out_h = output_grad.dims()[2];
const int out_w = output_grad.dims()[3];
const int in_h = input_grad->dims()[2];
const int in_w = input_grad->dims()[3];
auto x_t = EigenTensor<T, 3>::From(x);
auto y_t = EigenTensor<T, 3>::From(y);
auto d1_t = EigenTensor<T, 3>::From(d1);
auto d2_t = EigenTensor<T, 3>::From(d2);
auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
for (int i = 0; i < n; i++) {
for (int k = 0; k < out_h; k++) {
for (int l = 0; l < out_w; l++) {
if (IsInBound(
x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), (T)(in_h - 1))) {
for (int j = 0; j < c; j++) {
input_grad_t(i,
j,
static_cast<int>(round(y_t(i, k, l))),
static_cast<int>(round(x_t(i, k, l)))) +=
output_grad_t(i, j, k, l) * d1_t(i, k, l) * d2_t(i, k, l);
}
}
}
}
}
}
template <typename T>
static void GatherBilinearGrad(const CPUContext& ctx,
const DenseTensor& input,
const DenseTensor& output_grad,
DenseTensor* grid_x,
DenseTensor* grid_y,
DenseTensor* grid_x_scale,
DenseTensor* grid_y_scale,
DenseTensor* input_grad,
DenseTensor* grid_grad) {
const int n = grid_x->dims()[0];
const int out_h = grid_x->dims()[1];
const int out_w = grid_x->dims()[2];
const int c = input.dims()[1];
DenseTensor x_w, x_e, y_n, y_s;
DenseTensor d_w, d_e, d_n, d_s;
DenseTensor v_wn, v_en, v_ws, v_es;
AllNeigbors<T>(ctx,
input,
grid_x, // grid_x
grid_y, // grid_y
&x_w,
&x_e,
&y_n,
&y_s,
&d_w,
&d_e,
&d_n,
&d_s,
&v_wn,
&v_en,
&v_ws,
&v_es);
// gather output grad value to input grad by corner point coords and weight
GatherOutputGradToInputGrad<T>(output_grad, input_grad, x_w, y_n, d_e, d_s);
GatherOutputGradToInputGrad<T>(output_grad, input_grad, x_w, y_s, d_e, d_n);
GatherOutputGradToInputGrad<T>(output_grad, input_grad, x_e, y_n, d_w, d_s);
GatherOutputGradToInputGrad<T>(output_grad, input_grad, x_e, y_s, d_w, d_n);
auto v_wn_t = EigenTensor<T, 4>::From(v_wn);
auto v_en_t = EigenTensor<T, 4>::From(v_en);
auto v_ws_t = EigenTensor<T, 4>::From(v_ws);
auto v_es_t = EigenTensor<T, 4>::From(v_es);
auto d_w_t = EigenTensor<T, 3>::From(d_w);
auto d_e_t = EigenTensor<T, 3>::From(d_e);
auto d_n_t = EigenTensor<T, 3>::From(d_n);
auto d_s_t = EigenTensor<T, 3>::From(d_s);
auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
if (grid_grad != nullptr) {
DenseTensor grid_grad_x, grid_grad_y;
grid_grad_x.Resize({n, out_h, out_w});
grid_grad_y.Resize({n, out_h, out_w});
ctx.Alloc<T>(&grid_grad_x);
ctx.Alloc<T>(&grid_grad_y);
auto grid_grad_x_t =
EigenTensor<T, 3>::From(grid_grad_x).setConstant(static_cast<T>(0.0));
auto grid_grad_y_t =
EigenTensor<T, 3>::From(grid_grad_y).setConstant(static_cast<T>(0.0));
for (int i = 0; i < n; i++) {
for (int j = 0; j < c; j++) {
for (int k = 0; k < out_h; k++) {
for (int l = 0; l < out_w; l++) {
grid_grad_x_t(i, k, l) +=
((v_en_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_s_t(i, k, l) +
(v_es_t(i, j, k, l) - v_ws_t(i, j, k, l)) * d_n_t(i, k, l)) *
output_grad_t(i, j, k, l);
grid_grad_y_t(i, k, l) +=
((v_ws_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_e_t(i, k, l) +
(v_es_t(i, j, k, l) - v_en_t(i, j, k, l)) * d_w_t(i, k, l)) *
output_grad_t(i, j, k, l);
}
}
}
}
// const T x_max = static_cast<T>(in_w - 1);
// const T y_max = static_cast<T>(in_h - 1);
auto grid_x_scale_t = EigenTensor<T, 3>::From(*grid_x_scale);
auto grid_y_scale_t = EigenTensor<T, 3>::From(*grid_y_scale);
grid_grad_x_t = grid_grad_x_t * grid_x_scale_t;
grid_grad_y_t = grid_grad_y_t * grid_y_scale_t;
// gather grid_grad [x, y] in 3rd Dim
T* grid_grad_data = grid_grad->data<T>();
T* grid_grad_x_data = grid_grad_x.data<T>();
T* grid_grad_y_data = grid_grad_y.data<T>();
for (int i = 0; i < n * out_h * out_w; i++) {
grid_grad_data[2 * i] = grid_grad_x_data[i];
grid_grad_data[2 * i + 1] = grid_grad_y_data[i];
}
}
}
template <typename T>
static void GatherOutputGradToInputGrad(const DenseTensor& output_grad,
DenseTensor* input_grad,
const DenseTensor& x,
const DenseTensor& y) {
const int n = output_grad.dims()[0];
const int c = output_grad.dims()[1];
const int out_h = output_grad.dims()[2];
const int out_w = output_grad.dims()[3];
const int in_h = input_grad->dims()[2];
const int in_w = input_grad->dims()[3];
auto x_t = EigenTensor<T, 3>::From(x);
auto y_t = EigenTensor<T, 3>::From(y);
auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
for (int i = 0; i < n; i++) {
for (int k = 0; k < out_h; k++) {
for (int l = 0; l < out_w; l++) {
if (IsInBound(
x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), (T)(in_h - 1))) {
for (int j = 0; j < c; j++) {
input_grad_t(i,
j,
static_cast<int>(round(y_t(i, k, l))),
static_cast<int>(round(x_t(i, k, l)))) +=
output_grad_t(i, j, k, l);
}
}
}
}
}
}
template <typename T, typename Context>
void GridSampleGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& grid,
const DenseTensor& out_grid,
const std::string& mode,
const std::string& padding_mode,
bool align_corners,
DenseTensor* x_grad,
DenseTensor* grid_grad) {
const int n = grid.dims()[0];
const int out_h = grid.dims()[1];
const int out_w = grid.dims()[2];
const int c = x.dims()[1];
const int in_h = x.dims()[2];
const int in_w = x.dims()[3];
x_grad->Resize({n, c, in_h, in_w});
dev_ctx.template Alloc<T>(x_grad);
phi::funcs::SetConstant<Context, T>()(dev_ctx, x_grad, static_cast<T>(0));
if (grid_grad != nullptr) {
grid_grad->Resize({n, out_h, out_w, 2});
dev_ctx.template Alloc<T>(grid_grad);
phi::funcs::SetConstant<Context, T>()(
dev_ctx, grid_grad, static_cast<T>(0));
}
DenseTensor grid_x, grid_y;
DenseTensor grid_x_scale, grid_y_scale;
CalcGridLocationsWithGrad<T>(dev_ctx,
grid,
in_h,
in_w,
align_corners,
padding_mode,
&grid_x,
&grid_y,
&grid_x_scale,
&grid_y_scale);
if (mode == "bilinear") {
GatherBilinearGrad<T>(dev_ctx,
x,
out_grid,
&grid_x,
&grid_y,
&grid_x_scale,
&grid_y_scale,
x_grad,
grid_grad);
} else {
auto grid_x_t = EigenTensor<T, 3>::From(grid_x);
auto grid_y_t = EigenTensor<T, 3>::From(grid_y);
grid_x_t = grid_x_t.round();
grid_y_t = grid_y_t.round();
GatherOutputGradToInputGrad<T>(out_grid, x_grad, grid_x, grid_y);
}
}
} // namespace phi
PD_REGISTER_KERNEL(grid_sample_grad,
CPU,
ALL_LAYOUT,
phi::GridSampleGradKernel,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/grid_sample_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/grid_sample_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
using Array4 = Eigen::DSizes<int64_t, 4>;
template <typename T>
static inline void Clip(const CPUContext& ctx,
DenseTensor* grid_slice,
const int max_val, // height-1 or width-1
bool align_corners,
std::string padding_mode) {
auto& place = *ctx.eigen_device();
auto grid_slice_t = EigenTensor<T, 3>::From(*grid_slice);
if (padding_mode == "border") {
grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast<T>(0))
.cwiseMin(static_cast<T>(max_val));
} else if (padding_mode == "reflection") {
if (align_corners) {
auto double_range = static_cast<T>(max_val * 2);
auto grid_abs = grid_slice_t.abs();
auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
grid_slice_t.device(place) = extra.cwiseMin(double_range - extra);
if (max_val == 0) {
grid_slice_t.device(place) = grid_slice_t.constant(static_cast<T>(0));
}
} else {
auto double_range = static_cast<T>((max_val + 1) * 2);
auto grid_abs = (grid_slice_t + static_cast<T>(0.5)).abs();
auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
grid_slice_t.device(place) =
extra.cwiseMin(double_range - extra) - static_cast<T>(0.5);
grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast<T>(0))
.cwiseMin(static_cast<T>(max_val));
}
}
}
template <typename T>
static void CalcGridLocations(const CPUContext& ctx,
const DenseTensor& grid,
const int in_h,
const int in_w,
bool align_corners,
std::string padding_mode,
DenseTensor* grid_x,
DenseTensor* grid_y) {
const int n = grid.dims()[0];
const int out_h = grid.dims()[1];
const int out_w = grid.dims()[2];
// split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim
grid_x->Resize({n, out_h, out_w});
grid_y->Resize({n, out_h, out_w});
T* grid_x_data = ctx.Alloc<T>(grid_x);
T* grid_y_data = ctx.Alloc<T>(grid_y);
const T* grid_data = grid.data<T>();
for (int i = 0; i < n * out_h * out_w; i++) {
grid_x_data[i] = grid_data[2 * i];
grid_y_data[i] = grid_data[(2 * i) + 1];
}
Unnormalize<T>(ctx, grid_x, in_w - 1, align_corners);
Unnormalize<T>(ctx, grid_y, in_h - 1, align_corners);
Clip<T>(ctx, grid_x, in_w - 1, align_corners, padding_mode);
Clip<T>(ctx, grid_y, in_h - 1, align_corners, padding_mode);
}
template <typename T>
static void BilinearInter(const CPUContext& ctx,
const DenseTensor& input,
DenseTensor* grid_x,
DenseTensor* grid_y,
DenseTensor* out) {
auto& place = *ctx.eigen_device();
const int n = grid_x->dims()[0];
const int out_h = grid_x->dims()[1];
const int out_w = grid_x->dims()[2];
const int c = input.dims()[1];
DenseTensor x_w, x_e, y_n, y_s;
DenseTensor d_w, d_e, d_n, d_s;
DenseTensor v_wn, v_en, v_ws, v_es;
AllNeigbors<T>(ctx,
input,
grid_x,
grid_y,
&x_w,
&x_e,
&y_n,
&y_s,
&d_w,
&d_e,
&d_n,
&d_s,
&v_wn,
&v_en,
&v_ws,
&v_es);
auto d_w_t = EigenTensor<T, 3>::From(d_w);
auto d_e_t = EigenTensor<T, 3>::From(d_e);
auto d_n_t = EigenTensor<T, 3>::From(d_n);
auto d_s_t = EigenTensor<T, 3>::From(d_s);
auto d_w_scaled_t =
d_w_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
auto d_e_scaled_t =
d_e_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
auto d_n_scaled_t =
d_n_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
auto d_s_scaled_t =
d_s_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
auto v_wn_t = EigenTensor<T, 4>::From(v_wn);
auto v_en_t = EigenTensor<T, 4>::From(v_en);
auto v_ws_t = EigenTensor<T, 4>::From(v_ws);
auto v_es_t = EigenTensor<T, 4>::From(v_es);
auto output_t = EigenTensor<T, 4>::From(*out);
  // bilinear interpolation using the 4 corner points
output_t.device(place) = v_wn_t * d_e_scaled_t * d_s_scaled_t +
v_en_t * d_w_scaled_t * d_s_scaled_t +
v_ws_t * d_e_scaled_t * d_n_scaled_t +
v_es_t * d_w_scaled_t * d_n_scaled_t;
}
template <typename T, typename Context>
void GridSampleKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& grid,
const std::string& mode,
const std::string& padding_mode,
bool align_corners,
DenseTensor* out) {
const int n = grid.dims()[0];
const int out_h = grid.dims()[1];
const int out_w = grid.dims()[2];
const int c = x.dims()[1];
const int in_h = x.dims()[2];
const int in_w = x.dims()[3];
out->Resize(phi::make_ddim({n, c, out_h, out_w}));
dev_ctx.template Alloc<T>(out);
phi::funcs::SetConstant<Context, T>()(dev_ctx, out, static_cast<T>(0));
DenseTensor grid_x, grid_y;
CalcGridLocations<T>(
dev_ctx, grid, in_h, in_w, align_corners, padding_mode, &grid_x, &grid_y);
if (mode == "bilinear") {
BilinearInter<T>(dev_ctx, x, &grid_x, &grid_y, out);
} else if (mode == "nearest") {
auto grid_x_t = EigenTensor<T, 3>::From(grid_x);
auto grid_y_t = EigenTensor<T, 3>::From(grid_y);
grid_x_t = grid_x_t.round();
grid_y_t = grid_y_t.round();
GetGridPointValue<T>(x, out, grid_x, grid_y);
}
}
} // namespace phi
PD_REGISTER_KERNEL(
grid_sample, CPU, ALL_LAYOUT, phi::GridSampleKernel, float, double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
namespace phi {
template <typename T>
void Unnormalize(const CPUContext& ctx,
DenseTensor* grid_slice,
const int max_val, // height-1 or width-1
bool align_corners) {
auto& place = *ctx.eigen_device();
auto grid_slice_t = EigenTensor<T, 3>::From(*grid_slice);
if (!align_corners) {
auto factor = static_cast<T>((max_val + 1) * 0.5);
grid_slice_t.device(place) =
(grid_slice_t + static_cast<T>(1)) * factor - static_cast<T>(0.5);
} else {
auto factor = static_cast<T>(max_val * 0.5);
grid_slice_t.device(place) = (grid_slice_t + static_cast<T>(1)) * factor;
}
}
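// Unnormalize maps grid coordinates from [-1, 1] into pixel space: with
// align_corners the endpoints land exactly on 0 and max_val; otherwise the
// mapping is ((x + 1) * (max_val + 1) - 1) / 2, matching the two branches above.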
template <typename T>
inline bool IsInBound(T x, T y, T x_max, T y_max) {
if (x < 0 || x > x_max || y < 0 || y > y_max) {
return false;
}
return true;
}
template <typename T>
void GetGridPointValue(const DenseTensor& input,
DenseTensor* output,
const DenseTensor& x,
const DenseTensor& y) {
const int n = input.dims()[0];
const int c = input.dims()[1];
const int in_h = input.dims()[2];
const int in_w = input.dims()[3];
const int out_h = x.dims()[1];
const int out_w = x.dims()[2];
auto x_t = EigenTensor<T, 3>::From(x);
auto y_t = EigenTensor<T, 3>::From(y);
auto output_t = EigenTensor<T, 4>::From(*output).setConstant((T)0);
auto input_t = EigenTensor<T, 4>::From(input);
for (int i = 0; i < n; i++) {
for (int k = 0; k < out_h; k++) {
for (int l = 0; l < out_w; l++) {
if (IsInBound(
x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), (T)(in_h - 1))) {
for (int j = 0; j < c; j++) {
output_t(i, j, k, l) =
input_t(i,
j,
static_cast<int>(round(y_t(i, k, l))),
static_cast<int>(round(x_t(i, k, l))));
}
}
}
}
}
}
template <typename T>
void AllNeigbors(const CPUContext& ctx,
const DenseTensor& input,
DenseTensor* grid_x,
DenseTensor* grid_y,
DenseTensor* x_w,
DenseTensor* x_e,
DenseTensor* y_n,
DenseTensor* y_s, // positions
DenseTensor* d_w,
DenseTensor* d_e,
DenseTensor* d_n,
DenseTensor* d_s, // distance
DenseTensor* v_wn,
DenseTensor* v_en,
DenseTensor* v_ws,
DenseTensor* v_es) { // values
auto& place = *ctx.eigen_device();
const int c = input.dims()[1];
const int n = grid_x->dims()[0];
const int out_h = grid_x->dims()[1];
const int out_w = grid_x->dims()[2];
// calculate coords of 4 corner points
x_w->Resize({n, out_h, out_w});
x_e->Resize({n, out_h, out_w});
y_n->Resize({n, out_h, out_w});
y_s->Resize({n, out_h, out_w});
ctx.Alloc<T>(x_w);
ctx.Alloc<T>(x_e);
ctx.Alloc<T>(y_n);
ctx.Alloc<T>(y_s);
auto x_w_t = EigenTensor<T, 3>::From(*x_w);
auto x_e_t = EigenTensor<T, 3>::From(*x_e);
auto y_n_t = EigenTensor<T, 3>::From(*y_n);
auto y_s_t = EigenTensor<T, 3>::From(*y_s);
auto grid_x_t = EigenTensor<T, 3>::From(*grid_x);
auto grid_y_t = EigenTensor<T, 3>::From(*grid_y);
x_w_t.device(place) = grid_x_t.floor();
x_e_t.device(place) = x_w_t + static_cast<T>(1);
y_n_t.device(place) = grid_y_t.floor();
y_s_t.device(place) = y_n_t + static_cast<T>(1);
// calculate distances to 4 sides
d_w->Resize({n, out_h, out_w});
d_e->Resize({n, out_h, out_w});
d_n->Resize({n, out_h, out_w});
d_s->Resize({n, out_h, out_w});
ctx.Alloc<T>(d_w);
ctx.Alloc<T>(d_e);
ctx.Alloc<T>(d_n);
ctx.Alloc<T>(d_s);
auto d_w_t = EigenTensor<T, 3>::From(*d_w);
auto d_e_t = EigenTensor<T, 3>::From(*d_e);
auto d_n_t = EigenTensor<T, 3>::From(*d_n);
auto d_s_t = EigenTensor<T, 3>::From(*d_s);
d_w_t.device(place) = grid_x_t - x_w_t;
d_e_t.device(place) = x_e_t - grid_x_t;
d_n_t.device(place) = grid_y_t - y_n_t;
d_s_t.device(place) = y_s_t - grid_y_t;
  // calculate the values at the 4 corner points
v_wn->Resize({n, c, out_h, out_w});
v_en->Resize({n, c, out_h, out_w});
v_ws->Resize({n, c, out_h, out_w});
v_es->Resize({n, c, out_h, out_w});
ctx.Alloc<T>(v_wn);
ctx.Alloc<T>(v_en);
ctx.Alloc<T>(v_ws);
ctx.Alloc<T>(v_es);
GetGridPointValue<T>(input, v_wn, *x_w, *y_n);
GetGridPointValue<T>(input, v_en, *x_e, *y_n);
GetGridPointValue<T>(input, v_ws, *x_w, *y_s);
GetGridPointValue<T>(input, v_es, *x_e, *y_s);
}
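// Together, the corner values (v_wn, v_en, v_ws, v_es) and side distances
// (d_w, d_e, d_n, d_s) computed here are all that the bilinear forward and
// backward paths need: each sampled value is a distance-weighted sum of its
// four neighbouring pixels.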
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/index_select_grad_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/utils/data_type.h"
#include "paddle/phi/kernels/cpu/index_select_impl.h"
namespace phi {
template <typename T, typename Context>
void IndexSelectGradKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& index,
const DenseTensor& out_grad,
int dim,
DenseTensor* x_grad) {
if (dim < 0) {
dim += out_grad.dims().size();
}
const auto& index_type = index.dtype();
bool index_type_match =
index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64;
PADDLE_ENFORCE_EQ(index_type_match,
true,
phi::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
index_type,
phi::DataType::INT32,
phi::DataType::INT64));
if (index_type == phi::DataType::INT32) {
IndexSelectGradInner<Context, T, int>(ctx, out_grad, index, x_grad, dim);
} else if (index_type == phi::DataType::INT64) {
IndexSelectGradInner<Context, T, int64_t>(
ctx, out_grad, index, x_grad, dim);
}
}
} // namespace phi
PD_REGISTER_KERNEL(index_select_grad,
CPU,
ALL_LAYOUT,
phi::IndexSelectGradKernel,
float,
double,
int,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
template <typename Context, typename T, class Enable = void>
struct IndexSelectAdd {
void operator()(const Context& ctx,
int slice_size,
const T* src_pointer,
const T* p_pointer,
T* dist_pointer) {
for (int i = 0; i < slice_size; i++) {
dist_pointer[i] = src_pointer[i] + p_pointer[i];
}
}
};
template <typename Context, typename T>
struct IndexSelectAdd<
Context,
T,
typename std::enable_if<std::is_floating_point<T>::value>::type> {
void operator()(const Context& ctx,
int slice_size,
const T* src_pointer,
const T* p_pointer,
T* dist_pointer) {
auto blas = phi::funcs::GetBlas<Context, T>(ctx);
blas.VADD(slice_size, src_pointer, p_pointer, dist_pointer);
}
};
template <typename Context, typename T, typename IndexT = int>
void IndexSelectInner(const Context& ctx,
DenseTensor* input,
const DenseTensor& index,
DenseTensor* output,
int dim) {
auto input_dim = input->dims();
auto input_dim_size = input_dim.size();
auto output_dim = output->dims();
auto index_size = index.dims()[0];
DenseTensor index_cpu_copy;
if (!paddle::platform::is_cpu_place(index.place())) {
phi::Copy(ctx, index, phi::CPUPlace(), true, &index_cpu_copy);
}
const IndexT* index_data = paddle::platform::is_cpu_place(index.place())
? index.data<IndexT>()
: index_cpu_copy.data<IndexT>();
ctx.template Alloc<T>(output);
auto slice_size = 1;
for (auto i = dim + 1; i < input_dim_size; i++) {
slice_size *= input_dim[i];
}
auto outer_nums = 1;
for (auto i = 0; i < dim; i++) {
outer_nums *= input_dim[i];
}
for (int i = 0; i < index_size; i++) {
PADDLE_ENFORCE_GE(
index_data[i],
0,
phi::errors::InvalidArgument(
"Variable value (index) of OP(index_select) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value.",
input_dim[dim],
index_data[i]));
PADDLE_ENFORCE_LT(
index_data[i],
input_dim[dim],
phi::errors::InvalidArgument(
"Variable value (index) of OP(index_select) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value.",
input_dim[dim],
index_data[i]));
}
VLOG(3) << "Index_Select_Debug; outer_nums: " << outer_nums
<< "; slice_size: " << slice_size << "; index_size: " << index_size;
input->Resize(phi::make_ddim({outer_nums, input_dim[dim], slice_size}));
output->Resize(phi::make_ddim({outer_nums, index_size, slice_size}));
auto input_tensor = EigenTensor<T, 3>::From(*input);
auto output_tensor = EigenTensor<T, 3>::From(*output);
auto& place = *ctx.eigen_device();
for (auto j = 0; j < index_size; j++) {
IndexT index_value = index_data[j];
auto output_t = output_tensor.chip(j, 1);
output_t.device(place) = input_tensor.chip(index_value, 1);
}
input->Resize(input_dim);
output->Resize(output_dim);
}
template <typename Context, typename T, typename IndexT = int>
void IndexSelectGradInner(const Context& ctx,
const DenseTensor& out_grad,
const DenseTensor& index,
DenseTensor* x_grad,
int dim) {
const T* input_data = out_grad.data<T>();
const IndexT* index_data = index.data<IndexT>();
const T* p_output = ctx.template Alloc<T>(x_grad);
T* out_data = ctx.template Alloc<T>(x_grad);
auto input_dim = out_grad.dims();
auto input_dim_size = input_dim.size();
auto output_dim = x_grad->dims();
phi::funcs::SetConstant<Context, T> set_constant;
set_constant(ctx, x_grad, static_cast<T>(0.0));
auto slice_size = 1;
for (auto i = dim + 1; i < input_dim_size; i++) {
slice_size *= input_dim[i];
}
auto input_width = slice_size * input_dim[dim];
auto output_width = slice_size * output_dim[dim];
auto outer_nums = 1;
for (auto i = 0; i < dim; i++) {
outer_nums *= input_dim[i];
}
auto index_size = index.dims()[0];
VLOG(3) << "Index_Select_Grad_Debug; outer_nums: " << outer_nums
<< "; slice_size: " << slice_size << "; input_width: " << input_width
<< "; output_width: " << output_width
<< "; index_size: " << index_size;
for (auto i = 0; i < outer_nums; i++) {
auto input_start_offset = i * input_width;
auto output_start_offset = i * output_width;
for (auto j = 0; j < index_size; j++) {
IndexT index_value = index_data[j];
auto src = input_data + input_start_offset + j * slice_size;
auto p_out = p_output + output_start_offset + index_value * slice_size;
auto dst = out_data + output_start_offset + index_value * slice_size;
IndexSelectAdd<Context, T> index_select_add;
index_select_add(ctx, slice_size, src, p_out, dst);
}
}
x_grad->Resize(output_dim);
}
} // namespace phi
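// NOTE(editor): standalone sketch (illustrative names, not kernel code) of the
// gather that IndexSelectInner performs. After the reshape to
// (outer_nums, input_dim[dim], slice_size), each Eigen chip copies one
// contiguous slice, e.g. for input = {{1, 2}, {3, 4}, {5, 6}}, index = {2, 0}
// and dim = 0 the output is {{5, 6}, {1, 2}}.
#include <cstdint>
template <typename T, typename IndexT>
void NaiveIndexSelectDim0(const T* in,
                          const IndexT* index,
                          int64_t index_size,
                          int64_t slice_size,
                          T* out) {
  for (int64_t j = 0; j < index_size; ++j) {
    const T* src = in + index[j] * slice_size;
    for (int64_t k = 0; k < slice_size; ++k) {
      out[j * slice_size + k] = src[k];
    }
  }
}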
...@@ -12,32 +12,50 @@ ...@@ -12,32 +12,50 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/phi/kernels/reduce_prod_kernel.h" #include "paddle/phi/kernels/index_select_kernel.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/reduce.h" #include "paddle/phi/core/utils/data_type.h"
#include "paddle/phi/kernels/funcs/reduce_functor.h" #include "paddle/phi/kernels/cpu/index_select_impl.h"
namespace phi { namespace phi {
template <typename T, typename Context> template <typename T, typename Context>
void ReduceProdKernel(const Context& dev_ctx, void IndexSelectKernel(const Context& ctx,
const DenseTensor& x, const DenseTensor& x,
const std::vector<int64_t>& dims, const DenseTensor& index,
bool keep_dim, int dim,
bool reduce_all, DenseTensor* output) {
DenseTensor* out) { auto inputs = x;
auto out_dtype = x.dtype(); if (dim < 0) {
phi::Reduce<CPUContext, T, phi::funcs::ProdFunctor>( dim += inputs.dims().size();
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); }
const auto& index_type = index.dtype();
bool index_type_match =
index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64;
PADDLE_ENFORCE_EQ(index_type_match,
true,
phi::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
index_type,
phi::DataType::INT32,
phi::DataType::INT64));
if (index_type == phi::DataType::INT32) {
IndexSelectInner<Context, T, int>(ctx, &inputs, index, output, dim);
} else if (index_type == phi::DataType::INT64) {
IndexSelectInner<Context, T, int64_t>(ctx, &inputs, index, output, dim);
}
} }
} // namespace phi } // namespace phi
PD_REGISTER_KERNEL(reduce_prod, PD_REGISTER_KERNEL(index_select,
CPU, CPU,
ALL_LAYOUT, ALL_LAYOUT,
phi::ReduceProdKernel, phi::IndexSelectKernel,
float, float,
double, double,
int, int,
......
...@@ -13,6 +13,8 @@ ...@@ -13,6 +13,8 @@
// limitations under the License. // limitations under the License.
#include "paddle/phi/kernels/lgamma_kernel.h" #include "paddle/phi/kernels/lgamma_kernel.h"
#include <unsupported/Eigen/SpecialFunctions>
#include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/for_range.h"
......
...@@ -19,10 +19,8 @@ ...@@ -19,10 +19,8 @@
#include "paddle/phi/common/scalar.h" #include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/kernels/cpu/elementwise.h"
#include "paddle/phi/kernels/cpu/reduce.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/funcs/reduce_functor.h"
// See Note [ Why still include the fluid headers? ] // See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
...@@ -55,30 +53,6 @@ namespace phi { ...@@ -55,30 +53,6 @@ namespace phi {
} \ } \
} }
template <typename T, typename Context>
void MeanRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out) {
auto out_dtype = x.dtype();
phi::Reduce<CPUContext, T, phi::funcs::MeanFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
template <typename T, typename Context>
void SumRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DataType out_dtype,
DenseTensor* out) {
phi::Reduce<CPUContext, T, phi::funcs::SumFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
template <typename T, typename Context> template <typename T, typename Context>
void DivideRawKernel(const Context& dev_ctx, void DivideRawKernel(const Context& dev_ctx,
const DenseTensor& x, const DenseTensor& x,
...@@ -164,20 +138,3 @@ PD_REGISTER_KERNEL(multiply_raw, ...@@ -164,20 +138,3 @@ PD_REGISTER_KERNEL(multiply_raw,
complex64, complex64,
complex128, complex128,
phi::dtype::bfloat16) {} phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(sum_raw,
CPU,
ALL_LAYOUT,
phi::SumRawKernel,
bool,
float,
double,
phi::dtype::float16,
int16_t,
int,
int64_t,
complex64,
complex128) {
kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
}
PD_REGISTER_KERNEL(
mean_raw, CPU, ALL_LAYOUT, phi::MeanRawKernel, float, double, bool) {}
...@@ -23,7 +23,7 @@ ...@@ -23,7 +23,7 @@
#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h" #include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h"
#include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/math_kernel.h"
#include "paddle/phi/kernels/reduce_max_kernel.h" #include "paddle/phi/kernels/reduce_kernel.h"
namespace phi { namespace phi {
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/multiplex_grad_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
namespace phi {
template <typename T, typename Context>
void MultiplexGradKernel(const Context& ctx,
const DenseTensor& ids,
const DenseTensor& out_grad,
std::vector<DenseTensor*> ins_grad) {
size_t idx = -1UL;
for (size_t i = 0; i < ins_grad.size(); i++) {
if (ins_grad[i]) {
ctx.template Alloc<T>(ins_grad[i]);
auto t = phi::EigenVector<T>::Flatten(*ins_grad[i]);
t.device(*ctx.eigen_device()) = t.constant(static_cast<T>(0));
idx = i;
}
}
if (idx == -1UL) return;
auto rows = ins_grad[idx]->dims()[0];
auto cols = ins_grad[idx]->numel() / rows;
auto* index = ids.data<int32_t>();
for (auto i = 0; i < rows; i++) {
size_t k = static_cast<size_t>(index[i]);
if (ins_grad[k]) {
paddle::memory::Copy(ctx.GetPlace(),
ins_grad[k]->data<T>() + i * cols,
ctx.GetPlace(),
out_grad.data<T>() + i * cols,
cols * sizeof(T));
}
}
}
} // namespace phi
PD_REGISTER_KERNEL(multiplex_grad,
CPU,
ALL_LAYOUT,
phi::MultiplexGradKernel,
float,
double,
int,
int64_t) {}
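// NOTE(editor): assumed row-wise illustration of the forward multiplex op that
// the gradient above inverts: row i of out is copied from row i of the
// candidate selected by ids[i], so the gradient scatters row i of out_grad
// back into ins_grad[ids[i]] and leaves the other candidates zeroed.
//   ins[0] = {{1, 1}, {2, 2}}, ins[1] = {{3, 3}, {4, 4}}, ids = {1, 0}
//   out    = {{3, 3}, {2, 2}}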
...@@ -12,28 +12,54 @@ ...@@ -12,28 +12,54 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/phi/kernels/reduce_min_kernel.h" #include "paddle/phi/kernels/multiplex_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/reduce.h"
#include "paddle/phi/kernels/funcs/reduce_functor.h"
namespace phi { namespace phi {
template <typename T, typename Context> template <typename T, typename Context>
void MinRawKernel(const Context& dev_ctx, void MultiplexKernel(const Context& ctx,
const DenseTensor& x, const std::vector<const DenseTensor*>& ins,
const std::vector<int64_t>& dims, const DenseTensor& ids,
bool keep_dim,
bool reduce_all,
DenseTensor* out) { DenseTensor* out) {
auto out_dtype = x.dtype(); ctx.template Alloc<T>(out);
phi::Reduce<CPUContext, T, phi::funcs::MinFunctor>( for (size_t i = 0; i < ins.size(); ++i) {
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); PADDLE_ENFORCE_GT(
ins[i]->numel(),
0,
errors::OutOfRange(
"indexing will be out of bounds with size 0 for the %d-th input.",
i));
}
auto rows = ins[0]->dims()[0];
auto cols = ins[0]->numel() / rows;
auto index = ids.data<int32_t>();
for (auto i = 0; i < rows; i++) {
int32_t k = index[i];
PADDLE_ENFORCE_GE(
k, 0, errors::PreconditionNotMet("index must be nonnegative."));
PADDLE_ENFORCE_LT(static_cast<size_t>(k),
ins.size(),
errors::PreconditionNotMet(
"index exceeds the number of candidate tensors."));
paddle::memory::Copy(ctx.GetPlace(),
out->data<T>() + i * cols,
ctx.GetPlace(),
ins[k]->data<T>() + i * cols,
cols * sizeof(T));
}
} }
} // namespace phi } // namespace phi
PD_REGISTER_KERNEL( PD_REGISTER_KERNEL(multiplex,
min_raw, CPU, ALL_LAYOUT, phi::MinRawKernel, float, double, int, int64_t) {} CPU,
ALL_LAYOUT,
phi::MultiplexKernel,
float,
double,
int,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <Eigen/Dense>
#include "paddle/phi/kernels/qr_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/complex_functors.h"
namespace phi {
static inline std::tuple<bool, bool> ParseQrMode(const std::string& mode) {
bool compute_q;
bool reduced;
if (mode == "reduced") {
compute_q = true;
reduced = true;
} else if (mode == "complete") {
compute_q = true;
reduced = false;
} else if (mode == "r") {
compute_q = false;
reduced = true;
} else {
PADDLE_THROW(errors::InvalidArgument(
"QR received unrecognized mode '%s'"
" but expected one of 'reduced' (default), 'r', or 'complete'",
mode));
}
return std::make_tuple(compute_q, reduced);
}
template <typename T, typename Context>
void QrKernel(const Context& ctx,
const DenseTensor& x,
const std::string& mode,
DenseTensor* q,
DenseTensor* r) {
bool compute_q;
bool reduced_mode;
std::tie(compute_q, reduced_mode) = ParseQrMode(mode);
auto numel = x.numel();
PADDLE_ENFORCE_GT(
numel, 0, errors::PreconditionNotMet("The input of QR is empty."));
auto x_dims = x.dims();
int x_rank = x_dims.size();
int m = x_dims[x_rank - 2];
int n = x_dims[x_rank - 1];
int min_mn = std::min(m, n);
int k = reduced_mode ? min_mn : m;
int batch_size = numel / (m * n);
int x_stride = m * n;
int q_stride = m * k;
int r_stride = k * n;
auto* x_data = x.data<phi::dtype::Real<T>>();
T* q_data = nullptr;
if (compute_q) {
q_data = ctx.template Alloc<phi::dtype::Real<T>>(
q, batch_size * m * k * sizeof(phi::dtype::Real<T>));
}
auto* r_data = ctx.template Alloc<phi::dtype::Real<T>>(
r, batch_size * k * n * sizeof(phi::dtype::Real<T>));
// Implement QR by calling Eigen
for (int i = 0; i < batch_size; ++i) {
const T* x_matrix_ptr = x_data + i * x_stride;
T* r_matrix_ptr = r_data + i * r_stride;
using EigenDynamicMatrix =
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
auto x_matrix = Eigen::Map<const EigenDynamicMatrix>(x_matrix_ptr, m, n);
Eigen::HouseholderQR<EigenDynamicMatrix> qr(x_matrix);
if (reduced_mode) {
auto qr_top_matrix = qr.matrixQR().block(0, 0, min_mn, n);
auto r_matrix_view =
qr_top_matrix.template triangularView<Eigen::Upper>();
auto r_matrix = EigenDynamicMatrix(r_matrix_view);
memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T));
} else {
auto r_matrix_view =
qr.matrixQR().template triangularView<Eigen::Upper>();
auto r_matrix = EigenDynamicMatrix(r_matrix_view);
memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T));
}
if (compute_q) {
T* q_matrix_ptr = q_data + i * q_stride;
if (reduced_mode) {
auto q_matrix =
qr.householderQ() * EigenDynamicMatrix::Identity(m, min_mn);
q_matrix.transposeInPlace();
memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T));
} else {
auto q_matrix = qr.householderQ() * EigenDynamicMatrix::Identity(m, m);
q_matrix.transposeInPlace();
memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T));
}
}
}
}
} // namespace phi
PD_REGISTER_KERNEL(qr, CPU, ALL_LAYOUT, phi::QrKernel, float, double) {}
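// NOTE(editor): self-contained sketch (assumed example, not kernel code) that
// checks the reduced-mode shapes produced above with Eigen's HouseholderQR:
// Q is m x min(m, n), R is min(m, n) x n, and Q * R reconstructs the input up
// to rounding error.
#include <Eigen/Dense>
#include <iostream>
int main() {
  using Mat = Eigen::MatrixXd;
  const int m = 4, n = 3;
  Mat a = Mat::Random(m, n);
  Eigen::HouseholderQR<Mat> qr(a);
  Mat q = qr.householderQ() * Mat::Identity(m, n);  // thin Q: 4 x 3
  Mat r = qr.matrixQR().topRows(n).triangularView<Eigen::Upper>();
  std::cout << (q * r - a).norm() << "\n";          // ~0 up to rounding
  return 0;
}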
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/reduce_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/reduce.h"
#include "paddle/phi/kernels/funcs/reduce_functor.h"
namespace phi {
template <typename T, typename Context>
void MeanRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out) {
auto out_dtype = x.dtype();
phi::Reduce<CPUContext, T, phi::funcs::MeanFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
template <typename T, typename Context>
void SumRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DataType out_dtype,
DenseTensor* out) {
phi::Reduce<CPUContext, T, phi::funcs::SumFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
template <typename T, typename Context>
void ProdRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out) {
auto out_dtype = x.dtype();
phi::Reduce<CPUContext, T, phi::funcs::ProdFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
template <typename T, typename Context>
void MaxRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out) {
auto out_dtype = x.dtype();
phi::Reduce<CPUContext, T, phi::funcs::MaxFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
template <typename T, typename Context>
void MinRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out) {
auto out_dtype = x.dtype();
phi::Reduce<CPUContext, T, phi::funcs::MinFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
template <typename T, typename Context>
void AllRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out) {
phi::BoolReduceKernel<CPUContext, T, phi::funcs::AllFunctor>(
dev_ctx, x, dims, keep_dim, reduce_all, out);
}
template <typename T, typename Context>
void AnyRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out) {
phi::BoolReduceKernel<CPUContext, T, phi::funcs::AnyFunctor>(
dev_ctx, x, dims, keep_dim, reduce_all, out);
}
} // namespace phi
using complex64 = ::phi::dtype::complex<float>;
using complex128 = ::phi::dtype::complex<double>;
PD_REGISTER_KERNEL(sum_raw,
CPU,
ALL_LAYOUT,
phi::SumRawKernel,
bool,
float,
double,
phi::dtype::float16,
int16_t,
int,
int64_t,
complex64,
complex128) {
kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
}
PD_REGISTER_KERNEL(
mean_raw, CPU, ALL_LAYOUT, phi::MeanRawKernel, float, double, bool) {}
PD_REGISTER_KERNEL(prod_raw,
CPU,
ALL_LAYOUT,
phi::ProdRawKernel,
float,
double,
int,
int64_t) {}
PD_REGISTER_KERNEL(
max_raw, CPU, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {}
PD_REGISTER_KERNEL(
min_raw, CPU, ALL_LAYOUT, phi::MinRawKernel, float, double, int, int64_t) {}
PD_REGISTER_KERNEL(all_raw, CPU, ALL_LAYOUT, phi::AllRawKernel, bool) {}
PD_REGISTER_KERNEL(any_raw, CPU, ALL_LAYOUT, phi::AnyRawKernel, bool) {}
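// NOTE(editor): assumed illustration of the attribute conventions shared by
// the *_raw reduce kernels above, using sum on x = {{1, 2}, {3, 4}}:
//   dims = {1}, keep_dim = false  ->  {3, 7}      (shape [2])
//   dims = {1}, keep_dim = true   ->  {{3}, {7}}  (shape [2, 1])
//   reduce_all = true             ->  10          (every element reduced)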
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/roi_align_grad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
template <class T>
void bilinear_interpolate_gradient(const int height,
const int width,
T y,
T x,
const T out_grad_this_bin,
const T count,
T* batch_grad_data) {
int x_low, y_low, x_high, y_high;
T w1, w2, w3, w4;
if (y < -1.0 || y > height || x < -1.0 || x > width) {
w1 = w2 = w3 = w4 = 0;
x_low = x_high = y_low = y_high = -1;
return;
}
y = y <= 0 ? 0 : y;
x = x <= 0 ? 0 : x;
y_low = static_cast<int>(y);
x_low = static_cast<int>(x);
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = static_cast<T>(y_low);
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = static_cast<T>(x_low);
} else {
x_high = x_low + 1;
}
T ly = y - y_low, lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
T diff1 = out_grad_this_bin * w1 / count;
T diff2 = out_grad_this_bin * w2 / count;
T diff3 = out_grad_this_bin * w3 / count;
T diff4 = out_grad_this_bin * w4 / count;
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
*(batch_grad_data + y_low * width + x_low) += diff1;
*(batch_grad_data + y_low * width + x_high) += diff2;
*(batch_grad_data + y_high * width + x_low) += diff3;
*(batch_grad_data + y_high * width + x_high) += diff4;
}
}
template <typename T, typename Context>
void RoiAlignGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& boxes,
paddle::optional<const DenseTensor&> boxes_num,
const DenseTensor& out_grad,
int pooled_height,
int pooled_width,
float spatial_scale,
int sampling_ratio,
bool aligned,
DenseTensor* dx) {
auto in_dims = x.dims();
int channels = in_dims[1];
int height = in_dims[2];
int width = in_dims[3];
int rois_num = boxes.dims()[0];
if (!dx) {
return;
}
DenseTensor roi_batch_id_list = Empty<int>(dev_ctx, {rois_num});
int* box_batch_id_data = roi_batch_id_list.data<int>();
int boxes_batch_size;
if (boxes_num) {
boxes_batch_size = boxes_num->numel();
auto* boxes_num_data = boxes_num->data<int>();
int start = 0;
for (int n = 0; n < boxes_batch_size; ++n) {
for (int i = start; i < start + boxes_num_data[n]; ++i) {
box_batch_id_data[i] = n;
}
start += boxes_num_data[n];
}
} else {
auto boxes_lod = boxes.lod().back();
boxes_batch_size = boxes_lod.size() - 1;
for (int n = 0; n < boxes_batch_size; ++n) {
for (std::size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) {
box_batch_id_data[i] = n;
}
}
}
dev_ctx.template Alloc<T>(dx);
phi::funcs::SetConstant<Context, T> set_zero;
set_zero(dev_ctx, dx, static_cast<T>(0));
int output_grad_size = out_grad.numel();
if ((!out_grad.IsInitialized()) || (output_grad_size <= 0)) {
return;
}
const T* boxes_data = boxes.data<T>();
const T* out_grad_data = out_grad.data<T>();
T* dx_data = dev_ctx.template Alloc<T>(dx);
auto in_stride = phi::stride(x.dims());
auto roi_stride = phi::stride(boxes.dims());
auto out_stride = phi::stride(out_grad.dims());
T roi_offset = aligned ? T(0.5) : 0;
for (int n = 0; n < rois_num; ++n) {
int box_batch_idx = box_batch_id_data[n];
T roi_xmin = boxes_data[0] * spatial_scale - roi_offset;
T roi_ymin = boxes_data[1] * spatial_scale - roi_offset;
T roi_xmax = boxes_data[2] * spatial_scale - roi_offset;
T roi_ymax = boxes_data[3] * spatial_scale - roi_offset;
T roi_width = roi_xmax - roi_xmin;
T roi_height = roi_ymax - roi_ymin;
roi_width = std::max(roi_width, static_cast<T>(1.));
roi_height = std::max(roi_height, static_cast<T>(1.));
if (!aligned) {
roi_width = std::max(roi_width, static_cast<T>(1.));
roi_height = std::max(roi_height, static_cast<T>(1.));
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
for (int c = 0; c < channels; ++c) {
T* batch_grad_data =
dx_data + box_batch_idx * in_stride[0] + c * in_stride[1];
const T* batch_out_grad_data =
out_grad_data + n * out_stride[0] + c * out_stride[1];
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int pool_index = ph * pooled_width + pw;
T out_grad_this_bin = batch_out_grad_data[pool_index];
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height);
int roi_bin_grid_w = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_width / pooled_width);
T count = roi_bin_grid_h * roi_bin_grid_w;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
const T y = roi_ymin + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h);
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T x = roi_xmin + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
bilinear_interpolate_gradient(height,
width,
y,
x,
out_grad_this_bin,
count,
batch_grad_data);
}
}
}
}
}
boxes_data += roi_stride[0];
}
}
} // namespace phi
PD_REGISTER_KERNEL(roi_align_grad,
CPU,
ALL_LAYOUT,
phi::RoiAlignGradKernel,
float,
double,
int) {}
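// NOTE(editor): assumed worked example of the box-to-feature-map transform in
// RoiAlignGradKernel: with spatial_scale = 0.25 and aligned = true
// (roi_offset = 0.5), an ROI corner at image coordinate 40.0 maps to feature
// coordinate 40.0 * 0.25 - 0.5 = 9.5; with aligned = false no offset is
// subtracted. Each pooled bin then averages roi_bin_grid_h * roi_bin_grid_w
// bilinear samples, which is why every distributed gradient is divided by
// `count`.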
...@@ -179,7 +179,7 @@ void AvgPool(const std::vector<T>& interpolated_values, ...@@ -179,7 +179,7 @@ void AvgPool(const std::vector<T>& interpolated_values,
} }
template <typename T, typename Context> template <typename T, typename Context>
void ROIAlignKernel(const Context& dev_ctx, void RoiAlignKernel(const Context& dev_ctx,
const DenseTensor& x, const DenseTensor& x,
const DenseTensor& boxes, const DenseTensor& boxes,
paddle::optional<const DenseTensor&> boxes_num, paddle::optional<const DenseTensor&> boxes_num,
...@@ -315,4 +315,4 @@ void ROIAlignKernel(const Context& dev_ctx, ...@@ -315,4 +315,4 @@ void ROIAlignKernel(const Context& dev_ctx,
} // namespace phi } // namespace phi
PD_REGISTER_KERNEL( PD_REGISTER_KERNEL(
roi_align, CPU, ALL_LAYOUT, phi::ROIAlignKernel, float, double, int) {} roi_align, CPU, ALL_LAYOUT, phi::RoiAlignKernel, float, double, int) {}
...@@ -12,28 +12,53 @@ ...@@ -12,28 +12,53 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/phi/kernels/reduce_max_kernel.h" #include "paddle/phi/kernels/roll_grad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/reduce.h" #include "paddle/phi/kernels/cpu/roll_kernel_impl.h"
#include "paddle/phi/kernels/funcs/reduce_functor.h"
namespace phi { namespace phi {
template <typename T, typename Context> template <typename T, typename Context>
void MaxRawKernel(const Context& dev_ctx, void RollGradKernel(const Context& dev_ctx,
const DenseTensor& x, const DenseTensor& x,
const std::vector<int64_t>& dims, const DenseTensor& out_grad,
bool keep_dim, const ScalarArray& shifts,
bool reduce_all, const std::vector<int64_t>& axis,
DenseTensor* out) { DenseTensor* x_grad) {
auto out_dtype = x.dtype(); std::vector<T> out_vec;
phi::Reduce<CPUContext, T, phi::funcs::MaxFunctor>( paddle::framework::TensorToVector(out_grad, dev_ctx, &out_vec);
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
auto shifts_data = shifts.GetData();
size_t nums = shifts_data.size();
DDim input_dim = out_grad.dims();
auto dims = axis;
// axis = none, reshape to 1-D tensor
if (dims.size() == 0) {
dims.push_back(0l);
input_dim = phi::Dim<1>(out_vec.size());
}
for (size_t i = 0; i < nums; i++) {
ShiftAlongDim(out_vec.data(), input_dim, dims[i], 0 - shifts_data[i]);
}
dev_ctx.template Alloc<T>(x_grad);
paddle::framework::TensorFromVector(out_vec, dev_ctx, x_grad);
x_grad->Resize(out_grad.dims());
} }
} // namespace phi } // namespace phi
PD_REGISTER_KERNEL( PD_REGISTER_KERNEL(roll_grad,
max_raw, CPU, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {} CPU,
ALL_LAYOUT,
phi::RollGradKernel,
float,
double,
int,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/roll_kernel.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/common/complex.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/roll_kernel_impl.h"
namespace phi {
template <typename T, typename Context>
void RollKernel(const Context& dev_ctx,
const DenseTensor& x,
const ScalarArray& shifts,
const std::vector<int64_t>& axis,
DenseTensor* out) {
std::vector<T> out_vec;
paddle::framework::TensorToVector(x, dev_ctx, &out_vec);
auto shifts_data = shifts.GetData();
size_t nums = shifts_data.size();
DDim input_dim = x.dims();
auto dims = axis;
// axis = none, reshape to 1-D tensor
if (dims.size() == 0) {
dims.push_back(0l);
input_dim = phi::Dim<1>(out_vec.size());
}
for (size_t i = 0; i < nums; i++) {
PADDLE_ENFORCE_EQ(
dims[i] < input_dim.size() && dims[i] >= (0 - input_dim.size()),
true,
phi::errors::OutOfRange(
"Attr(axis[%d]) is out of range, It's expected "
"to be in range of [-%d, %d]. But received Attr(axis[%d]) = %d.",
i,
input_dim.size(),
input_dim.size() - 1,
i,
dims[i]));
ShiftAlongDim(out_vec.data(), input_dim, dims[i], shifts_data[i]);
}
dev_ctx.template Alloc<T>(out);
paddle::framework::TensorFromVector(out_vec, dev_ctx, out);
out->Resize(x.dims());
}
} // namespace phi
PD_REGISTER_KERNEL(roll,
CPU,
ALL_LAYOUT,
phi::RollKernel,
float,
double,
int,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
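// NOTE(editor): minimal sketch (assumed helper, not kernel code) of the 1-D
// behaviour the kernel above delegates to ShiftAlongDim: a positive shift
// moves elements toward higher indices and wraps them around, e.g.
// NaiveRoll1D({1, 2, 3, 4, 5}, 2) returns {4, 5, 1, 2, 3}.
#include <algorithm>
#include <cstdint>
#include <vector>
inline std::vector<int> NaiveRoll1D(std::vector<int> v, int64_t shift) {
  const int64_t n = static_cast<int64_t>(v.size());
  if (n == 0) return v;
  shift = ((shift % n) + n) % n;  // normalize; also handles negative shifts
  std::rotate(v.begin(), v.end() - shift, v.end());
  return v;
}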
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -13,21 +13,16 @@ ...@@ -13,21 +13,16 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include <memory>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle { #include "paddle/phi/common/scalar_array.h"
namespace operators { #include "paddle/phi/core/dense_tensor.h"
using Tensor = framework::Tensor; namespace phi {
using LoDTensor = framework::LoDTensor;
using DDim = framework::DDim;
template <typename T> template <typename T>
inline void shift_along_dim(T* data, const DDim& input_dim, int64_t dim, inline void ShiftAlongDim(T* data,
const DDim& input_dim,
int64_t dim,
int64_t shift) { int64_t shift) {
if (dim < 0) { if (dim < 0) {
dim += input_dim.size(); dim += input_dim.size();
...@@ -78,92 +73,4 @@ inline void shift_along_dim(T* data, const DDim& input_dim, int64_t dim, ...@@ -78,92 +73,4 @@ inline void shift_along_dim(T* data, const DDim& input_dim, int64_t dim,
} }
} }
template <typename DeviceContext, typename T> } // namespace phi
class RollKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* input_var = context.InputVar("X");
auto* output_var = context.OutputVar("Out");
auto& input = input_var->Get<LoDTensor>();
auto* output = output_var->GetMutable<LoDTensor>();
std::vector<int64_t> shifts = context.Attr<std::vector<int64_t>>("shifts");
if (context.HasInput("ShiftsTensor")) {
const auto* shifts_tensor =
context.Input<framework::Tensor>("ShiftsTensor");
PADDLE_ENFORCE_EQ(
shifts_tensor->dims().size(), 1,
platform::errors::InvalidArgument(
"The rank of ShiftsTensor is expected to be 1, got %s",
shifts_tensor->dims().size()));
shifts = GetDataFromTensor<int64_t>(shifts_tensor);
}
std::vector<int64_t> dims = context.Attr<std::vector<int64_t>>("axis");
std::vector<T> out_vec;
paddle::framework::TensorToVector(input, context.device_context(),
&out_vec);
size_t nums = shifts.size();
DDim input_dim = input.dims();
// axis = none, reshape to 1-D tensor
if (dims.size() == 0) {
dims.push_back(0l);
input_dim = framework::Dim<1>(out_vec.size());
}
for (size_t i = 0; i < nums; i++) {
PADDLE_ENFORCE_EQ(
dims[i] < input_dim.size() && dims[i] >= (0 - input_dim.size()), true,
platform::errors::OutOfRange(
"Attr(axis[%d]) is out of range, It's expected "
"to be in range of [-%d, %d]. But received Attr(axis[%d]) = %d.",
i, input_dim.size(), input_dim.size() - 1, i, dims[i]));
shift_along_dim(out_vec.data(), input_dim, dims[i], shifts[i]);
}
output->mutable_data<T>(context.GetPlace());
framework::TensorFromVector(out_vec, context.device_context(), output);
output->Resize(input.dims());
}
};
template <typename DeviceContext, typename T>
class RollGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* input_var = context.InputVar(framework::GradVarName("Out"));
auto* output_var = context.OutputVar(framework::GradVarName("X"));
auto& input = input_var->Get<LoDTensor>();
auto* output = output_var->GetMutable<LoDTensor>();
std::vector<int64_t> shifts = context.Attr<std::vector<int64_t>>("shifts");
if (context.HasInput("ShiftsTensor")) {
const auto* shifts_tensor =
context.Input<framework::Tensor>("ShiftsTensor");
shifts = GetDataFromTensor<int64_t>(shifts_tensor);
}
std::vector<int64_t> dims = context.Attr<std::vector<int64_t>>("axis");
std::vector<T> out_vec;
paddle::framework::TensorToVector(input, context.device_context(),
&out_vec);
size_t nums = shifts.size();
DDim input_dim = input.dims();
// axis = none, reshape to 1-D tensor
if (dims.size() == 0) {
dims.push_back(0l);
input_dim = framework::Dim<1>(out_vec.size());
}
for (size_t i = 0; i < nums; i++) {
shift_along_dim(out_vec.data(), input_dim, dims[i], 0 - shifts[i]);
}
output->mutable_data<T>(context.GetPlace());
framework::TensorFromVector(out_vec, context.device_context(), output);
output->Resize(input.dims());
}
};
} // namespace operators
} // namespace paddle
...@@ -12,26 +12,18 @@ ...@@ -12,26 +12,18 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/phi/kernels/reduce_any_kernel.h" #include "paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h"
#include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/reduce.h"
#include "paddle/phi/kernels/funcs/reduce_functor.h"
namespace phi { PD_REGISTER_KERNEL(tril_triu_grad,
CPU,
template <typename T, typename Context> ALL_LAYOUT,
void AnyRawKernel(const Context& dev_ctx, phi::TrilTriuGradKernel,
const DenseTensor& x, bool,
const std::vector<int64_t>& dims, float,
bool keep_dim, double,
bool reduce_all, int,
DenseTensor* out) { int64_t,
phi::BoolReduceKernel<CPUContext, T, phi::funcs::AnyFunctor>( phi::dtype::float16) {}
dev_ctx, x, dims, keep_dim, reduce_all, out);
}
} // namespace phi
PD_REGISTER_KERNEL(any_raw, CPU, ALL_LAYOUT, phi::AnyRawKernel, bool) {}
...@@ -12,26 +12,18 @@ ...@@ -12,26 +12,18 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/phi/kernels/reduce_all_kernel.h" #include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h"
#include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/reduce.h"
#include "paddle/phi/kernels/funcs/reduce_functor.h"
namespace phi { PD_REGISTER_KERNEL(tril_triu,
CPU,
template <typename T, typename Context> ALL_LAYOUT,
void AllRawKernel(const Context& dev_ctx, phi::TrilTriuKernel,
const DenseTensor& x, bool,
const std::vector<int64_t>& dims, float,
bool keep_dim, double,
bool reduce_all, int,
DenseTensor* out) { int64_t,
phi::BoolReduceKernel<CPUContext, T, phi::funcs::AllFunctor>( phi::dtype::float16) {}
dev_ctx, x, dims, keep_dim, reduce_all, out);
}
} // namespace phi
PD_REGISTER_KERNEL(all_raw, CPU, ALL_LAYOUT, phi::AllRawKernel, bool) {}
...@@ -29,11 +29,17 @@ ...@@ -29,11 +29,17 @@
#include <type_traits> #include <type_traits>
#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h" #include "paddle/phi/common/float16.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/extensions.h"
#ifdef PADDLE_WITH_XPU_KP
#define __forceinline__ __inline__
#endif
namespace phi { namespace phi {
namespace funcs { namespace funcs {
...@@ -776,6 +782,236 @@ struct ThresholdedReluGradFunctor : public BaseActivationFunctor<T> { ...@@ -776,6 +782,236 @@ struct ThresholdedReluGradFunctor : public BaseActivationFunctor<T> {
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
}; };
// tanhshrink(x) = x - tanh(x)
// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
template <typename T>
struct TanhShrinkFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x - x.tanh();
}
};
template <typename T>
struct TanhShrinkGradFunctor : public BaseActivationFunctor<T> {
template <typename Device,
typename X,
typename Out,
typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * (x.tanh() * x.tanh());
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// hardshrink(x) = x, if x > threshold or x < -threshold
//                 0, otherwise
template <typename T>
struct HardShrinkFunctor : public BaseActivationFunctor<T> {
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
auto temp1 = x < static_cast<T>(threshold * -1.f);
auto temp2 = x > static_cast<T>(threshold);
out.device(d) = x * (temp1 || temp2).template cast<T>();
}
};
template <typename T>
struct HardShrinkGradFunctor : public BaseActivationFunctor<T> {
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
template <typename Device,
typename X,
typename Out,
typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto temp1 = x < static_cast<T>(threshold * -1.f);
auto temp2 = x > static_cast<T>(threshold);
dx.device(d) = dout * (temp1 || temp2).template cast<T>();
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0
// otherwise
template <typename T>
struct SoftShrinkFunctor : public BaseActivationFunctor<T> {
float lambda;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"lambda", &lambda}};
}
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
auto lambdaT = static_cast<T>(lambda);
auto temp1 = (x > lambdaT).template cast<T>();
auto temp2 = (x < -lambdaT).template cast<T>();
out.device(d) = temp1 * (x - lambdaT) + temp2 * (x + lambdaT);
}
};
template <typename T>
struct SoftShrinkGradFunctor : public BaseActivationFunctor<T> {
float lambda;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"lambda", &lambda}};
}
template <typename Device,
typename X,
typename Out,
typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto lambdaT = static_cast<T>(lambda);
auto temp1 = (x > lambdaT).template cast<T>();
auto temp2 = (x < -lambdaT).template cast<T>();
dx.device(d) = dout * (temp1 + temp2).template cast<T>();
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct ELUFunctor : public BaseActivationFunctor<T> {
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) =
(x < static_cast<T>(0))
.select(static_cast<T>(alpha) * (x.exp() - static_cast<T>(1)), x);
}
};
template <typename T>
struct ELUGradFunctor : public BaseActivationFunctor<T> {
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
template <typename Device,
typename X,
typename Out,
typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
// case 1: alpha >= 0
// dx = dout, if out > 0
// dx = dout * (out + alpha), if out <= 0
dx.device(d) = (out > static_cast<T>(0))
.select(dout, dout * (out + static_cast<T>(alpha)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
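// NOTE(editor): why (out + alpha) appears above: for x <= 0 and alpha >= 0,
// out = alpha * (e^x - 1), hence d(out)/dx = alpha * e^x = out + alpha.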
template <typename T>
struct ELUGradNegativeAlphaFunctor : public BaseActivationFunctor<T> {
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
template <typename Device,
typename X,
typename Out,
typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
// case 2: alpha < 0
// dx = dout, if x > 0
// dx = dout * (out + alpha), if x <=0
dx.device(d) = (x > static_cast<T>(0))
.select(dout, dout * static_cast<T>(alpha) * x.exp());
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct ELUGradGradFunctor : public BaseActivationFunctor<T> {
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
template <typename Device>
void operator()(const Device& dev,
const DenseTensor* X,
const DenseTensor* ddX,
DenseTensor* ddOut,
const DenseTensor* dOut,
DenseTensor* dX) const {
auto* d = dev.eigen_device();
auto ddx = EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddX, "Input", "DDX", "ELUGradGrad"));
auto x = EigenVector<T>::Flatten(
GET_DATA_SAFELY(X, "Input", "X", "ELUGradGrad"));
if (dX) {
auto dx = EigenVector<T>::Flatten(
GET_DATA_SAFELY(dX, "Output", "DX", "ELUGradGrad"));
auto dout = EigenVector<T>::Flatten(
GET_DATA_SAFELY(dOut, "Output", "DOut", "ELUGradGrad"));
dx.device(*d) = ddx * dout * static_cast<T>(alpha) * x.exp() *
(x <= static_cast<T>(0)).template cast<T>();
}
if (ddOut) {
auto ddout = EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ELUGradGrad"));
ddout.device(*d) = ddx *
((x > static_cast<T>(0)).template cast<T>() +
static_cast<T>(alpha) * x.exp() *
(x <= static_cast<T>(0)).template cast<T>())
.template cast<T>();
}
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// silu(x) = x / (1 + exp(-x))
template <typename T>
struct SiluFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
auto temp = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp());
out.device(d) = x * temp;
}
};
// silu'(x) = (1 / (1 + e^{-x})) * (1 + out * e^{-x})
template <typename T>
struct SiluGradFunctor : public BaseActivationFunctor<T> {
template <typename Device,
typename X,
typename Out,
typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto temp1 = static_cast<T>(1) + (-x).exp(); // 1+e^(-x)
auto temp2 = x * (-x).exp(); // x*e^(-x)
dx.device(d) = dout * ((static_cast<T>(1) / temp1) *
(static_cast<T>(1) + (temp2 / temp1)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
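// NOTE(editor): derivation sketch for the two silu functors above: with
// s(x) = 1 / (1 + e^(-x)) = 1 / temp1,
//   silu(x)  = x * s(x)
//   silu'(x) = s(x) + x * s(x) * (1 - s(x))
//            = (1 / temp1) * (1 + temp2 / temp1),   temp2 = x * e^(-x),
// which is exactly the expression assigned to dx in SiluGradFunctor.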
#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) #if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__)
template <typename T> template <typename T>
struct CudaReluFunctor : public BaseActivationFunctor<T> { struct CudaReluFunctor : public BaseActivationFunctor<T> {
...@@ -1214,6 +1450,209 @@ struct CudaLeakyReluGradFunctor : public BaseActivationFunctor<T> { ...@@ -1214,6 +1450,209 @@ struct CudaLeakyReluGradFunctor : public BaseActivationFunctor<T> {
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
}; };
template <typename T>
struct CudaSoftShrinkFunctor : public BaseActivationFunctor<T> {
float lambda;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"lambda", &lambda}};
}
// softshrink(x) = x - lambda, if x > lambda;
// x + lambda, if x < -lambda;
// 0, otherwise.
__device__ __forceinline__ T operator()(const T x) const {
T l = static_cast<T>(lambda);
T temp1 = static_cast<T>(x > l);
T temp2 = static_cast<T>(x < -l);
return temp1 * (x - l) + temp2 * (x + l);
}
};
template <typename T>
struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float lambda;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"lambda", &lambda}};
}
// dx = dout, if x > lambda or x < -lambda else 0
__device__ __forceinline__ T operator()(const T dout, const T x) const {
T l = static_cast<T>(lambda);
return (x >= -l && x <= l) ? zero : dout;
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaTanhShrinkFunctor : public BaseActivationFunctor<T> {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
// tanhshrink(x) = x - tanh(x)
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(x - tanh(x));
}
};
template <typename T>
struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor<T> {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
// dx = dout * tanh(x)^2
__device__ __forceinline__ T operator()(const T arg_dout,
const T arg_x) const {
MPType dout = static_cast<MPType>(arg_dout);
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(dout * tanh(x) * tanh(x));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaHardShrinkFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
  // hardshrink(x) = (x > -threshold && x < threshold) ? 0 : x
__device__ __forceinline__ T operator()(const T x) const {
T t = static_cast<T>(threshold);
return (x > -t && x < t) ? zero : x;
}
};
template <typename T>
struct CudaHardShrinkGradFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
// dx = (x > -threshold && x < threshold) ? 0 : dout
__device__ __forceinline__ T operator()(const T dout, const T x) const {
T t = static_cast<T>(threshold);
return (x > -t && x < t) ? zero : dout;
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaELUFunctor : public BaseActivationFunctor<T> {
using CT = typename phi::dtype::MPTypeTrait<T>::Type;
CT zero = static_cast<CT>(0.0f);
CT one = static_cast<CT>(1.0f);
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
// elu(x) = x, if x > 0
// elu(x) = alpha * (e^x - 1), if x <= 0
__device__ __forceinline__ T operator()(const T arg_x) const {
CT x = static_cast<CT>(arg_x);
CT temp = static_cast<CT>(alpha) * (exp(x) - one);
CT res = x > zero ? x : temp;
return static_cast<T>(res);
}
};
template <typename T>
struct CudaELUGradFunctor : public BaseActivationFunctor<T> {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
MPType zero = static_cast<MPType>(0.0f);
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
// case 1: alpha >= 0
// dx = dout, if out > 0
// dx = dout * (out + alpha), if out <= 0
__device__ __forceinline__ T operator()(T arg_dout, T arg_out) const {
MPType dout = static_cast<MPType>(arg_dout);
MPType out = static_cast<MPType>(arg_out);
MPType a = static_cast<MPType>(alpha);
MPType out_pos = static_cast<MPType>(out > zero);
MPType out_neg = static_cast<MPType>(out <= zero);
return static_cast<T>(dout * (out_pos + out_neg * (out + a)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
template <typename T>
struct CudaELUGradNegativeAlphaFunctor : public BaseActivationFunctor<T> {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
MPType zero = static_cast<MPType>(0.0f);
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
// case 2: alpha < 0
// dx = dout, if x > 0
// dx = dout * (out + alpha), if x <=0
__device__ __forceinline__ T operator()(const T arg_dout,
const T arg_out,
const T arg_x) const {
MPType dout = static_cast<MPType>(arg_dout);
MPType out = static_cast<MPType>(arg_out);
MPType x = static_cast<MPType>(arg_x);
MPType a = static_cast<MPType>(alpha);
MPType x_pos = static_cast<MPType>(x > zero);
MPType x_neg = static_cast<MPType>(x <= zero);
return static_cast<T>(dout * (x_pos + x_neg * (out + a)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaSiluFunctor : public BaseActivationFunctor<T> {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
MPType one = static_cast<MPType>(1.0f);
// silu(x) = x / (1 + exp(-x))
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(x / (one + exp(-x)));
}
};
template <typename T>
struct CudaSiluGradFunctor : public BaseActivationFunctor<T> {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
MPType one = static_cast<MPType>(1.0f);
  // dx = dout * (1 + exp(-x) + x * exp(-x)) / (1 + exp(-x))^2
__device__ __forceinline__ T operator()(const T arg_dout,
const T arg_x) const {
MPType dout = static_cast<MPType>(arg_dout);
MPType x = static_cast<MPType>(arg_x);
MPType temp = one / (one + exp(-x));
return static_cast<T>(dout * (temp * (one + x * (one - temp))));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
#endif #endif
} // namespace funcs } // namespace funcs
......
...@@ -93,7 +93,7 @@ inline HOSTDEVICE void IndexToPoint( ...@@ -93,7 +93,7 @@ inline HOSTDEVICE void IndexToPoint(
} }
inline void GetOutShape(const DDim& x_dims, inline void GetOutShape(const DDim& x_dims,
const DDim& kernel_dims, const std::vector<int>& kernel_sizes,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const std::vector<int>& dilations, const std::vector<int>& dilations,
const std::vector<int>& strides, const std::vector<int>& strides,
...@@ -102,17 +102,17 @@ inline void GetOutShape(const DDim& x_dims, ...@@ -102,17 +102,17 @@ inline void GetOutShape(const DDim& x_dims,
x_dims.size(), x_dims.size(),
5, 5,
phi::errors::InvalidArgument("the shape of x should be (N, D, H, W, C)")); phi::errors::InvalidArgument("the shape of x should be (N, D, H, W, C)"));
PADDLE_ENFORCE_EQ(kernel_dims.size(), PADDLE_ENFORCE_EQ(kernel_sizes.size(),
5, 5,
phi::errors::InvalidArgument( phi::errors::InvalidArgument(
"the shape of kernel should be (D, H, W, C, OC)")); "the shape of kernel should be (D, H, W, C, OC)"));
// infer out shape // infer out shape
(*out_dims)[0] = x_dims[0]; (*out_dims)[0] = x_dims[0];
(*out_dims)[4] = kernel_dims[4]; (*out_dims)[4] = kernel_sizes[4];
for (int i = 1; i < 4; i++) { for (int i = 1; i < 4; i++) {
(*out_dims)[i] = (x_dims[i] + 2 * paddings[i - 1] - (*out_dims)[i] = (x_dims[i] + 2 * paddings[i - 1] -
dilations[i - 1] * (kernel_dims[i - 1] - 1) - 1) / dilations[i - 1] * (kernel_sizes[i - 1] - 1) - 1) /
strides[i - 1] + strides[i - 1] +
1; 1;
} }
...@@ -131,7 +131,7 @@ template <typename T, typename Context> ...@@ -131,7 +131,7 @@ template <typename T, typename Context>
inline void SubmPreProcess(const Context& dev_ctx, inline void SubmPreProcess(const Context& dev_ctx,
const SparseCooTensor& x, const SparseCooTensor& x,
const DenseTensor& kernel, const DenseTensor& kernel,
const SparseCooTensor& out_grad, const DenseTensor& out_grad,
const int in_channels, const int in_channels,
const int out_channels, const int out_channels,
const int half_kernel_size, const int half_kernel_size,
...@@ -142,11 +142,11 @@ inline void SubmPreProcess(const Context& dev_ctx, ...@@ -142,11 +142,11 @@ inline void SubmPreProcess(const Context& dev_ctx,
blas.GEMM(CblasTrans, blas.GEMM(CblasTrans,
CblasNoTrans, CblasNoTrans,
x.non_zero_elements().dims()[1], x.non_zero_elements().dims()[1],
out_grad.non_zero_elements().dims()[1], out_grad.dims()[1],
x.non_zero_elements().dims()[0], x.non_zero_elements().dims()[0],
static_cast<T>(1), static_cast<T>(1),
x.non_zero_elements().data<T>(), x.non_zero_elements().data<T>(),
out_grad.non_zero_elements().data<T>(), out_grad.data<T>(),
static_cast<T>(0), static_cast<T>(0),
d_kernel_ptr + half_kernel_size * in_channels * out_channels); d_kernel_ptr + half_kernel_size * in_channels * out_channels);
...@@ -155,11 +155,11 @@ inline void SubmPreProcess(const Context& dev_ctx, ...@@ -155,11 +155,11 @@ inline void SubmPreProcess(const Context& dev_ctx,
T* x_grad_ptr = x_grad->data<T>(); T* x_grad_ptr = x_grad->data<T>();
blas.GEMM(CblasNoTrans, blas.GEMM(CblasNoTrans,
CblasTrans, CblasTrans,
out_grad.non_zero_elements().dims()[0], out_grad.dims()[0],
in_channels, in_channels,
out_grad.non_zero_elements().dims()[1], out_grad.dims()[1],
static_cast<T>(1), static_cast<T>(1),
out_grad.non_zero_elements().data<T>(), out_grad.data<T>(),
kernel.data<T>() + half_kernel_size * in_channels * out_channels, kernel.data<T>() + half_kernel_size * in_channels * out_channels,
static_cast<T>(0), static_cast<T>(0),
x_grad_ptr); x_grad_ptr);
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
namespace phi {
namespace funcs {
template <typename T>
class TrilTriuCompute {
public:
HOSTDEVICE TrilTriuCompute(const T* in,
const int diagonal,
const bool lower,
const int64_t H,
const int64_t W,
T* out)
: in_(in), diagonal_(diagonal), lower_(lower), H_(H), W_(W), out_(out) {}
HOSTDEVICE void operator()(int64_t idx) {
const int64_t row = (idx / W_) % H_;
const int64_t col = idx % W_;
const bool mask =
lower_ ? (col - row > diagonal_) : (col - row < diagonal_);
out_[idx] = mask ? static_cast<T>(0) : in_[idx];
}
private:
const T* in_;
const int diagonal_;
const bool lower_;
const int64_t H_;
const int64_t W_;
T* out_;
};
} // namespace funcs
} // namespace phi
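To illustrate the masking predicate used by TrilTriuCompute, here is a minimal standalone C++ sketch; it mirrors the `lower_ ? (col - row > diagonal_) : (col - row < diagonal_)` test instead of including the phi headers, so the helper name tril_mask is illustrative only.

#include <cstdio>

// Mirrors TrilTriuCompute: an element is zeroed when the predicate is true.
static bool tril_mask(int row, int col, int diagonal, bool lower) {
  return lower ? (col - row > diagonal) : (col - row < diagonal);
}

int main() {
  const int H = 3, W = 4, diagonal = 0;
  for (int row = 0; row < H; ++row) {
    for (int col = 0; col < W; ++col) {
      // Print 1 for elements kept by tril (lower = true), 0 for zeroed ones.
      std::printf("%d ", tril_mask(row, col, diagonal, true) ? 0 : 1);
    }
    std::printf("\n");
  }
  return 0;
}

Running it prints the expected lower-triangular pattern of ones (1 0 0 0 / 1 1 0 0 / 1 1 1 0); flipping lower to false gives the corresponding upper-triangular mask (the diagonal is kept in both cases).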
...@@ -73,7 +73,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx, ...@@ -73,7 +73,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx,
} }
} }
#define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(name, functor_class) \ #define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(name, functor_class) \
template <typename T, typename Context> \ template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \ void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& x, \ const DenseTensor& x, \
...@@ -84,7 +84,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx, ...@@ -84,7 +84,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx,
dev_ctx, &x, nullptr, &dout, dx, functor); \ dev_ctx, &x, nullptr, &dout, dx, functor); \
} }
#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX( \ #define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX( \
name, functor_class, attr) \ name, functor_class, attr) \
template <typename T, typename Context> \ template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \ void name##GradKernel(const Context& dev_ctx, \
...@@ -99,7 +99,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx, ...@@ -99,7 +99,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx,
dev_ctx, &x, nullptr, &dout, dx, functor); \ dev_ctx, &x, nullptr, &dout, dx, functor); \
} }
#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX( \ #define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX( \
name, functor_class, attr1, attr2) \ name, functor_class, attr1, attr2) \
template <typename T, typename Context> \ template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \ void name##GradKernel(const Context& dev_ctx, \
...@@ -116,7 +116,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx, ...@@ -116,7 +116,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx,
dev_ctx, &x, nullptr, &dout, dx, functor); \ dev_ctx, &x, nullptr, &dout, dx, functor); \
} }
#define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(name, functor_class) \ #define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(name, functor_class) \
template <typename T, typename Context> \ template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \ void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& out, \ const DenseTensor& out, \
...@@ -127,7 +127,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx, ...@@ -127,7 +127,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx,
dev_ctx, nullptr, &out, &dout, dx, functor); \ dev_ctx, nullptr, &out, &dout, dx, functor); \
} }
#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepOut( \ #define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT( \
name, functor_class, attr) \ name, functor_class, attr) \
template <typename T, typename Context> \ template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \ void name##GradKernel(const Context& dev_ctx, \
...@@ -142,32 +142,62 @@ void ActivationGradGPUImpl(const Context& dev_ctx, ...@@ -142,32 +142,62 @@ void ActivationGradGPUImpl(const Context& dev_ctx,
dev_ctx, nullptr, &out, &dout, dx, functor); \ dev_ctx, nullptr, &out, &dout, dx, functor); \
} }
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, CudaReluGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, CudaReluGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(Tanh, CudaTanhGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, CudaTanhGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, CudaCosGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Cos, CudaCosGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, CudaTanGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Tan, CudaTanGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, CudaAcosGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Acos, CudaAcosGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, CudaSinGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Sin, CudaSinGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, CudaAsinGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Asin, CudaAsinGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, CudaAtanGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Atan, CudaAtanGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, CudaSinhGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Sinh, CudaSinhGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, CudaCoshGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Cosh, CudaCoshGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, CudaAsinhGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Asinh, CudaAsinhGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, CudaAcoshGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Acosh, CudaAcoshGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, CudaAtanhGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, CudaAtanhGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, CudaTanhShrinkGradFunctor);
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu, DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, CudaSiluGradFunctor);
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
CudaLeakyReluGradFunctor, CudaLeakyReluGradFunctor,
alpha); alpha);
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(ThresholdedRelu, DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu,
CudaThresholdedReluGradFunctor, CudaThresholdedReluGradFunctor,
threshold); threshold);
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink,
CudaSoftShrinkGradFunctor,
lambda);
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink,
CudaHardShrinkGradFunctor,
threshold);
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(BRelu, DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu,
CudaBReluGradFunctor, CudaBReluGradFunctor,
t_min, t_min,
t_max); t_max);
template <typename T, typename Context>
void EluGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& out,
const DenseTensor& dout,
float alpha,
DenseTensor* dx) {
dev_ctx.template Alloc<T>(dx);
std::vector<const DenseTensor*> ins = {&dout, &out};
std::vector<DenseTensor*> outs = {dx};
if (alpha > 0) {
funcs::CudaELUGradFunctor<T> functor;
functor.alpha = alpha;
funcs::ElementwiseKernel<T>(dev_ctx, ins, &outs, functor);
} else {
funcs::CudaELUGradNegativeAlphaFunctor<T> functor;
functor.alpha = alpha;
ins.push_back(&x);
funcs::ElementwiseKernel<T>(dev_ctx, ins, &outs, functor);
}
}
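A note on the alpha branch above (my reading of the two functors, not stated in the source): when alpha > 0, the gradient can be computed from out alone, since out > 0 exactly when x > 0 and, for x <= 0, out + alpha = alpha * exp(x) is the required local derivative; when alpha < 0 the sign of out no longer determines the sign of x, so the negative-alpha path pushes x as an extra input and branches on it directly.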
} // namespace phi } // namespace phi
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
...@@ -234,3 +264,9 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_double_grad, ...@@ -234,3 +264,9 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_double_grad,
LeakyReluDoubleGradKernel) LeakyReluDoubleGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad, PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad,
ThresholdedReluGradKernel) ThresholdedReluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(soft_shrink_grad, SoftShrinkGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_shrink_grad, HardShrinkGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_shrink_grad, TanhShrinkGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(silu_grad, SiluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_double_grad, EluDoubleGradKernel)
...@@ -42,8 +42,9 @@ void ActivationGPUImpl(const Context& dev_ctx, ...@@ -42,8 +42,9 @@ void ActivationGPUImpl(const Context& dev_ctx,
template <typename T, typename Context> \ template <typename T, typename Context> \
void name##Kernel( \ void name##Kernel( \
const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { \ const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { \
functor_class functor; \ funcs::functor_class<T> functor; \
ActivationGPUImpl<T, Context, functor_class>(dev_ctx, x, out, functor); \ ActivationGPUImpl<T, Context, funcs::functor_class<T>>( \
dev_ctx, x, out, functor); \
} }
#define DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr) \ #define DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr) \
...@@ -75,24 +76,31 @@ void ActivationGPUImpl(const Context& dev_ctx, ...@@ -75,24 +76,31 @@ void ActivationGPUImpl(const Context& dev_ctx,
dev_ctx, x, out, functor); \ dev_ctx, x, out, functor); \
} }
DEFINE_GPU_ACTIVATION_KERNEL(Cos, funcs::CudaCosFunctor<T>) DEFINE_GPU_ACTIVATION_KERNEL(Cos, CudaCosFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Tan, funcs::CudaTanFunctor<T>) DEFINE_GPU_ACTIVATION_KERNEL(Tan, CudaTanFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Acos, funcs::CudaAcosFunctor<T>) DEFINE_GPU_ACTIVATION_KERNEL(Acos, CudaAcosFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Sin, funcs::CudaSinFunctor<T>) DEFINE_GPU_ACTIVATION_KERNEL(Sin, CudaSinFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Asin, funcs::CudaAsinFunctor<T>) DEFINE_GPU_ACTIVATION_KERNEL(Asin, CudaAsinFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Atan, funcs::CudaAtanFunctor<T>) DEFINE_GPU_ACTIVATION_KERNEL(Atan, CudaAtanFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Sinh, funcs::CudaSinhFunctor<T>) DEFINE_GPU_ACTIVATION_KERNEL(Sinh, CudaSinhFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Cosh, funcs::CudaCoshFunctor<T>) DEFINE_GPU_ACTIVATION_KERNEL(Cosh, CudaCoshFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Asinh, funcs::CudaAsinhFunctor<T>) DEFINE_GPU_ACTIVATION_KERNEL(Asinh, CudaAsinhFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Acosh, funcs::CudaAcoshFunctor<T>) DEFINE_GPU_ACTIVATION_KERNEL(Acosh, CudaAcoshFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Atanh, funcs::CudaAtanhFunctor<T>) DEFINE_GPU_ACTIVATION_KERNEL(Atanh, CudaAtanhFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Relu, funcs::CudaReluFunctor<T>) DEFINE_GPU_ACTIVATION_KERNEL(Relu, CudaReluFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Tanh, funcs::CudaTanhFunctor<T>) DEFINE_GPU_ACTIVATION_KERNEL(Tanh, CudaTanhFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(TanhShrink, CudaTanhShrinkFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Silu, CudaSiluFunctor)
DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, CudaLeakyReluFunctor, alpha) DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, CudaLeakyReluFunctor, alpha)
DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu,
CudaThresholdedReluFunctor, CudaThresholdedReluFunctor,
threshold) threshold)
DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(HardShrink,
CudaHardShrinkFunctor,
threshold)
DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, CudaSoftShrinkFunctor, lambda)
DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, CudaELUFunctor, alpha)
DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, CudaBReluFunctor, t_min, t_max) DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, CudaBReluFunctor, t_min, t_max)
...@@ -142,3 +150,8 @@ PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel) ...@@ -142,3 +150,8 @@ PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel)
PD_REGISTER_ACTIVATION_KERNEL(brelu, BReluKernel) PD_REGISTER_ACTIVATION_KERNEL(brelu, BReluKernel)
PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel) PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel)
PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel) PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel)
PD_REGISTER_ACTIVATION_KERNEL(hard_shrink, HardShrinkKernel)
PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel)
PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel)
PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel)
PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/grid_sample_grad_kernel.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/gpu/grid_sample_utils.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
namespace phi {
template <typename T>
static __forceinline__ __device__ void AtomicAdd(
T* data, int h, int w, int sH, int sW, int H, int W, T delta) {
if (InBounds(h, w, H, W)) {
paddle::platform::CudaAtomicAdd(data + h * sH + w * sW, delta);
}
}
template <typename T>
static __forceinline__ __device__ T
UnnormalizeWithMask(T coord, int size, bool align_corners, T* grad_in) {
if (align_corners) {
*grad_in = static_cast<T>(size - 1) / 2;
return ((coord + 1.f) / 2) * (size - 1);
} else {
*grad_in = static_cast<T>(size) / 2;
return ((coord + 1.f) * size - 1) / 2;
}
}
template <typename T>
static __forceinline__ __device__ T ClipIndexesWithMask(T in,
int clip_limit,
T* grad_in) {
if (in <= static_cast<T>(0)) {
*grad_in = static_cast<T>(0);
return static_cast<T>(0);
} else {
T max = static_cast<T>(clip_limit - 1);
if (in >= max) {
*grad_in = static_cast<T>(0);
return max;
} else {
*grad_in = static_cast<T>(1);
return in;
}
}
}
template <typename T>
static __forceinline__ __device__ T
ReflectIndexesWithMask(T in, int twice_low, int twice_high, T* grad_in) {
if (twice_low == twice_high) {
*grad_in = static_cast<T>(0);
return static_cast<T>(0);
}
int grad_in_mult_;
T min = static_cast<T>(twice_low) / 2;
T span = static_cast<T>(twice_high - twice_low) / 2;
in = in - min;
if (in < static_cast<T>(0)) {
grad_in_mult_ = -1;
in = -in;
} else {
grad_in_mult_ = 1;
}
T extra = fmod(in, span);
int flips = static_cast<int>(floor(in / span));
if (flips % 2 == 0) {
*grad_in = static_cast<T>(grad_in_mult_);
return extra + min;
} else {
*grad_in = static_cast<T>(-grad_in_mult_);
return span - extra + min;
}
}
template <typename T>
static __forceinline__ __device__ T
ComputePositionsWithMask(T coord,
int size,
PaddingMode padding_mode,
bool align_corners,
T* grad_in) {
T grad_clip, grad_refl;
coord = UnnormalizeWithMask<T>(coord, size, align_corners, grad_in);
if (padding_mode == PaddingMode::border) {
coord = ClipIndexesWithMask(coord, size, &grad_clip);
*grad_in = (*grad_in) * grad_clip;
} else if (padding_mode == PaddingMode::reflect) {
if (align_corners) {
coord = ReflectIndexesWithMask(coord, 0, 2 * (size - 1), &grad_refl);
} else {
coord = ReflectIndexesWithMask(coord, -1, 2 * size - 1, &grad_refl);
}
coord = ClipIndexesWithMask(coord, size, &grad_clip);
*grad_in = (*grad_in) * grad_refl * grad_clip;
}
return coord;
}
template <typename T>
__global__ void GridSamplerCudaBackwardKernel(const int nthreads,
const T* grad_output,
const T* input,
const T* grid,
int n,
int out_c,
int out_h,
int out_w,
int in_h,
int in_w,
T* grad_input,
T* grad_grid,
const Mode mode,
const PaddingMode padding_mode,
bool align_corners) {
int inp_sN = out_c * in_h * in_w;
int inp_sC = in_h * in_w;
int inp_sH = in_w;
int inp_sW = 1;
int grid_sN = out_h * out_w * 2;
int grid_sH = out_w * 2;
int grid_sW = 2;
int grid_sCoor = 1;
int gOut_sN = out_c * out_h * out_w;
int gOut_sC = out_h * out_w;
int gOut_sH = out_w;
int gOut_sW = 1;
CUDA_KERNEL_LOOP(index, nthreads) {
const int w = index % out_w;
const int h = (index / out_w) % out_h;
const int n = index / (out_h * out_w);
const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
T ix = grid[grid_offset];
T iy = grid[grid_offset + grid_sCoor];
T gix_mult, giy_mult;
ix = ComputePositionsWithMask(
ix, in_w, padding_mode, align_corners, &gix_mult);
iy = ComputePositionsWithMask(
iy, in_h, padding_mode, align_corners, &giy_mult);
if (mode == Mode::bilinear) {
int ix_nw = static_cast<int>(floor(ix));
int iy_nw = static_cast<int>(floor(iy));
int ix_ne = ix_nw + 1;
int iy_ne = iy_nw;
int ix_sw = ix_nw;
int iy_sw = iy_nw + 1;
int ix_se = ix_nw + 1;
int iy_se = iy_nw + 1;
T nw = (ix_se - ix) * (iy_se - iy);
T ne = (ix - ix_sw) * (iy_sw - iy);
T sw = (ix_ne - ix) * (iy - iy_ne);
T se = (ix - ix_nw) * (iy - iy_nw);
T gix = static_cast<T>(0), giy = static_cast<T>(0);
int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW;
T* gInp_ptr_NC = grad_input + n * inp_sN;
int inp_offset_NC = n * inp_sN;
for (int c = 0; c < out_c; ++c,
inp_offset_NC += inp_sC,
gInp_ptr_NC += inp_sC,
gOut_offset += gOut_sC) {
T gOut = grad_output[gOut_offset];
AtomicAdd(
gInp_ptr_NC, iy_nw, ix_nw, inp_sH, inp_sW, in_h, in_w, nw * gOut);
AtomicAdd(
gInp_ptr_NC, iy_ne, ix_ne, inp_sH, inp_sW, in_h, in_w, ne * gOut);
AtomicAdd(
gInp_ptr_NC, iy_sw, ix_sw, inp_sH, inp_sW, in_h, in_w, sw * gOut);
AtomicAdd(
gInp_ptr_NC, iy_se, ix_se, inp_sH, inp_sW, in_h, in_w, se * gOut);
if (InBounds(iy_nw, ix_nw, in_h, in_w)) {
T nw_val = input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW];
gix -= nw_val * (iy_se - iy) * gOut;
giy -= nw_val * (ix_se - ix) * gOut;
}
if (InBounds(iy_ne, ix_ne, in_h, in_w)) {
T ne_val = input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW];
gix += ne_val * (iy_sw - iy) * gOut;
giy -= ne_val * (ix - ix_sw) * gOut;
}
if (InBounds(iy_sw, ix_sw, in_h, in_w)) {
T sw_val = input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW];
gix -= sw_val * (iy - iy_ne) * gOut;
giy += sw_val * (ix_ne - ix) * gOut;
}
if (InBounds(iy_se, ix_se, in_h, in_w)) {
T se_val = input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW];
gix += se_val * (iy - iy_nw) * gOut;
giy += se_val * (ix - ix_nw) * gOut;
}
}
if (grad_grid != nullptr) {
T* gGrid_ptr_NHW = grad_grid + index * grid_sW;
gGrid_ptr_NHW[0] = gix_mult * gix;
gGrid_ptr_NHW[1] = giy_mult * giy;
}
} else if (mode == Mode::nearest) {
int ix_nearest = static_cast<int>(std::nearbyint(ix));
int iy_nearest = static_cast<int>(std::nearbyint(iy));
int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW;
T* gInp_ptr_NC = grad_input + n * inp_sN;
for (int c = 0; c < out_c;
++c, gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) {
AtomicAdd(gInp_ptr_NC,
iy_nearest,
ix_nearest,
inp_sH,
inp_sW,
in_h,
in_w,
grad_output[gOut_offset]);
}
if (grad_grid != nullptr) {
T* gGrid_ptr_NHW = grad_grid + index * grid_sW;
gGrid_ptr_NHW[0] = static_cast<T>(0);
gGrid_ptr_NHW[1] = static_cast<T>(0);
}
}
}
}
template <typename T, typename Context>
void GridSampleGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& grid,
const DenseTensor& out_grad,
const std::string& mode,
const std::string& padding_mode,
bool align_corners,
DenseTensor* x_grad,
DenseTensor* grid_grad) {
PaddingMode enum_padding_mode;
Mode enum_mode;
if (padding_mode == "border") {
enum_padding_mode = PaddingMode::border;
} else if (padding_mode == "reflection") {
enum_padding_mode = PaddingMode::reflect;
} else {
enum_padding_mode = PaddingMode::zeros;
}
if (mode == "nearest") {
enum_mode = Mode::nearest;
} else {
enum_mode = Mode::bilinear;
}
const int n = grid.dims()[0];
const int out_h = grid.dims()[1];
const int out_w = grid.dims()[2];
const int c = x.dims()[1];
const int in_h = x.dims()[2];
const int in_w = x.dims()[3];
dev_ctx.template Alloc<T>(x_grad);
phi::funcs::SetConstant<Context, T>()(dev_ctx, x_grad, static_cast<T>(0));
T* grid_grad_data = nullptr;
if (grid_grad != nullptr) {
grid_grad_data = dev_ctx.template Alloc<T>(grid_grad);
}
int count = static_cast<int>(n * out_h * out_w);
auto cu_stream = dev_ctx.stream();
backends::gpu::GpuLaunchConfig config =
backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count);
GridSamplerCudaBackwardKernel<
T><<<config.block_per_grid, config.thread_per_block, 0, cu_stream>>>(
count,
out_grad.data<T>(),
x.data<T>(),
grid.data<T>(),
n,
c,
out_h,
out_w,
in_h,
in_w,
x_grad->data<T>(),
grid_grad_data,
enum_mode,
enum_padding_mode,
align_corners);
}
} // namespace phi
PD_REGISTER_KERNEL(grid_sample_grad,
GPU,
ALL_LAYOUT,
phi::GridSampleGradKernel,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/grid_sample_kernel.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/gpu/grid_sample_utils.h"
namespace phi {
template <typename T>
static __forceinline__ __device__ T Unnormalize(T coord,
int size,
bool align_corners) {
if (align_corners) {
return ((coord + 1.f) / 2) * (size - 1);
} else {
return ((coord + 1.f) * size - 1) / 2;
}
}
template <typename T>
static __forceinline__ __device__ T ClipIndexes(T in, int max_value) {
return min(static_cast<T>(max_value), max(in, static_cast<T>(0)));
}
template <typename T>
static __forceinline__ __device__ T ReflectIndexes(T in,
int twice_low,
int twice_high) {
if (twice_low == twice_high) {
return static_cast<T>(0);
}
T min = static_cast<T>(twice_low) / 2;
T span = static_cast<T>(twice_high - twice_low) / 2;
in = fabs(in - min);
T extra = fmod(in, span);
int flips = static_cast<int>(floor(in / span));
if (flips % 2 == 0) {
return extra + min;
} else {
return span - extra + min;
}
}
template <typename T>
static __forceinline__ __device__ T ComputePositions(T coord,
int size,
PaddingMode padding_mode,
bool align_corners) {
coord = Unnormalize<T>(coord, size, align_corners);
if (padding_mode == PaddingMode::border) {
coord = ClipIndexes(coord, size - 1);
} else if (padding_mode == PaddingMode::reflect) {
if (align_corners) {
coord = ReflectIndexes(coord, 0, 2 * (size - 1));
} else {
coord = ReflectIndexes(coord, -1, 2 * size - 1);
}
coord = ClipIndexes(coord, size - 1);
}
return coord;
}
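To make Unnormalize concrete (a worked example with illustrative numbers): for size = 4, the normalized coordinate -1 maps to pixel 0 with align_corners = true ((-1 + 1) / 2 * 3 = 0) and to -0.5 with align_corners = false (((-1 + 1) * 4 - 1) / 2 = -0.5); likewise +1 maps to 3 versus 3.5. The border and reflect paths then pull such values back into [0, size - 1].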
template <typename T>
__global__ void GridSampleCudaKernel(const int nthreads,
int n,
int out_c,
int out_h,
int out_w,
int in_h,
int in_w,
const T* input,
const T* grid,
T* output,
const Mode mode,
const PaddingMode padding_mode,
bool align_corners) {
int inp_sN = out_c * in_h * in_w;
int inp_sC = in_h * in_w;
int inp_sH = in_w;
int inp_sW = 1;
int grid_sN = out_h * out_w * 2;
int grid_sH = out_w * 2;
int grid_sW = 2;
int grid_sCoor = 1;
int out_sN = out_c * out_h * out_w;
int out_sC = out_h * out_w;
int out_sH = out_w;
int out_sW = 1;
CUDA_KERNEL_LOOP(index, nthreads) {
const int w = index % out_w;
const int h = (index / out_w) % out_h;
const int n = index / (out_h * out_w);
const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
T ix = grid[grid_offset];
T iy = grid[grid_offset + grid_sCoor];
ix = ComputePositions(ix, in_w, padding_mode, align_corners);
iy = ComputePositions(iy, in_h, padding_mode, align_corners);
if (mode == Mode::bilinear) {
int ix_nw = static_cast<int>(floor(ix));
int iy_nw = static_cast<int>(floor(iy));
int ix_ne = ix_nw + 1;
int iy_ne = iy_nw;
int ix_sw = ix_nw;
int iy_sw = iy_nw + 1;
int ix_se = ix_nw + 1;
int iy_se = iy_nw + 1;
T nw = (ix_se - ix) * (iy_se - iy);
T ne = (ix - ix_sw) * (iy_sw - iy);
T sw = (ix_ne - ix) * (iy - iy_ne);
T se = (ix - ix_nw) * (iy - iy_nw);
auto inp_offset_NC = n * inp_sN;
auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
for (int c = 0; c < out_c;
++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) {
*out_ptr_NCHW = static_cast<T>(0);
if (InBounds(iy_nw, ix_nw, in_h, in_w)) {
*out_ptr_NCHW +=
input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW] * nw;
}
if (InBounds(iy_ne, ix_ne, in_h, in_w)) {
*out_ptr_NCHW +=
input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW] * ne;
}
if (InBounds(iy_sw, ix_sw, in_h, in_w)) {
*out_ptr_NCHW +=
input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW] * sw;
}
if (InBounds(iy_se, ix_se, in_h, in_w)) {
*out_ptr_NCHW +=
input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW] * se;
}
}
} else if (mode == Mode::nearest) {
int ix_nearest = static_cast<int>(std::nearbyint(ix));
int iy_nearest = static_cast<int>(std::nearbyint(iy));
auto inp_offset_NC = n * inp_sN;
auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
for (int c = 0; c < out_c;
++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) {
if (InBounds(iy_nearest, ix_nearest, in_h, in_w)) {
*out_ptr_NCHW =
input[inp_offset_NC + iy_nearest * inp_sH + ix_nearest * inp_sW];
} else {
*out_ptr_NCHW = static_cast<T>(0);
}
}
}
}
}
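A short note on the bilinear weights above: writing a = ix - ix_nw and b = iy - iy_nw, the four factors reduce to nw = (1 - a)(1 - b), ne = a(1 - b), sw = (1 - a)b and se = a*b, which sum to 1, so each output value is a convex combination of its four neighbouring input pixels; neighbours rejected by InBounds simply contribute nothing.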
template <typename T, typename Context>
void GridSampleKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& grid,
const std::string& mode,
const std::string& padding_mode,
bool align_corners,
DenseTensor* out) {
PaddingMode enum_padding_mode;
Mode enum_mode;
if (padding_mode == "border") {
enum_padding_mode = PaddingMode::border;
} else if (padding_mode == "reflection") {
enum_padding_mode = PaddingMode::reflect;
} else {
enum_padding_mode = PaddingMode::zeros;
}
if (mode == "nearest") {
enum_mode = Mode::nearest;
} else {
enum_mode = Mode::bilinear;
}
const int n = grid.dims()[0];
const int out_h = grid.dims()[1];
const int out_w = grid.dims()[2];
const int c = x.dims()[1];
const int in_h = x.dims()[2];
const int in_w = x.dims()[3];
VLOG(3) << "n: " << n << "; c: " << c << "; out_h: " << out_h
<< "; out_w: " << out_w;
auto* output_data = dev_ctx.template Alloc<T>(out);
VLOG(3) << "out dims: " << out->dims()[0] << "; " << out->dims()[1] << "; "
<< out->dims()[2] << "; " << out->dims()[3];
int count = static_cast<int>(n * out_h * out_w);
auto cu_stream = dev_ctx.stream();
backends::gpu::GpuLaunchConfig config =
backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count);
GridSampleCudaKernel<
T><<<config.block_per_grid, config.thread_per_block, 0, cu_stream>>>(
count,
n,
c,
out_h,
out_w,
in_h,
in_w,
x.data<T>(),
grid.data<T>(),
output_data,
enum_mode,
enum_padding_mode,
align_corners);
}
} // namespace phi
PD_REGISTER_KERNEL(
grid_sample, GPU, ALL_LAYOUT, phi::GridSampleKernel, float, double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
namespace phi {
enum class Mode {
bilinear,
nearest,
};
enum class PaddingMode { zeros, border, reflect };
static __forceinline__ __device__ bool InBounds(int h, int w, int H, int W) {
return h >= 0 && h < H && w >= 0 && w < W;
}
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/index_select_grad_kernel.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/utils/data_type.h"
namespace phi {
using paddle::platform::PADDLE_CUDA_NUM_THREADS;
template <typename T, typename IndexT>
__global__ void index_select_grad_cuda_kernel(const T* output_grad,
T* input_grad,
const IndexT* index,
int64_t nums,
int64_t N,
int64_t stride,
int64_t size,
int64_t delta) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N) {
return;
}
int64_t pre_idx = idx / (stride * size);
int64_t dim_idx = idx % (stride * size) / stride;
IndexT src_dim_idx = index[dim_idx];
int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride;
paddle::platform::CudaAtomicAdd(&input_grad[input_idx], output_grad[idx]);
}
template <typename T>
__global__ void index_select_grad_init(T* input_grad, int64_t N) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N) {
return;
}
input_grad[idx] = 0.0;
}
template <typename T, typename Context>
void IndexSelectGradKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& index,
const DenseTensor& out_grad,
int dim,
DenseTensor* x_grad) {
auto* output_grad_data = out_grad.data<T>();
auto* in_grad_data = ctx.template Alloc<T>(x_grad);
auto input_dim = x_grad->dims();
auto output_dim = out_grad.dims();
dim = dim >= 0 ? dim : dim + input_dim.size();
auto stride_dim = phi::stride(input_dim);
int64_t stride = stride_dim[dim];
int64_t size = output_dim[dim];
int64_t delta = input_dim[dim] - size;
const auto& index_type = index.dtype();
bool index_type_match =
index_type == phi::DataType::INT64 || index_type == phi::DataType::INT32;
PADDLE_ENFORCE_EQ(index_type_match,
true,
phi::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
index_type,
phi::DataType::INT32,
phi::DataType::INT64));
int64_t numel = x_grad->numel();
int64_t index_nums = index.numel();
int64_t out_nums = out_grad.numel();
auto stream = ctx.stream();
index_select_grad_init<
T><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
PADDLE_CUDA_NUM_THREADS,
0,
stream>>>(in_grad_data, numel);
if (index_type == phi::DataType::INT64) {
const int64_t* index_data = index.data<int64_t>();
index_select_grad_cuda_kernel<T, int64_t><<<
(out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
PADDLE_CUDA_NUM_THREADS,
0,
stream>>>(output_grad_data,
in_grad_data,
index_data,
index_nums,
out_nums,
stride,
size,
delta);
phi::backends::gpu::GpuStreamSync(stream);
} else {
const int* index_data = index.data<int>();
index_select_grad_cuda_kernel<T, int><<<
(out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
PADDLE_CUDA_NUM_THREADS,
0,
stream>>>(output_grad_data,
in_grad_data,
index_data,
index_nums,
out_nums,
stride,
size,
delta);
phi::backends::gpu::GpuStreamSync(stream);
}
}
} // namespace phi
PD_REGISTER_KERNEL(index_select_grad,
GPU,
ALL_LAYOUT,
phi::IndexSelectGradKernel,
float,
double,
phi::dtype::float16,
int,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/index_select_kernel.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/utils/data_type.h"
namespace phi {
using paddle::platform::PADDLE_CUDA_NUM_THREADS;
template <typename T, typename IndexT>
__global__ void index_select_cuda_kernel(const T* input,
T* output,
const IndexT* index,
int64_t N,
int64_t stride,
int64_t size,
int64_t delta) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N) {
return;
}
int64_t pre_idx = idx / (stride * size);
int64_t dim_idx = idx % (stride * size) / stride;
IndexT src_dim_idx = index[dim_idx];
int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride;
output[idx] = input[input_idx];
}
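The index arithmetic above flattens a gather along dim: with stride the product of the dimensions after dim, size the number of selected indices and delta = input_dim[dim] - size, an output offset decomposes as idx = pre_idx * (size * stride) + dim_idx * stride + inner, and substituting index[dim_idx] for dim_idx while widening the dim extent by delta yields idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride. As a worked example with illustrative shapes: for x of shape (2, 4, 3), dim = 1 and index = [1, 3], we get stride = 3, size = 2, delta = 2; output element idx = 4 (that is, [0, 1, 1]) has pre_idx = 0, dim_idx = 1, src_dim_idx = 3, so it reads input_idx = 4 + (0 + 3 - 1) * 3 = 10, which is x[0, 3, 1].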
template <typename T, typename Context>
void IndexSelectKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& index,
int dim,
DenseTensor* output) {
auto input_dim = x.dims();
auto output_dim = output->dims();
dim = dim >= 0 ? dim : dim + input_dim.size();
auto stride_dim = phi::stride(input_dim);
int64_t stride = stride_dim[dim];
int64_t size = output_dim[dim];
int64_t delta = input_dim[dim] - size;
const auto& index_type = index.dtype();
bool index_type_match =
index_type == phi::DataType::INT64 || index_type == phi::DataType::INT32;
PADDLE_ENFORCE_EQ(index_type_match,
true,
phi::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
index_type,
phi::DataType::INT32,
phi::DataType::INT64));
auto* in_data = x.data<T>();
T* out_data = ctx.template Alloc<T>(output);
int64_t numel = output->numel();
auto stream = ctx.stream();
if (index_type == phi::DataType::INT64) {
const int64_t* index_data = index.data<int64_t>();
index_select_cuda_kernel<T, int64_t><<<
(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
PADDLE_CUDA_NUM_THREADS,
0,
stream>>>(in_data, out_data, index_data, numel, stride, size, delta);
phi::backends::gpu::GpuStreamSync(stream);
} else {
const int* index_data = index.data<int>();
index_select_cuda_kernel<
T,
int><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
PADDLE_CUDA_NUM_THREADS,
0,
stream>>>(
in_data, out_data, index_data, numel, stride, size, delta);
phi::backends::gpu::GpuStreamSync(stream);
}
}
} // namespace phi
PD_REGISTER_KERNEL(index_select,
GPU,
ALL_LAYOUT,
phi::IndexSelectKernel,
float,
double,
phi::dtype::float16,
int,
int64_t) {}
...@@ -56,30 +56,6 @@ namespace phi { ...@@ -56,30 +56,6 @@ namespace phi {
* Kernels * Kernels
*/ */
template <typename T, typename Context>
void MeanRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out) {
auto out_dtype = x.dtype();
phi::Reduce<T, kps::AddFunctor, kps::DivideFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
template <typename T, typename Context>
void SumRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DataType out_dtype,
DenseTensor* out) {
phi::Reduce<T, kps::AddFunctor, kps::IdentityFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
// Create the definition of Add // Create the definition of Add
DEFINE_CUDA_ELEMENTWISE_OP(Add) DEFINE_CUDA_ELEMENTWISE_OP(Add)
// Create the definition of Subtract // Create the definition of Subtract
...@@ -147,30 +123,3 @@ PD_REGISTER_KERNEL(multiply_raw, ...@@ -147,30 +123,3 @@ PD_REGISTER_KERNEL(multiply_raw,
complex64, complex64,
complex128, complex128,
bfloat16) {} bfloat16) {}
PD_REGISTER_KERNEL(sum_raw,
GPU,
ALL_LAYOUT,
phi::SumRawKernel,
bool,
float,
double,
float16,
bfloat16,
int16_t,
int,
int64_t,
complex64,
complex128) {
kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
}
PD_REGISTER_KERNEL(mean_raw,
GPU,
ALL_LAYOUT,
phi::MeanRawKernel,
float,
double,
bool,
float16,
int,
int64_t) {}
...@@ -28,7 +28,7 @@ ...@@ -28,7 +28,7 @@
#include "paddle/phi/kernels/funcs/compare_functors.h" #include "paddle/phi/kernels/funcs/compare_functors.h"
#include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h" #include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h"
#include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/math_kernel.h"
#include "paddle/phi/kernels/reduce_max_kernel.h" #include "paddle/phi/kernels/reduce_kernel.h"
namespace phi { namespace phi {
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/multiplex_grad_kernel.h"
#include "paddle/phi/api/lib/utils/tensor_utils.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
namespace phi {
template <typename T, typename Context>
void MultiplexGradKernel(const Context& ctx,
const DenseTensor& ids,
const DenseTensor& out_grad,
std::vector<DenseTensor*> ins_grad) {
size_t idx = -1UL;
for (size_t i = 0; i < ins_grad.size(); i++) {
if (ins_grad[i]) {
ctx.template Alloc<T>(ins_grad[i]);
auto t = phi::EigenVector<T>::Flatten(*ins_grad[i]);
t.device(*ctx.eigen_device()) = t.constant(static_cast<T>(0));
idx = i;
}
}
if (idx == -1UL) return;
auto rows = ins_grad[idx]->dims()[0];
auto cols = ins_grad[idx]->numel() / rows;
DenseTensor index_t_cpu;
paddle::framework::TensorCopySync(ids, phi::CPUPlace(), &index_t_cpu);
auto* index = index_t_cpu.data<int32_t>();
auto stream = ctx.stream();
for (auto i = 0; i < rows; i++) {
size_t k = static_cast<size_t>(index[i]);
if (ins_grad[k]) {
paddle::memory::Copy(ctx.GetPlace(),
ins_grad[k]->data<T>() + i * cols,
ctx.GetPlace(),
out_grad.data<T>() + i * cols,
cols * sizeof(T),
stream);
}
}
}
} // namespace phi
PD_REGISTER_KERNEL(multiplex_grad,
GPU,
ALL_LAYOUT,
phi::MultiplexGradKernel,
float,
double,
int,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/multiplex_kernel.h"
#include "paddle/phi/api/lib/utils/tensor_utils.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void MultiplexKernel(const Context& ctx,
const std::vector<const DenseTensor*>& ins,
const DenseTensor& ids,
DenseTensor* out) {
ctx.template Alloc<T>(out);
for (size_t i = 0; i < ins.size(); ++i) {
PADDLE_ENFORCE_GT(
ins[i]->numel(),
0,
errors::OutOfRange(
"indexing will be out of bounds with size 0 for the %d-th input.",
i));
}
auto rows = ins[0]->dims()[0];
auto cols = ins[0]->numel() / rows;
DenseTensor index_t_cpu;
paddle::framework::TensorCopySync(ids, phi::CPUPlace(), &index_t_cpu);
auto* index = index_t_cpu.data<int32_t>();
auto stream = ctx.stream();
for (auto i = 0; i < rows; i++) {
int32_t k = index[i];
PADDLE_ENFORCE_GE(
k, 0, errors::PreconditionNotMet("index must be nonnegative."));
PADDLE_ENFORCE_LT(static_cast<size_t>(k),
ins.size(),
errors::PreconditionNotMet(
"index exceeds the number of candidate tensors."));
paddle::memory::Copy(ctx.GetPlace(),
out->data<T>() + i * cols,
ctx.GetPlace(),
ins[k]->data<T>() + i * cols,
cols * sizeof(T),
stream);
}
}
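In other words, row i of out is copied from row i of the candidate tensor chosen by ids[i]. For example (illustrative values): with two 2 x 3 candidates and ids = [1, 0], row 0 of out comes from ins[1] and row 1 from ins[0], each copy moving cols * sizeof(T) bytes on the device stream.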
} // namespace phi
PD_REGISTER_KERNEL(multiplex,
GPU,
ALL_LAYOUT,
phi::MultiplexKernel,
float,
double,
int,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/reduce_all_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/gpu/reduce.h"
namespace phi {
template <typename T, typename Context>
void AllRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out) {
auto out_dtype = x.dtype();
phi::Reduce<T, kps::LogicalAndFunctor, kps::IdentityFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
} // namespace phi
PD_REGISTER_KERNEL(all_raw, GPU, ALL_LAYOUT, phi::AllRawKernel, bool) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/reduce_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/gpu/reduce.h"
namespace phi {
template <typename T, typename Context>
void MeanRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out) {
auto out_dtype = x.dtype();
phi::Reduce<T, kps::AddFunctor, kps::DivideFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
template <typename T, typename Context>
void SumRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DataType out_dtype,
DenseTensor* out) {
phi::Reduce<T, kps::AddFunctor, kps::IdentityFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
template <typename T, typename Context>
void ProdRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out) {
auto out_dtype = x.dtype();
phi::Reduce<T, kps::MulFunctor, kps::IdentityFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
template <typename T, typename Context>
void MaxRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out) {
auto out_dtype = x.dtype();
phi::Reduce<T, kps::MaxFunctor, kps::IdentityFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
template <typename T, typename Context>
void MinRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out) {
auto out_dtype = x.dtype();
phi::Reduce<T, kps::MinFunctor, kps::IdentityFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
template <typename T, typename Context>
void AllRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out) {
auto out_dtype = x.dtype();
phi::Reduce<T, kps::LogicalAndFunctor, kps::IdentityFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
template <typename T, typename Context>
void AnyRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out) {
auto out_dtype = x.dtype();
phi::Reduce<T, kps::LogicalOrFunctor, kps::IdentityFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
} // namespace phi
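All of the raw reduce kernels above are the same phi::Reduce instantiation and differ only in the reduction functor (Add, Mul, Max, Min, LogicalAnd, LogicalOr); MeanRawKernel additionally passes kps::DivideFunctor as the transform, which presumably divides the accumulated sum by the number of reduced elements so that the sum becomes a mean.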
using float16 = phi::dtype::float16;
using bfloat16 = phi::dtype::bfloat16;
using complex64 = ::phi::dtype::complex<float>;
using complex128 = ::phi::dtype::complex<double>;
PD_REGISTER_KERNEL(sum_raw,
GPU,
ALL_LAYOUT,
phi::SumRawKernel,
bool,
float,
double,
float16,
bfloat16,
int16_t,
int,
int64_t,
complex64,
complex128) {
kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
}
PD_REGISTER_KERNEL(mean_raw,
GPU,
ALL_LAYOUT,
phi::MeanRawKernel,
float,
double,
bool,
float16,
int,
int64_t) {}
PD_REGISTER_KERNEL(prod_raw,
GPU,
ALL_LAYOUT,
phi::ProdRawKernel,
float,
double,
int,
int64_t) {}
PD_REGISTER_KERNEL(
max_raw, GPU, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {}
PD_REGISTER_KERNEL(
min_raw, GPU, ALL_LAYOUT, phi::MinRawKernel, float, double, int, int64_t) {}
PD_REGISTER_KERNEL(all_raw, GPU, ALL_LAYOUT, phi::AllRawKernel, bool) {}
PD_REGISTER_KERNEL(any_raw, GPU, ALL_LAYOUT, phi::AnyRawKernel, bool) {}
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/phi/kernels/reduce_max_kernel.h" #include "paddle/phi/kernels/reduce_kernel.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/gpu/reduce.h" #include "paddle/phi/kernels/gpu/reduce.h"
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/reduce_min_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/gpu/reduce.h"
namespace phi {
template <typename T, typename Context>
void MinRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out) {
auto out_dtype = x.dtype();
phi::Reduce<T, kps::MinFunctor, kps::IdentityFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
} // namespace phi
PD_REGISTER_KERNEL(
min_raw, GPU, ALL_LAYOUT, phi::MinRawKernel, float, double, int, int64_t) {}
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
Licensed under the Apache License, Version 2.0 (the "License"); #include "paddle/phi/kernels/roi_align_grad_kernel.h"
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/math_function.h"
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <vector>
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/operators/roi_align_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
namespace paddle { namespace phi {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
static constexpr int kNumCUDAThreads = 512; static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaxinumNumBlocks = 4096; static constexpr int kNumMaxinumNumBlocks = 4096;
...@@ -34,10 +36,18 @@ static inline int NumBlocks(const int N) { ...@@ -34,10 +36,18 @@ static inline int NumBlocks(const int N) {
} }
template <class T> template <class T>
__device__ void BilinearInterpolateGradient(const int height, const int width, __device__ void BilinearInterpolateGradient(const int height,
T y, T x, T* w1, T* w2, T* w3, const int width,
T* w4, int* x_low, int* x_high, T y,
int* y_low, int* y_high) { T x,
T* w1,
T* w2,
T* w3,
T* w4,
int* x_low,
int* x_high,
int* y_low,
int* y_high) {
if (y < -1.0 || y > height || x < -1.0 || x > width) { if (y < -1.0 || y > height || x < -1.0 || x > width) {
return; return;
} }
...@@ -66,12 +76,20 @@ __device__ void BilinearInterpolateGradient(const int height, const int width, ...@@ -66,12 +76,20 @@ __device__ void BilinearInterpolateGradient(const int height, const int width,
} }
template <typename T> template <typename T>
__global__ void GPUROIAlignBackward( __global__ void GPURoiAlignBackward(const int nthreads,
const int nthreads, const T* input_rois, const T* out_grad, const T* input_rois,
const int num_rois, const float spatial_scale, const int channels, const T* out_grad,
const int height, const int width, const int pooled_height, const int num_rois,
const int pooled_width, const int sampling_ratio, int* roi_batch_id_data, const float spatial_scale,
                                    const int channels,
                                    const int height,
                                    const int width,
                                    const int pooled_height,
                                    const int pooled_width,
                                    const int sampling_ratio,
                                    int* roi_batch_id_data,
                                    T* input_grad,
                                    const bool continuous_coordinate) {
  CUDA_KERNEL_LOOP(i, nthreads) {
    int pw = i % pooled_width;
    int ph = (i / pooled_width) % pooled_height;
...@@ -119,109 +137,124 @@ __global__ void GPURoiAlignBackward(
                       static_cast<T>(roi_bin_grid_w);
        T w1 = 0, w2 = 0, w3 = 0, w4 = 0;
        int x_low = -1, x_high = -1, y_low = -1, y_high = -1;
        BilinearInterpolateGradient(height,
                                    width,
                                    y,
                                    x,
                                    &w1,
                                    &w2,
                                    &w3,
                                    &w4,
                                    &x_low,
                                    &x_high,
                                    &y_low,
                                    &y_high);
        T diff1 = out_grad_this_bin * w1 / count;
        T diff2 = out_grad_this_bin * w2 / count;
        T diff3 = out_grad_this_bin * w3 / count;
        T diff4 = out_grad_this_bin * w4 / count;
        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
          paddle::platform::CudaAtomicAdd(
              offset_input_grad + y_low * width + x_low, diff1);
          paddle::platform::CudaAtomicAdd(
              offset_input_grad + y_low * width + x_high, diff2);
          paddle::platform::CudaAtomicAdd(
              offset_input_grad + y_high * width + x_low, diff3);
          paddle::platform::CudaAtomicAdd(
              offset_input_grad + y_high * width + x_high, diff4);
        }
      }
    }
  }
}

template <typename T, typename Context>
void RoiAlignGradKernel(const Context& dev_ctx,
                        const DenseTensor& x,
                        const DenseTensor& boxes,
                        paddle::optional<const DenseTensor&> boxes_num,
                        const DenseTensor& out_grad,
                        int pooled_height,
                        int pooled_width,
                        float spatial_scale,
                        int sampling_ratio,
                        bool aligned,
                        DenseTensor* dx) {
  int rois_num = boxes.dims()[0];
  int channels = x.dims()[1];
  int height = x.dims()[2];
  int width = x.dims()[3];

  if (!dx) {
    return;
  }

  DenseTensor box_batch_id_list;
  box_batch_id_list.Resize({rois_num});
  int* box_batch_size = dev_ctx.template HostAlloc<int>(&box_batch_id_list);

  auto cplace = phi::CPUPlace();
  auto gplace = dev_ctx.GetPlace();
  if (boxes_num) {
    int boxes_batch_size = boxes_num->numel();
    std::vector<int> boxes_num_list(boxes_batch_size);
    paddle::memory::Copy(cplace,
                         boxes_num_list.data(),
                         gplace,
                         boxes_num->data<int>(),
                         sizeof(int) * boxes_batch_size,
                         0);
    int start = 0;
    for (int n = 0; n < boxes_batch_size; ++n) {
      for (size_t i = start; i < start + boxes_num_list[n]; ++i) {
        box_batch_size[i] = n;
      }
      start += boxes_num_list[n];
    }
  } else {
    auto boxes_lod = boxes.lod().back();
    int boxes_batch_size = boxes_lod.size() - 1;
    for (int n = 0; n < boxes_batch_size; ++n) {
      for (size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) {
        box_batch_size[i] = n;
      }
    }
  }
  auto roi_ptr =
      paddle::memory::Alloc(dev_ctx, box_batch_id_list.numel() * sizeof(int));
  int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
  int bytes = box_batch_id_list.numel() * sizeof(int);
  paddle::memory::Copy(
      gplace, roi_id_data, cplace, box_batch_size, bytes, dev_ctx.stream());
  dev_ctx.template Alloc<T>(dx);

  phi::funcs::SetConstant<Context, T> set_zero;
  set_zero(dev_ctx, dx, static_cast<T>(0));

  int output_grad_size = out_grad.numel();
  int blocks = NumBlocks(output_grad_size);
  int threads = kNumCUDAThreads;
  if (output_grad_size > 0) {
    GPURoiAlignBackward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
        output_grad_size,
        boxes.data<T>(),
        out_grad.data<T>(),
        rois_num,
        spatial_scale,
        channels,
        height,
        width,
        pooled_height,
        pooled_width,
        sampling_ratio,
        roi_id_data,
        dx->data<T>(),
        aligned);
  }
}

} // namespace phi

PD_REGISTER_KERNEL(
    roi_align_grad, GPU, ALL_LAYOUT, phi::RoiAlignGradKernel, float, double) {}
...@@ -71,7 +71,7 @@ __device__ T BilinearInterpolate(
}

template <class T>
__global__ void GPURoiAlignForward(const int nthreads,
                                   const T* input_data,
                                   const T* input_rois,
                                   const float spatial_scale,
...@@ -137,7 +137,7 @@ __global__ void GPURoiAlignForward(const int nthreads,
}

template <typename T, typename Context>
void RoiAlignKernel(const Context& dev_ctx,
                    const DenseTensor& x,
                    const DenseTensor& boxes,
                    paddle::optional<const DenseTensor&> boxes_num,
...@@ -233,7 +233,7 @@ void RoiAlignKernel(const Context& dev_ctx,
  int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
  paddle::memory::Copy(
      gplace, roi_id_data, cplace, roi_batch_id_data, bytes, dev_ctx.stream());

  GPURoiAlignForward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
      output_size,
      x.data<T>(),
      boxes.data<T>(),
...@@ -252,4 +252,4 @@ void RoiAlignKernel(const Context& dev_ctx,
} // namespace phi

PD_REGISTER_KERNEL(
    roi_align, GPU, ALL_LAYOUT, phi::RoiAlignKernel, float, double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/roll_grad_kernel.h"
#include "paddle/phi/common/complex.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/gpu/roll_kernel_impl.h"
namespace phi {
using paddle::platform::PADDLE_CUDA_NUM_THREADS;
template <typename T, typename Context>
void RollGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& out_grad,
const ScalarArray& shifts,
const std::vector<int64_t>& axis,
DenseTensor* x_grad) {
auto* in_data = out_grad.data<T>();
T* out_data = dev_ctx.template Alloc<T>(x_grad);
int64_t numel = out_grad.numel();
auto stream = dev_ctx.stream();
auto shifts_data = shifts.GetData();
size_t nums = shifts_data.size();
auto input_dim = out_grad.dims();
auto stride_dim = phi::stride(input_dim);
std::vector<int64_t> strides(nums), sizes(nums);
if (axis.size() == 0) {
strides[0] = 1;
sizes[0] = numel;
shifts_data[0] = ((-shifts_data[0]) % numel + numel) % numel;
} else {
for (size_t i = 0; i < nums; i++) {
int dim = axis[i] >= 0 ? axis[i] : axis[i] + input_dim.size();
int64_t size = input_dim[dim];
if (size != 0) {
shifts_data[i] = ((-shifts_data[i]) % size + size) % size;
strides[i] = stride_dim[dim];
sizes[i] = size;
}
}
}
switch (nums) {
CALL_ROLL_CUDA_KERNEL(1);
CALL_ROLL_CUDA_KERNEL(2);
CALL_ROLL_CUDA_KERNEL(3);
CALL_ROLL_CUDA_KERNEL(4);
CALL_ROLL_CUDA_KERNEL(5);
CALL_ROLL_CUDA_KERNEL(6);
CALL_ROLL_CUDA_KERNEL(7);
CALL_ROLL_CUDA_KERNEL(8);
CALL_ROLL_CUDA_KERNEL(9);
default:
PADDLE_THROW(phi::errors::InvalidArgument(
"shifts.size() should be less than 10, But received shifts.size() "
"= %d",
shifts_data.size()));
}
}
} // namespace phi
PD_REGISTER_KERNEL(roll_grad,
GPU,
ALL_LAYOUT,
phi::RollGradKernel,
float,
double,
int,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/roll_kernel.h"
#include "paddle/phi/common/complex.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/utils/array.h"
#include "paddle/phi/kernels/gpu/roll_kernel_impl.h"
namespace phi {
using paddle::platform::PADDLE_CUDA_NUM_THREADS;
template <typename T, typename Context>
void RollKernel(const Context& dev_ctx,
const DenseTensor& x,
const ScalarArray& shifts,
const std::vector<int64_t>& axis,
DenseTensor* out) {
auto* in_data = x.data<T>();
T* out_data = dev_ctx.template Alloc<T>(out);
int64_t numel = x.numel();
auto stream = dev_ctx.stream();
auto shifts_data = shifts.GetData();
size_t nums = shifts_data.size();
auto input_dim = x.dims();
auto stride_dim = phi::stride(input_dim);
std::vector<int64_t> strides(nums), sizes(nums);
if (axis.size() == 0) {
strides[0] = 1;
sizes[0] = numel;
shifts_data[0] = (shifts_data[0] % numel + numel) % numel;
} else {
for (size_t i = 0; i < nums; i++) {
int dim = axis[i] >= 0 ? axis[i] : axis[i] + input_dim.size();
int64_t size = input_dim[dim];
if (size != 0) {
shifts_data[i] = (shifts_data[i] % size + size) % size;
strides[i] = stride_dim[dim];
sizes[i] = size;
}
}
}
switch (nums) {
CALL_ROLL_CUDA_KERNEL(1);
CALL_ROLL_CUDA_KERNEL(2);
CALL_ROLL_CUDA_KERNEL(3);
CALL_ROLL_CUDA_KERNEL(4);
CALL_ROLL_CUDA_KERNEL(5);
CALL_ROLL_CUDA_KERNEL(6);
CALL_ROLL_CUDA_KERNEL(7);
CALL_ROLL_CUDA_KERNEL(8);
CALL_ROLL_CUDA_KERNEL(9);
default:
PADDLE_THROW(phi::errors::InvalidArgument(
"shifts.size() should be less than 10, But received shifts.size() "
"= %d",
shifts_data.size()));
}
}
} // namespace phi
PD_REGISTER_KERNEL(roll,
GPU,
ALL_LAYOUT,
phi::RollKernel,
float,
double,
int,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/core/utils/array.h"
#include "paddle/phi/kernels/primitive/kernel_primitives.h"
namespace phi {
using paddle::platform::PADDLE_CUDA_NUM_THREADS;
template <typename T, size_t Rank>
__global__ void RollCudaKernel(const T* input,
T* output,
int64_t N,
phi::Array<int64_t, Rank> shifts,
phi::Array<int64_t, Rank> strides,
phi::Array<int64_t, Rank> sizes) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N) {
return;
}
int64_t output_idx = idx;
int64_t new_dim_idx = 0;
#pragma unroll
for (size_t i = 0; i < Rank; i++) {
new_dim_idx = (idx / strides[i]) % sizes[i] + shifts[i];
if (new_dim_idx >= sizes[i]) {
output_idx += (shifts[i] - sizes[i]) * strides[i];
} else {
output_idx += shifts[i] * strides[i];
}
}
output[output_idx] = input[idx];
}
#define CALL_ROLL_CUDA_KERNEL(N) \
case N: { \
phi::Array<int64_t, N> _strides; \
phi::Array<int64_t, N> _shifts; \
phi::Array<int64_t, N> _sizes; \
for (size_t idx = 0; idx < N; ++idx) { \
_strides[idx] = strides[idx]; \
_shifts[idx] = shifts_data[idx]; \
_sizes[idx] = sizes[idx]; \
} \
RollCudaKernel< \
T, \
N><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, \
PADDLE_CUDA_NUM_THREADS, \
0, \
stream>>>(in_data, out_data, numel, _shifts, _strides, _sizes); \
break; \
}
} // namespace phi
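For orientation, the scatter formula RollCudaKernel applies to each flat index can be read as a plain host loop. The sketch below is illustrative only (the function name and the float element type are assumptions, not part of this commit); it mirrors the arithmetic above, and RollGradKernel reuses the same formula with every shift negated before normalization, so the backward pass is simply a roll in the opposite direction.

#include <cstdint>
#include <vector>

// Illustrative host-side roll: scatter input[idx] to the position it occupies
// after rolling each listed dimension by its normalized, non-negative shift.
void RollOnHost(const float* input,
                float* output,
                int64_t numel,
                const std::vector<int64_t>& shifts,   // each in [0, sizes[i])
                const std::vector<int64_t>& strides,  // stride of each rolled dim
                const std::vector<int64_t>& sizes) {  // extent of each rolled dim
  for (int64_t idx = 0; idx < numel; ++idx) {
    int64_t output_idx = idx;
    for (size_t i = 0; i < shifts.size(); ++i) {
      // Coordinate of idx along dimension i, then shifted with wrap-around.
      int64_t new_dim_idx = (idx / strides[i]) % sizes[i] + shifts[i];
      output_idx +=
          (new_dim_idx >= sizes[i] ? shifts[i] - sizes[i] : shifts[i]) *
          strides[i];
    }
    output[output_idx] = input[idx];
  }
}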
...@@ -12,32 +12,18 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h"

#include "paddle/phi/core/kernel_registry.h"

PD_REGISTER_KERNEL(tril_triu_grad,
                   GPU,
                   ALL_LAYOUT,
                   phi::TrilTriuGradKernel,
                   bool,
                   float,
                   double,
                   int,
                   int64_t,
                   phi::dtype::float16) {}
...@@ -12,25 +12,18 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h"

#include "paddle/phi/core/kernel_registry.h"

PD_REGISTER_KERNEL(tril_triu,
                   GPU,
                   ALL_LAYOUT,
                   phi::TrilTriuKernel,
                   bool,
                   float,
                   double,
                   int,
                   int64_t,
                   phi::dtype::float16) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void GridSampleGradKernel(const Context &dev_ctx,
const DenseTensor &x,
const DenseTensor &grid,
const DenseTensor &out_grid,
const std::string &mode,
const std::string &padding_mode,
bool align_corners,
DenseTensor *x_grad,
DenseTensor *grid_grad);
} // namespace phi
...@@ -14,22 +14,19 @@
#pragma once

#include <string>
#include "paddle/phi/core/dense_tensor.h"

namespace phi {

template <typename T, typename Context>
void GridSampleKernel(const Context &dev_ctx,
                      const DenseTensor &x,
                      const DenseTensor &grid,
                      const std::string &mode,
                      const std::string &padding_mode,
                      bool align_corners,
                      DenseTensor *out);

} // namespace phi
...@@ -202,4 +202,24 @@ void TanhTripleGradKernel(const Context& dev_ctx,
                  d_ddx);  // output
}
template <typename T, typename Context>
void EluDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& dout,
const DenseTensor& ddx,
float alpha,
DenseTensor* dx,
DenseTensor* ddout) {
if (dx) {
dx->Resize(x.dims());
dev_ctx.template Alloc<T>(dx);
}
if (ddout) {
dev_ctx.template Alloc<T>(ddout);
}
funcs::ELUGradGradFunctor<T> functor;
functor.alpha = alpha;
functor(dev_ctx, &x, &ddx, ddout, &dout, dx);
}
} // namespace phi
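For reference, the quantities the added EluDoubleGradKernel asks ELUGradGradFunctor to produce follow from the usual ELU definition; the identity below is a standard elementwise double-backward derivation stated here for context, not text taken from the patch.

\[
\mathrm{ELU}(x)=\begin{cases}x, & x>0\\ \alpha\,(e^{x}-1), & x\le 0\end{cases}
\qquad
\mathrm{ELU}'(x)=\begin{cases}1, & x>0\\ \alpha e^{x}, & x\le 0\end{cases}
\qquad
\mathrm{ELU}''(x)=\begin{cases}0, & x>0\\ \alpha e^{x}, & x\le 0\end{cases}
\]

so, elementwise, ddout = ddx * ELU'(x) and the extra gradient written to dx is dout * ddx * ELU''(x), which vanishes on the x > 0 branch.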
...@@ -24,13 +24,12 @@
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/common_shape.h"
#include "paddle/phi/kernels/funcs/complex_functors.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/phi/kernels/funcs/matrix_reduce.h"
#include "paddle/phi/kernels/funcs/tril_triu_compute.h"
#include "paddle/phi/kernels/math_kernel.h"
#include "paddle/phi/kernels/transpose_kernel.h"

namespace phi {

template <typename T, typename Context>
...@@ -115,7 +114,7 @@ void CholeskySolveGradKernel(const Context& dev_ctx,
  const auto H = y_bst_dims_vec[y_bst_ndim - 2];
  const auto W = y_bst_dims_vec[y_bst_ndim - 1];
  phi::funcs::ForRange<Context> y_for_range(dev_ctx, dy_bst.numel());
  phi::funcs::TrilTriuCompute<T> tril_triu_functor(
      dy_bst.data<T>(), 0, !upper, H, W, dy_bst_upper.data<T>());
  y_for_range(tril_triu_functor);
......
...@@ -13,6 +13,7 @@
// limitations under the License.

#pragma once

#include <unsupported/Eigen/SpecialFunctions>
#include "paddle/phi/kernels/funcs/for_range.h"

namespace phi {
template <typename T>
......
...@@ -21,12 +21,11 @@
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/common_shape.h"
#include "paddle/phi/kernels/funcs/complex_functors.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/phi/kernels/funcs/matrix_reduce.h"
#include "paddle/phi/kernels/funcs/tril_triu_compute.h"
#include "paddle/phi/kernels/triangular_solve_kernel.h"

namespace phi {

template <typename T, typename Context>
...@@ -119,7 +118,7 @@ void TriangularSolveGradKernel(const Context& dev_ctx,
  const auto H = dims[dims.size() - 2];
  const auto W = dims[dims.size() - 1];
  phi::funcs::ForRange<Context> x_for_range(dev_ctx, dx_bst.numel());
  phi::funcs::TrilTriuCompute<T> tril_triu_functor(
      dx_bst.data<T>(), unitriangular, !upper, H, W, dx_bst_upper.data<T>());
  x_for_range(tril_triu_functor);
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/kernels/tril_triu_grad_kernel.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/phi/kernels/funcs/tril_triu_compute.h"
namespace phi {
template <typename T, typename Context>
void TrilTriuGradKernel(const Context& ctx,
const DenseTensor& out_grad,
int diagonal,
bool lower,
DenseTensor* x_grad) {
const auto* dout_data = out_grad.data<T>();
auto* dx_data = ctx.template Alloc<T>(x_grad);
const auto& dims = out_grad.dims();
const auto H = dims[dims.size() - 2];
const auto W = dims[dims.size() - 1];
phi::funcs::ForRange<Context> for_range(
ctx, static_cast<size_t>(out_grad.numel()));
phi::funcs::TrilTriuCompute<T> tril_triu_grad_computer(
dout_data, diagonal, lower, H, W, dx_data);
for_range(tril_triu_grad_computer);
}
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/kernels/tril_triu_kernel.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/phi/kernels/funcs/tril_triu_compute.h"
namespace phi {
template <typename T, typename Context>
void TrilTriuKernel(const Context& ctx,
const DenseTensor& x,
int diagonal,
bool lower,
DenseTensor* out) {
const auto* x_data = x.data<T>();
auto* out_data = ctx.template Alloc<T>(out);
const auto& dims = x.dims();
const auto H = dims[dims.size() - 2];
const auto W = dims[dims.size() - 1];
phi::funcs::ForRange<Context> for_range(ctx, static_cast<size_t>(x.numel()));
phi::funcs::TrilTriuCompute<T> tril_triu_computer(
x_data, diagonal, lower, H, W, out_data);
for_range(tril_triu_computer);
}
} // namespace phi
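For orientation, the element-wise rule that TrilTriuCompute applies over each trailing H x W matrix can be written as a plain host loop. This sketch assumes the usual numpy-style tril/triu semantics and is not the functor's actual source; the function name and float element type are illustrative.

#include <cstdint>

// Illustrative host-side tril/triu: keep the lower (or upper) band of every
// trailing H x W matrix, shifted by `diagonal`, and zero the rest.
void TrilTriuOnHost(const float* in,
                    float* out,
                    int64_t numel,
                    int diagonal,
                    bool lower,
                    int64_t H,
                    int64_t W) {
  for (int64_t index = 0; index < numel; ++index) {
    const int64_t row = (index / W) % H;  // row inside the trailing H x W matrix
    const int64_t col = index % W;        // column inside that matrix
    const bool keep = lower ? (col <= row + diagonal)   // tril keeps the lower band
                            : (col >= row + diagonal);  // triu keeps the upper band
    out[index] = keep ? in[index] : 0.0f;
  }
}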
...@@ -19,17 +19,11 @@
namespace phi {

template <typename T, typename Context>
void IndexSelectGradKernel(const Context& ctx,
                           const DenseTensor& x,
                           const DenseTensor& index,
                           const DenseTensor& out_grad,
                           int dim,
                           DenseTensor* x_grad);

} // namespace phi
...@@ -19,17 +19,10 @@
namespace phi {

template <typename T, typename Context>
void IndexSelectKernel(const Context& ctx,
                       const DenseTensor& x,
                       const DenseTensor& index,
                       int dim,
                       DenseTensor* output);

} // namespace phi
...@@ -19,27 +19,6 @@
namespace phi {

template <typename T, typename Context>
void AddKernel(const Context& dev_ctx,
               const DenseTensor& x,
...@@ -81,25 +60,6 @@ void MultiplyKernel(const Context& dev_ctx,
using complex64 = ::phi::dtype::complex<float>;
using complex128 = ::phi::dtype::complex<double>;

PD_REGISTER_KERNEL(add,
                   CPU,
                   ALL_LAYOUT,
...@@ -147,32 +107,7 @@ PD_REGISTER_KERNEL(multiply,
                   phi::dtype::bfloat16) {}

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_REGISTER_KERNEL(add,
                   GPU,
                   ALL_LAYOUT,
......
...@@ -16,43 +16,8 @@ limitations under the License. */
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/infermeta/binary.h"

namespace phi {

template <typename T, typename Context>
void AddRawKernel(const Context& dev_ctx,
                  const DenseTensor& x,
...@@ -149,29 +114,4 @@ DenseTensor Multiply(const Context& dev_ctx,
  return dense_out;
}

} // namespace phi
...@@ -19,17 +19,9 @@
namespace phi {

template <typename T, typename Context>
void MultiplexGradKernel(const Context& ctx,
                         const DenseTensor& ids,
                         const DenseTensor& out_grad,
                         std::vector<DenseTensor*> ins_grad);

} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void MultiplexKernel(const Context& ctx,
const std::vector<const DenseTensor*>& ins,
const DenseTensor& ids,
DenseTensor* out);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void QrKernel(const Context& ctx,
const DenseTensor& x,
const std::string& mode,
DenseTensor* q,
DenseTensor* r);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/reduce_all_kernel.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void AllKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
DenseTensor* out) {
bool reduce_all = false;
AllRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out);
}
} // namespace phi
PD_REGISTER_KERNEL(all, CPU, ALL_LAYOUT, phi::AllKernel, bool) {}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_REGISTER_KERNEL(all, GPU, ALL_LAYOUT, phi::AllKernel, bool) {}
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/reduce_kernel.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void SumKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
DataType out_dtype,
bool keep_dim,
DenseTensor* out) {
bool reduce_all = false;
SumRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out_dtype, out);
}
template <typename T, typename Context>
void MeanKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
DenseTensor* out) {
bool reduce_all = false;
MeanRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out);
}
template <typename T, typename Context>
void ProdKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
DenseTensor* out) {
bool reduce_all = false;
ProdRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out);
}
template <typename T, typename Context>
void MaxKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
DenseTensor* out) {
bool reduce_all = false;
MaxRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out);
}
template <typename T, typename Context>
void MinKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
DenseTensor* out) {
bool reduce_all = false;
MinRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out);
}
template <typename T, typename Context>
void AllKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
DenseTensor* out) {
bool reduce_all = false;
AllRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out);
}
template <typename T, typename Context>
void AnyKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
DenseTensor* out) {
bool reduce_all = false;
AnyRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out);
}
} // namespace phi
using complex64 = ::phi::dtype::complex<float>;
using complex128 = ::phi::dtype::complex<double>;
PD_REGISTER_KERNEL(
mean, CPU, ALL_LAYOUT, phi::MeanKernel, float, double, bool) {}
PD_REGISTER_KERNEL(sum,
CPU,
ALL_LAYOUT,
phi::SumKernel,
bool,
float,
double,
phi::dtype::float16,
int16_t,
int,
int64_t,
complex64,
complex128) {
kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
}
PD_REGISTER_KERNEL(
prod, CPU, ALL_LAYOUT, phi::ProdKernel, float, double, int, int64_t) {}
PD_REGISTER_KERNEL(
max, CPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {}
PD_REGISTER_KERNEL(
min, CPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {}
PD_REGISTER_KERNEL(all, CPU, ALL_LAYOUT, phi::AllKernel, bool) {}
PD_REGISTER_KERNEL(any, CPU, ALL_LAYOUT, phi::AnyKernel, bool) {}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_REGISTER_KERNEL(mean,
GPU,
ALL_LAYOUT,
phi::MeanKernel,
float,
double,
bool,
int,
int64_t,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(sum,
GPU,
ALL_LAYOUT,
phi::SumKernel,
bool,
float,
double,
phi::dtype::float16,
phi::dtype::bfloat16,
int16_t,
int,
int64_t,
complex64,
complex128) {
kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
}
PD_REGISTER_KERNEL(
prod, GPU, ALL_LAYOUT, phi::ProdKernel, float, double, int, int64_t) {}
PD_REGISTER_KERNEL(
max, GPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {}
PD_REGISTER_KERNEL(
min, GPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {}
PD_REGISTER_KERNEL(all, GPU, ALL_LAYOUT, phi::AllKernel, bool) {}
PD_REGISTER_KERNEL(any, GPU, ALL_LAYOUT, phi::AnyKernel, bool) {}
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/infermeta/unary.h"
#include "paddle/phi/kernels/empty_kernel.h"
namespace phi {
template <typename T, typename Context>
void SumRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DataType out_dtype,
DenseTensor* out);
template <typename T, typename Context>
void MeanRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out);
template <typename T, typename Context>
void ProdRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out);
template <typename T, typename Context>
void MaxRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out);
template <typename T, typename Context>
void MinRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out);
template <typename T, typename Context>
void AnyRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out);
template <typename T, typename Context>
void AllRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out);
template <typename T, typename Context>
void SumKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
DataType out_dtype,
bool keep_dim,
DenseTensor* out);
template <typename T, typename Context>
void MeanKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
DenseTensor* out);
template <typename T, typename Context>
void ProdKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
DenseTensor* out);
template <typename T, typename Context>
void MaxKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
DenseTensor* out);
template <typename T, typename Context>
void MinKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
DenseTensor* out);
template <typename T, typename Context>
void AnyKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
DenseTensor* out);
template <typename T, typename Context>
void AllKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
DenseTensor* out);
template <typename T, typename Context>
DenseTensor Mean(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& axis,
bool keep_dim) {
DenseTensor dense_out;
MetaTensor meta_out(&dense_out);
SumRawInferMeta(x, axis, keep_dim, false, x.dtype(), &meta_out);
MeanKernel<T, Context>(dev_ctx, x, axis, keep_dim, &dense_out);
return dense_out;
}
template <typename T, typename Context>
DenseTensor Sum(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& axis,
DataType dtype,
bool keep_dim) {
DenseTensor dense_out;
MetaTensor meta_out(&dense_out);
SumInferMeta(x, axis, dtype, keep_dim, &meta_out);
SumKernel<T, Context>(dev_ctx, x, axis, dtype, keep_dim, &dense_out);
return dense_out;
}
} // namespace phi
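As a usage note, the Mean and Sum helpers declared above let other phi kernels reduce a DenseTensor in a single call, with the output shape coming from the corresponding InferMeta. A minimal, hypothetical caller might look like the sketch below (the surrounding function is an illustration, not part of this commit).

#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/reduce_kernel.h"

namespace phi {

// Hypothetical helper: reduce x twice using the convenience wrappers above.
template <typename T, typename Context>
void ReduceTwiceExample(const Context& dev_ctx, const DenseTensor& x) {
  // Mean over axis 0, keeping the reduced dimension.
  DenseTensor m = Mean<T, Context>(dev_ctx, x, {0}, /*keep_dim=*/true);
  // Sum over axis 0, accumulating in the input dtype.
  DenseTensor s =
      Sum<T, Context>(dev_ctx, x, {0}, x.dtype(), /*keep_dim=*/false);
  (void)m;
  (void)s;
}

} // namespace phi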
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/reduce_max_kernel.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void MaxKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
DenseTensor* out) {
bool reduce_all = false;
MaxRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out);
}
} // namespace phi
PD_REGISTER_KERNEL(
max, CPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_REGISTER_KERNEL(
max, GPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {}
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/reduce_min_kernel.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void MinKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
DenseTensor* out) {
bool reduce_all = false;
MinRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out);
}
} // namespace phi
PD_REGISTER_KERNEL(
min, CPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_REGISTER_KERNEL(
min, GPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {}
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/utils/optional.h"
namespace phi {
template <typename T, typename Context>
void RoiAlignGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& boxes,
paddle::optional<const DenseTensor&> boxes_num,
const DenseTensor& out_grad,
int pooled_height,
int pooled_width,
float spatial_scale,
int sampling_ratio,
bool aligned,
DenseTensor* dx);
} // namespace phi
...@@ -20,7 +20,7 @@
namespace phi {

template <typename T, typename Context>
void RoiAlignKernel(const Context& dev_ctx,
                    const DenseTensor& x,
                    const DenseTensor& boxes,
                    paddle::optional<const DenseTensor&> boxes_num,
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/common/scalar_array.h"
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void RollGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& out_grad,
const ScalarArray& shifts,
const std::vector<int64_t>& axis,
DenseTensor* x_grad);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/common/scalar_array.h"
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void RollKernel(const Context& dev_ctx,
const DenseTensor& x,
const ScalarArray& shifts,
const std::vector<int64_t>& axis,
DenseTensor* out);
} // namespace phi
...@@ -27,7 +27,7 @@ void Conv3dGradKernel(const Context& dev_ctx,
                      const SparseCooTensor& x,
                      const DenseTensor& rulebook,
                      const DenseTensor& kernel,
                      const DenseTensor& out_grad,
                      const std::vector<int>& paddings,
                      const std::vector<int>& dilations,
                      const std::vector<int>& strides,
...@@ -41,7 +41,7 @@ std::vector<DenseTensor> Conv3dGrad(const Context& dev_ctx,
                                    const SparseCooTensor& x,
                                    const DenseTensor& rulebook,
                                    const DenseTensor& kernel,
                                    const DenseTensor& out_grad,
                                    const std::vector<int>& paddings,
                                    const std::vector<int>& dilations,
                                    const std::vector<int>& strides,
......
...@@ -34,7 +34,7 @@ using Dims4D = phi::funcs::sparse::Dims4D;
template <typename T, typename Context>
void ProductRuleBook(const Context& dev_ctx,
                     const SparseCooTensor& x,
                     const std::vector<int>& kernel_sizes,
                     const std::vector<int>& paddings,
                     const std::vector<int>& dilations,
                     const std::vector<int>& strides,
...@@ -42,19 +42,19 @@ void ProductRuleBook(const Context& dev_ctx,
                     const bool subm,
                     DenseTensor* rulebook,
                     DenseTensor* counter_per_kernel) {
  const int64_t non_zero_num = x.nnz();
  const auto& non_zero_indices = x.non_zero_indices();
  const int* indices_ptr = non_zero_indices.data<int>();
  int* counter_ptr = counter_per_kernel->data<int>();
  int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2];
  memset(counter_ptr, 0, kernel_size * sizeof(int));
  int rulebook_len = 0;
  // calc the rulebook_len
  const auto& x_dims = x.dims();
  const Dims4D c_x_dims(x_dims[0], x_dims[3], x_dims[2], x_dims[1]);
  const Dims4D c_kernel_dims(
      1, kernel_sizes[2], kernel_sizes[1], kernel_sizes[0]);
  const Dims4D c_out_dims(out_dims[0], out_dims[3], out_dims[2], out_dims[1]);
  const Dims4D c_paddings(1, paddings[2], paddings[1], paddings[0]);
  const Dims4D c_strides(1, strides[2], strides[1], strides[0]);
...@@ -75,9 +75,9 @@ void ProductRuleBook(const Context& dev_ctx,
  auto f_calc_rulebook = [&](int* rulebook_ptr) {
    int kernel_index = 0, rulebook_index = 0;
    for (int kz = 0; kz < kernel_sizes[0]; kz++) {
      for (int ky = 0; ky < kernel_sizes[1]; ky++) {
        for (int kx = 0; kx < kernel_sizes[2]; kx++) {
          ++kernel_index;
          for (int64_t i = 0; i < non_zero_num; i++) {
            int batch = indices_ptr[i];
......
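The rulebook built above is consumed with a gather / per-offset GEMM / scatter-add pattern: row 0 holds the kernel-offset id, row 1 the input feature index, and row 2 the output feature index (see the doc comment on ProductRuleBookKernel further below). The host-side sketch that follows is illustrative only; the names, the float element type, and the naive inner loops (the real kernels batch them as BLAS GEMMs) are assumptions, not the patch's code.

// Illustrative dense walk over a [3 x rulebook_len] rulebook.
void SparseConvViaRulebook(const float* in_features,  // [nnz_in, in_channels]
                           const float* kernel,       // [kernel_size, in_channels, out_channels]
                           const int* rulebook,
                           int rulebook_len,
                           int kernel_size,
                           int in_channels,
                           int out_channels,
                           float* out_features) {     // [nnz_out, out_channels], pre-zeroed
  for (int k = 0; k < kernel_size; ++k) {
    for (int r = 0; r < rulebook_len; ++r) {
      if (rulebook[r] != k) continue;                    // row 0: kernel offset id
      const int in_i = rulebook[rulebook_len + r];       // row 1: input index
      const int out_i = rulebook[2 * rulebook_len + r];  // row 2: output index
      // One (1 x in_channels) * (in_channels x out_channels) product,
      // accumulated (scatter-add) into the output row.
      for (int oc = 0; oc < out_channels; ++oc) {
        float acc = 0.0f;
        for (int ic = 0; ic < in_channels; ++ic) {
          acc += in_features[in_i * in_channels + ic] *
                 kernel[(k * in_channels + ic) * out_channels + oc];
        }
        out_features[out_i * out_channels + oc] += acc;
      }
    }
  }
}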
...@@ -33,7 +33,7 @@ void Conv3dGradKernel(const Context& dev_ctx,
                      const SparseCooTensor& x,
                      const DenseTensor& rulebook,
                      const DenseTensor& kernel,
                      const DenseTensor& out_grad,
                      const std::vector<int>& paddings,
                      const std::vector<int>& dilations,
                      const std::vector<int>& strides,
...@@ -113,7 +113,7 @@ void Conv3dGradKernel(const Context& dev_ctx,
            rulebook_len,
            in_channels,
            in_features_ptr);
  Gather<T>(out_grad.data<T>(),
            rulebook_ptr + rulebook_len * 2,
            rulebook_len,
            out_channels,
......
...@@ -44,8 +44,13 @@ void Conv3dKernel(const Context& dev_ctx,
  const auto& kernel_dims = kernel.dims();
  int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2];
  DDim out_dims = {1, 1, 1, 1, 1};
  std::vector<int> kernel_sizes(kernel_dims.size());
  for (int i = 0; i < kernel_dims.size(); i++) {
    kernel_sizes[i] = kernel_dims[i];
  }
  phi::funcs::sparse::GetOutShape(
      x_dims, kernel_sizes, paddings, dilations, strides, &out_dims);
  const int in_channels = kernel_dims[3];
  const int out_channels = kernel_dims[4];
...@@ -63,7 +68,7 @@ void Conv3dKernel(const Context& dev_ctx,
  ProductRuleBook<T, Context>(dev_ctx,
                              x,
                              kernel_sizes,
                              subm_paddings,
                              dilations,
                              subm_strides,
......
...@@ -23,11 +23,15 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/kernels/funcs/index_impl.cu.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/primitive/compute_primitives.h"
#include "paddle/phi/kernels/sparse/convolution_kernel.h"

namespace phi {
namespace sparse {

using Dims4D = phi::funcs::sparse::Dims4D;

// TODO(zhangkaihuo): After the GatherCUDAKernel is migrated to phi, replace
// this kernel with phi::GatherCUDAKernel;
// Vectorization can be used to improve read and write bandwidth
...@@ -139,5 +143,494 @@ inline int* SortedAndUniqueIndex(const Context& dev_ctx,
  return new_end.first;
}
template <typename T>
__global__ void SetFlagAndUpdateCounterKernel(const int* indexs,
const int n,
const int rulebook_len,
const int kernel_size,
T* rulebook_ptr,
int* counter_ptr) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
extern __shared__ int cache_count[]; // kernel_size
for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
cache_count[i] = 0;
}
__syncthreads();
for (int i = tid; i < n; i += gridDim.x * blockDim.x) {
int index = indexs[i];
int kernel_index = rulebook_ptr[index];
rulebook_ptr[index + rulebook_len] = -1;
rulebook_ptr[index + 2 * rulebook_len] = -1;
rulebook_ptr[index] = -1;
atomicAdd(&cache_count[kernel_index], 1);
}
__syncthreads();
for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
atomicSub(&counter_ptr[i], cache_count[i]);
}
}
/**
* @brief: update the out index and indices
* unique_keys: save the index of the output feature list
* unique_values: indicates the index of the key before deduplication
* out_indexs: indicates the position of the output index in the rulebook
* rulebook_len: indicates the length of rulebook
* out_dims: indicates the output dims
* out_indices: the indices of output, out_indices = IndexToPoint(unique_keys)
* rulebook_out_indexs: the output index in rulebook
**/
template <typename T>
__global__ void UpdateIndexKernel(const int* unique_keys,
const int* unique_values,
const int* out_indexs,
const int non_zero_num,
const int rulebook_len,
const Dims4D out_dims,
T* out_indices,
T* rulebook_out_indexs) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) {
const int index = unique_keys[i];
int batch, x, y, z;
phi::funcs::sparse::IndexToPoint<Dims4D>(
index, out_dims, &batch, &x, &y, &z);
// get out indices
out_indices[i] = batch;
out_indices[i + non_zero_num] = z;
out_indices[i + non_zero_num * 2] = y;
out_indices[i + non_zero_num * 3] = x;
// update rulebook
int start = unique_values[i];
int end = i == non_zero_num - 1 ? rulebook_len : unique_values[i + 1];
// max(end-start) = kernel_size
for (int j = start; j < end; j++) {
rulebook_out_indexs[out_indexs[j]] = i;
}
}
}
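// Hedged, standalone host-side sketch (not part of the original file): the
// decode that UpdateIndexKernel performs above, i.e. recovering
// (batch, z, y, x) from a flattened output index. A row-major
// (batch, z, y, x) layout is assumed here as a stand-in for the actual
// IndexToPoint / Dims4D convention, which is defined elsewhere.
inline void IndexToPointSketch(int index,
                               const int out[3],  // {depth, height, width}
                               int* batch,
                               int* x,
                               int* y,
                               int* z) {
  *x = index % out[2];
  index /= out[2];
  *y = index % out[1];
  index /= out[1];
  *z = index % out[0];
  index /= out[0];
  *batch = index;
}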
// brief: calculate the distance between start and end
template <typename T>
__global__ void DistanceKernel(const T* start, const T* end, int* distance) {
if (threadIdx.x == 0) {
*distance = end - start;
}
}
/**
* @brief product rulebook
* for input_i in x_indices:
* if input_i participates in the convolution calculation:
* infer the output_i by input_i and kernel_i
* save output_i
*
* x_indices: the indices of input features
* x_dims: the input dims
* kernel_dims: the kernel dims
* out_dims: the output dims
* non_zero_num: the number of input features
* rulebook: the rulebook to save the kernel index, input index and output index
* counter: save the number of times each location in the kernel participates in
*the calculation
**/
template <typename T>
__global__ void ProductRuleBookKernel(const T* x_indices,
const Dims4D x_dims,
const Dims4D kernel_dims,
const Dims4D out_dims,
const int64_t non_zero_num,
const Dims4D paddings,
const Dims4D dilations,
const Dims4D strides,
const bool subm,
T* rulebook,
int* counter,
int* in_indexs) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
extern __shared__ int counter_buf[]; // kernel_size
const int kernel_size = kernel_dims[3] * kernel_dims[2] * kernel_dims[1];
const int offset = kernel_size * non_zero_num;
for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
counter_buf[i] = 0;
}
__syncthreads();
for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) {
int kernel_index = 0;
int batch = x_indices[i];
int in_z = x_indices[i + non_zero_num];
int in_y = x_indices[i + 2 * non_zero_num];
int in_x = x_indices[i + 3 * non_zero_num];
if (subm) {
in_indexs[i] = PointToIndex(batch, in_x, in_y, in_z, x_dims);
}
for (int kz = 0; kz < kernel_dims[1]; kz++) {
for (int ky = 0; ky < kernel_dims[2]; ky++) {
for (int kx = 0; kx < kernel_dims[3]; kx++) {
int in_i = -1, out_index = -1, kernel_i = -1;
if (phi::funcs::sparse::Check(x_dims,
kernel_dims,
paddings,
dilations,
strides,
in_x,
in_y,
in_z,
kx,
ky,
kz)) {
int out_z = (in_z + paddings[1] - kz * dilations[1]) / strides[1];
int out_y = (in_y + paddings[2] - ky * dilations[2]) / strides[2];
int out_x = (in_x + paddings[3] - kx * dilations[3]) / strides[3];
in_i = i;
out_index = phi::funcs::sparse::PointToIndex<Dims4D>(
batch, out_x, out_y, out_z, out_dims);
atomicAdd(&counter_buf[kernel_index], 1);
kernel_i = kernel_index;
}
rulebook[kernel_index * non_zero_num + i] = kernel_i;
rulebook[kernel_index * non_zero_num + offset + i] = in_i;
rulebook[kernel_index * non_zero_num + offset * 2 + i] = out_index;
++kernel_index;
}
}
}
}
__syncthreads();
for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
atomicAdd(&counter[i], counter_buf[i]);
}
}
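// Hedged, standalone host-side sketch (not part of the original file): for a
// single input point, enumerate the kernel offsets and emit
// (kernel_index, in_index, out_index) triples, mirroring the triple loop in
// ProductRuleBookKernel above. The bounds / stride-alignment test below is an
// assumption standing in for phi::funcs::sparse::Check, and the flattening is
// an assumed row-major (batch, z, y, x) layout standing in for PointToIndex.
struct RuleBookEntrySketch {
  int kernel_i;
  int in_i;
  int out_i;
};

inline bool AlignedSketch(
    int in, int pad, int k, int dilation, int stride, int out_extent) {
  int numerator = in + pad - k * dilation;
  if (numerator < 0 || numerator % stride != 0) return false;
  return numerator / stride < out_extent;
}

// Writes at most kd*kh*kw entries into `entries` and returns how many were set.
inline int RuleBookForPointSketch(int i,
                                  int batch,
                                  int in_x,
                                  int in_y,
                                  int in_z,
                                  const int kernel[3],     // {kd, kh, kw}
                                  const int paddings[3],   // {pd, ph, pw}
                                  const int dilations[3],
                                  const int strides[3],
                                  const int out[3],        // {od, oh, ow}
                                  RuleBookEntrySketch* entries) {
  int count = 0;
  int kernel_index = 0;
  for (int kz = 0; kz < kernel[0]; kz++) {
    for (int ky = 0; ky < kernel[1]; ky++) {
      for (int kx = 0; kx < kernel[2]; kx++, kernel_index++) {
        if (!AlignedSketch(in_z, paddings[0], kz, dilations[0], strides[0], out[0]) ||
            !AlignedSketch(in_y, paddings[1], ky, dilations[1], strides[1], out[1]) ||
            !AlignedSketch(in_x, paddings[2], kx, dilations[2], strides[2], out[2])) {
          continue;  // the CUDA kernel writes -1 here and removes it later
        }
        int out_z = (in_z + paddings[0] - kz * dilations[0]) / strides[0];
        int out_y = (in_y + paddings[1] - ky * dilations[1]) / strides[1];
        int out_x = (in_x + paddings[2] - kx * dilations[2]) / strides[2];
        int out_index = ((batch * out[0] + out_z) * out[1] + out_y) * out[2] + out_x;
        entries[count].kernel_i = kernel_index;
        entries[count].in_i = i;
        entries[count].out_i = out_index;
        count++;
      }
    }
  }
  return count;
}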
// for the basic algorithm, refer to convolution_kernel.cc or
// the second paper
// example:
// 1. the rulebook:
// the kernel_index: 0, 0, 0, 1, 1, 1, 2, 2, ....
// the out_index(key): 20, 30, 33, 30, 33, 20, 25
// 2. mark the index of out_index(value): 0, 1, 2, 3, 4, 5, 6, ....
// 3. sort the (key, value) pairs
// 4. unique the (key, value):
// unique_key: 20, 25, 30, 33
// unique_values: 0, 2, 3, 5
// the index of unique_values is: 0, 1, 2, 3
// 5. update the out_index by unique_key, unique_value and the index of
// unique_value:
// the new out_index: 0, 2, 3, 2, 3, 0, 1
template <typename T, typename Context>
int ProductRuleBook(const Context& dev_ctx,
const SparseCooTensor& x,
const std::vector<int>& kernel_sizes,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const DDim& out_dims,
const bool subm,
DenseTensor* rulebook,
DenseTensor* counter_per_kernel,
DenseTensor* offsets_per_kernel,
DenseTensor* out_index,
DenseTensor* unique_key,
DenseTensor* unique_value,
SparseCooTensor* out,
std::vector<int>* h_counter,
std::vector<int>* h_offsets) {
const int64_t non_zero_num = x.nnz();
const auto& non_zero_indices = x.non_zero_indices();
const int* indices_ptr = non_zero_indices.data<int>();
DenseTensor in_indexs = phi::Empty<Context>(
dev_ctx, DenseTensorMeta(DataType::INT32, {x.nnz()}, DataLayout::NCHW));
int* counter_ptr = counter_per_kernel->data<int>();
int* offsets_ptr = offsets_per_kernel->data<int>();
int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2];
const int rulebook_rows = 3;
const int rulebook_cols = kernel_size * non_zero_num;
rulebook->ResizeAndAllocate({rulebook_rows, rulebook_cols});
int* rulebook_ptr = rulebook->data<int>();
const auto x_dims = x.dims();
Dims4D d_x_dims(x_dims[0], x_dims[3], x_dims[2], x_dims[1]);
Dims4D d_kernel_dims(1, kernel_sizes[2], kernel_sizes[1], kernel_sizes[0]);
Dims4D d_out_dims(out_dims[0], out_dims[3], out_dims[2], out_dims[1]);
Dims4D d_paddings(1, paddings[2], paddings[1], paddings[0]);
Dims4D d_strides(1, strides[2], strides[1], strides[0]);
Dims4D d_dilations(1, dilations[2], dilations[1], dilations[0]);
// 1. product rule book
phi::funcs::SetConstant<Context, int> set_zero;
set_zero(dev_ctx, counter_per_kernel, 0);
auto config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1);
ProductRuleBookKernel<int><<<config.block_per_grid.x,
config.thread_per_block.x,
kernel_size * sizeof(int),
dev_ctx.stream()>>>(indices_ptr,
d_x_dims,
d_kernel_dims,
d_out_dims,
non_zero_num,
d_paddings,
d_dilations,
d_strides,
subm,
rulebook_ptr,
counter_ptr,
in_indexs.data<int>());
// 2. remove -1
#ifdef PADDLE_WITH_HIP
int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()),
#else
int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()),
#endif
rulebook_ptr,
rulebook_ptr + rulebook_rows * rulebook_cols,
-1);
DistanceKernel<int><<<1, 1, 0, dev_ctx.stream()>>>(
rulebook_ptr, last, rulebook_ptr + 3 * kernel_size * non_zero_num - 1);
int rulebook_len = 0;
phi::backends::gpu::GpuMemcpyAsync(
&rulebook_len,
rulebook_ptr + 3 * kernel_size * non_zero_num - 1,
sizeof(int),
#ifdef PADDLE_WITH_HIP
hipMemcpyDeviceToHost,
#else
cudaMemcpyDeviceToHost,
#endif
dev_ctx.stream());
rulebook_len /= 3;
dev_ctx.Wait();
if (subm) {
// At present, a hashtable is not used to map the input and output indexes.
// Instead, the intermediate output indexes are generated by a normal
// convolution, and then the set difference between the intermediate output
// indexes and the input indexes is taken to obtain the rulebook entries
// that must be removed.
// get difference
int32_t* A_key_ptr = rulebook_ptr + 2 * rulebook_len;
int32_t* B_key_ptr = in_indexs.data<int>();
DenseTensor A_val = phi::Empty<Context>(
dev_ctx,
DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW));
DenseTensor B_val = phi::Empty<Context>(
dev_ctx, DenseTensorMeta(DataType::INT32, {x.nnz()}, DataLayout::NCHW));
phi::IndexKernel<int, kps::IdentityFunctor<int>>(
dev_ctx, &A_val, kps::IdentityFunctor<int>());
phi::IndexKernel<int, kps::IdentityFunctor<int>>(
dev_ctx, &B_val, kps::IdentityFunctor<int>());
DenseTensor key_result = phi::Empty<Context>(
dev_ctx,
DenseTensorMeta(DataType::INT32, {rulebook_len + 1}, DataLayout::NCHW));
DenseTensor val_result = phi::Empty<Context>(
dev_ctx,
DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW));
#ifdef PADDLE_WITH_HIP
thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()),
#else
thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()),
#endif
counter_ptr,
counter_ptr + kernel_size,
offsets_ptr);
std::vector<int> offsets(kernel_size, 0);
// TODO(zhangkaihuo): use a unified memcpy interface
phi::backends::gpu::GpuMemcpyAsync(offsets.data(),
offsets_ptr,
kernel_size * sizeof(int),
#ifdef PADDLE_WITH_HIP
hipMemcpyDeviceToHost,
#else
cudaMemcpyDeviceToHost,
#endif
dev_ctx.stream());
dev_ctx.Wait();
thrust::pair<int*, int*> end;
// Because set_diff does not support duplicate data, set_diff is performed
// separately for each segment of data.
// TODO(zhangkaihuo): Using a hashtable here may give better performance;
// further tests are needed.
for (int i = 0; i < kernel_size; i++) {
int start = offsets[i];
int stop = i == kernel_size - 1 ? rulebook_len : offsets[i + 1];
int* key_result_start = (i == 0 ? key_result.data<int>() : end.first);
int* val_result_start = i == 0 ? val_result.data<int>() : end.second;
end =
#ifdef PADDLE_WITH_HIP
thrust::set_difference_by_key(thrust::hip::par.on(dev_ctx.stream()),
#else
thrust::set_difference_by_key(thrust::cuda::par.on(dev_ctx.stream()),
#endif
A_key_ptr + start,
A_key_ptr + stop,
B_key_ptr,
B_key_ptr + x.nnz(),
A_val.data<int>() + start,
B_val.data<int>(),
key_result_start,
val_result_start);
}
DistanceKernel<int><<<1, 1, 0, dev_ctx.stream()>>>(
key_result.data<int>(),
end.first,
key_result.data<int>() + rulebook_len);
int len = 0;
phi::backends::gpu::GpuMemcpyAsync(&len,
key_result.data<int>() + rulebook_len,
sizeof(int),
#ifdef PADDLE_WITH_HIP
hipMemcpyDeviceToHost,
#else
cudaMemcpyDeviceToHost,
#endif
dev_ctx.stream());
dev_ctx.Wait();
// set the diff value = -1, and update counter
auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, len, 1);
SetFlagAndUpdateCounterKernel<int><<<config.block_per_grid.x,
config.thread_per_block,
kernel_size * sizeof(int),
dev_ctx.stream()>>>(
val_result.data<int>(),
len,
rulebook_len,
kernel_size,
rulebook_ptr,
counter_ptr);
// remove -1
#ifdef PADDLE_WITH_HIP
int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()),
#else
int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()),
#endif
rulebook_ptr,
rulebook_ptr + 3 * rulebook_len,
-1);
DistanceKernel<int><<<1, 1, 0, dev_ctx.stream()>>>(
rulebook_ptr, last, key_result.data<int>() + rulebook_len);
phi::backends::gpu::GpuMemcpyAsync(&rulebook_len,
key_result.data<int>() + rulebook_len,
sizeof(int),
#ifdef PADDLE_WITH_HIP
hipMemcpyDeviceToHost,
#else
cudaMemcpyDeviceToHost,
#endif
dev_ctx.stream());
dev_ctx.Wait();
rulebook_len /= 3;
}
#ifdef PADDLE_WITH_HIP
thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()),
#else
thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()),
#endif
counter_ptr,
counter_ptr + kernel_size,
offsets_ptr);
#ifdef PADDLE_WITH_HIP
phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0],
counter_ptr,
kernel_size * sizeof(int),
hipMemcpyDeviceToHost,
dev_ctx.stream());
phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0],
offsets_ptr,
kernel_size * sizeof(int),
hipMemcpyDeviceToHost,
dev_ctx.stream());
#else
phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0],
counter_ptr,
kernel_size * sizeof(int),
cudaMemcpyDeviceToHost,
dev_ctx.stream());
phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0],
offsets_ptr,
kernel_size * sizeof(int),
cudaMemcpyDeviceToHost,
dev_ctx.stream());
#endif
rulebook->Resize({rulebook_rows, rulebook_len});
// 3. sort or merge the out index
out_index->ResizeAndAllocate({rulebook_len});
unique_value->ResizeAndAllocate({rulebook_len});
unique_key->ResizeAndAllocate({rulebook_len});
int* out_index_ptr = out_index->data<int>();
int* unique_value_ptr = unique_value->data<int>();
int* unique_key_ptr = unique_key->data<int>();
int* new_end = SortedAndUniqueIndex(dev_ctx,
rulebook_ptr + 2 * rulebook_len,
rulebook_len,
out_index,
unique_key,
unique_value);
// thrust::distance doesn't support stream parameters
// const int out_non_zero_num = thrust::distance(unique_key_ptr,
// new_end.first);
DistanceKernel<int><<<1, 1>>>(
unique_key_ptr,
new_end,
rulebook_ptr + rulebook_rows * rulebook_cols - 1);
int out_non_zero_num = 0;
#ifdef PADDLE_WITH_HIP
phi::backends::gpu::GpuMemcpyAsync(
&out_non_zero_num,
rulebook_ptr + rulebook_rows * rulebook_cols - 1,
sizeof(int),
hipMemcpyDeviceToHost,
dev_ctx.stream());
#else
phi::backends::gpu::GpuMemcpyAsync(
&out_non_zero_num,
rulebook_ptr + rulebook_rows * rulebook_cols - 1,
sizeof(int),
cudaMemcpyDeviceToHost,
dev_ctx.stream());
#endif
dev_ctx.Wait();
// 5. update out_indices and rulebook by unique_value_ptr
const int64_t sparse_dim = 4;
DenseTensorMeta indices_meta(
DataType::INT32, {sparse_dim, out_non_zero_num}, DataLayout::NCHW);
DenseTensorMeta values_meta(
x.dtype(), {out_non_zero_num, kernel_sizes[4]}, x.layout());
phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta));
phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta));
int* out_indices_ptr = out_indices.data<int>();
config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_non_zero_num, 1);
UpdateIndexKernel<int><<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(unique_key_ptr,
unique_value_ptr,
out_index_ptr,
out_non_zero_num,
rulebook_len,
d_out_dims,
out_indices_ptr,
rulebook_ptr + 2 * rulebook_len);
out->SetMember(out_indices, out_values, out_dims, true);
return rulebook_len;
}
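// Hedged, standalone host-side sketch (not part of the original file) of steps
// 3-5 in the numbered comment above ProductRuleBook: collect the distinct out
// indexes in ascending order and rewrite every rulebook entry with its compact
// id. A simple O(n^2) scan replaces the thrust sort/unique + UpdateIndexKernel
// path used on the GPU; it is only meant to make the example in that comment
// concrete. `unique_keys` must have room for `len` entries.
//
// Example from the comment: out_index = {20, 30, 33, 30, 33, 20, 25}
//   unique_keys becomes {20, 25, 30, 33}
//   out_index becomes   {0, 2, 3, 2, 3, 0, 1}
inline int CompactOutIndexSketch(int* out_index, int len, int* unique_keys) {
  int num_unique = 0;
  // 1. collect distinct keys (insertion keeps them sorted ascending)
  for (int i = 0; i < len; i++) {
    int pos = 0;
    while (pos < num_unique && unique_keys[pos] < out_index[i]) pos++;
    if (pos < num_unique && unique_keys[pos] == out_index[i]) continue;
    for (int j = num_unique; j > pos; j--) unique_keys[j] = unique_keys[j - 1];
    unique_keys[pos] = out_index[i];
    num_unique++;
  }
  // 2. rewrite each entry with the position of its key in unique_keys
  for (int i = 0; i < len; i++) {
    for (int pos = 0; pos < num_unique; pos++) {
      if (unique_keys[pos] == out_index[i]) {
        out_index[i] = pos;
        break;
      }
    }
  }
  return num_unique;  // == number of output non-zeros
}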
} // namespace sparse } // namespace sparse
} // namespace phi } // namespace phi
...@@ -38,7 +38,7 @@ void Conv3dGradKernel(const Context& dev_ctx, ...@@ -38,7 +38,7 @@ void Conv3dGradKernel(const Context& dev_ctx,
const SparseCooTensor& x, const SparseCooTensor& x,
const DenseTensor& rulebook, const DenseTensor& rulebook,
const DenseTensor& kernel, const DenseTensor& kernel,
const SparseCooTensor& out_grad, const DenseTensor& out_grad,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const std::vector<int>& dilations, const std::vector<int>& dilations,
const std::vector<int>& strides, const std::vector<int>& strides,
...@@ -140,8 +140,7 @@ void Conv3dGradKernel(const Context& dev_ctx, ...@@ -140,8 +140,7 @@ void Conv3dGradKernel(const Context& dev_ctx,
GatherKernel<T, int><<<config.block_per_grid.x, GatherKernel<T, int><<<config.block_per_grid.x,
config.thread_per_block.x, config.thread_per_block.x,
0, 0,
dev_ctx.stream()>>>( dev_ctx.stream()>>>(out_grad.data<T>(),
out_grad.non_zero_elements().data<T>(),
rulebook_ptr + rulebook_len * 2, rulebook_ptr + rulebook_len * 2,
out_grad_features_ptr, out_grad_features_ptr,
rulebook_len, rulebook_len,
......
...@@ -12,515 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,515 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <thrust/execution_policy.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/core/tensor_meta.h"
#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/index_impl.cu.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/primitive/compute_primitives.h"
#include "paddle/phi/kernels/sparse/convolution_kernel.h" #include "paddle/phi/kernels/sparse/convolution_kernel.h"
#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" #include "paddle/phi/kernels/sparse/gpu/convolution.cu.h"
namespace phi { namespace phi {
namespace sparse { namespace sparse {
using Dims4D = phi::funcs::sparse::Dims4D;
__global__ void SetFlagAndUpdateCounterKernel(const int* indexs,
const int n,
const int rulebook_len,
const int kernel_size,
int* rulebook_ptr,
int* counter_ptr) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
extern __shared__ int cache_count[]; // kernel_size
for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
cache_count[i] = 0;
}
__syncthreads();
for (int i = tid; i < n; i += gridDim.x * blockDim.x) {
int index = indexs[i];
int kernel_index = rulebook_ptr[index];
rulebook_ptr[index + rulebook_len] = -1;
rulebook_ptr[index + 2 * rulebook_len] = -1;
rulebook_ptr[index] = -1;
atomicAdd(&cache_count[kernel_index], 1);
}
__syncthreads();
for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
atomicSub(&counter_ptr[i], cache_count[i]);
}
}
/**
* @brief: update the out index and indices
* unique_keys: save the index of the output feature list
* unique_values: indicates the index of the key before deduplication
* out_indexs: indicates the position of the output index in the rulebook
* rulebook_len: indicates the length of rulebook
* out_dims: indicates the output dims
* out_indices: the indices of output, out_indices = IndexToPoint(unique_keys)
* rulebook_out_indexs: the output index in rulebook
**/
__global__ void UpdateIndexKernel(const int* unique_keys,
const int* unique_values,
const int* out_indexs,
const int non_zero_num,
const int rulebook_len,
const Dims4D out_dims,
int* out_indices,
int* rulebook_out_indexs) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) {
const int index = unique_keys[i];
int batch, x, y, z;
phi::funcs::sparse::IndexToPoint<Dims4D>(
index, out_dims, &batch, &x, &y, &z);
// get out indices
out_indices[i] = batch;
out_indices[i + non_zero_num] = z;
out_indices[i + non_zero_num * 2] = y;
out_indices[i + non_zero_num * 3] = x;
// update rulebook
int start = unique_values[i];
int end = i == non_zero_num - 1 ? rulebook_len : unique_values[i + 1];
// max(end-start) = kernel_size
for (int j = start; j < end; j++) {
rulebook_out_indexs[out_indexs[j]] = i;
}
}
}
/**
* @brief product rulebook
* for input_i in x_indices:
* if input_i participates in the convolution calculation:
* infer the output_i by input_i and kernel_i
* save output_i
*
* x_indices: the indices of input features
* x_dims: the input dims
* kernel_dims: the kernel dims
* out_dims: the output dims
* non_zero_num: the number of input features
* rulebook: the rulebook to save the kernel index, input index and output index
* counter: save the number of times each location in the kernel participates in
*the calculation
**/
__global__ void ProductRuleBookKernel(const int* x_indices,
const Dims4D x_dims,
const Dims4D kernel_dims,
const Dims4D out_dims,
const int64_t non_zero_num,
const Dims4D paddings,
const Dims4D dilations,
const Dims4D strides,
const bool subm,
int* rulebook,
int* counter,
int* in_indexs) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
extern __shared__ int counter_buf[]; // kernel_size
const int kernel_size = kernel_dims[3] * kernel_dims[2] * kernel_dims[1];
const int offset = kernel_size * non_zero_num;
for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
counter_buf[i] = 0;
}
__syncthreads();
for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) {
int kernel_index = 0;
int batch = x_indices[i];
int in_z = x_indices[i + non_zero_num];
int in_y = x_indices[i + 2 * non_zero_num];
int in_x = x_indices[i + 3 * non_zero_num];
if (subm) {
in_indexs[i] = PointToIndex(batch, in_x, in_y, in_z, x_dims);
}
for (int kz = 0; kz < kernel_dims[1]; kz++) {
for (int ky = 0; ky < kernel_dims[2]; ky++) {
for (int kx = 0; kx < kernel_dims[3]; kx++) {
int in_i = -1, out_index = -1, kernel_i = -1;
if (phi::funcs::sparse::Check(x_dims,
kernel_dims,
paddings,
dilations,
strides,
in_x,
in_y,
in_z,
kx,
ky,
kz)) {
int out_z = (in_z + paddings[1] - kz * dilations[1]) / strides[1];
int out_y = (in_y + paddings[2] - ky * dilations[2]) / strides[2];
int out_x = (in_x + paddings[3] - kx * dilations[3]) / strides[3];
in_i = i;
out_index = phi::funcs::sparse::PointToIndex<Dims4D>(
batch, out_x, out_y, out_z, out_dims);
atomicAdd(&counter_buf[kernel_index], 1);
kernel_i = kernel_index;
}
rulebook[kernel_index * non_zero_num + i] = kernel_i;
rulebook[kernel_index * non_zero_num + offset + i] = in_i;
rulebook[kernel_index * non_zero_num + offset * 2 + i] = out_index;
++kernel_index;
}
}
}
}
__syncthreads();
for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
atomicAdd(&counter[i], counter_buf[i]);
}
}
// brief: calculate the distance between start and end
__global__ void DistanceKernel(const int* start,
const int* end,
int* distance) {
if (threadIdx.x == 0) {
*distance = end - start;
}
}
// for the basic algorithm, refer to convolution_kernel.cc or
// the second paper
// example:
// 1. the rulebook:
// the kernel_index: 0, 0, 0, 1, 1, 1, 2, 2, ....
// the out_index(key): 20, 30, 33, 30, 33, 20, 25
// 2. mark the index of out_index(value): 0, 1, 2, 3, 4, 5, 6, ....
// 3. sort the (key, value) pairs
// 4. unique the (key, value):
// unique_key: 20, 25, 30, 33
// unique_values: 0, 2, 3, 5
// the index of unique_values is: 0, 1, 2, 3
// 5. update the out_index by unique_key, unique_value and the index of
// unique_value:
// the new out_index: 0, 2, 3, 2, 3, 0, 1
template <typename T, typename Context>
int ProductRuleBook(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& kernel,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const DDim& out_dims,
const bool subm,
DenseTensor* rulebook,
DenseTensor* counter_per_kernel,
DenseTensor* offsets_per_kernel,
DenseTensor* out_index,
DenseTensor* unique_key,
DenseTensor* unique_value,
SparseCooTensor* out,
std::vector<int>* h_counter,
std::vector<int>* h_offsets) {
const auto& kernel_dims = kernel.dims();
const int64_t non_zero_num = x.nnz();
const auto& non_zero_indices = x.non_zero_indices();
const int* indices_ptr = non_zero_indices.data<int>();
DenseTensor in_indexs = phi::Empty<Context>(
dev_ctx, DenseTensorMeta(DataType::INT32, {x.nnz()}, DataLayout::NCHW));
int* counter_ptr = counter_per_kernel->data<int>();
int* offsets_ptr = offsets_per_kernel->data<int>();
int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2];
const int rulebook_rows = 3;
const int rulebook_cols = kernel_size * non_zero_num;
rulebook->ResizeAndAllocate({rulebook_rows, rulebook_cols});
int* rulebook_ptr = rulebook->data<int>();
const auto x_dims = x.dims();
Dims4D d_x_dims(x_dims[0], x_dims[3], x_dims[2], x_dims[1]);
Dims4D d_kernel_dims(1, kernel_dims[2], kernel_dims[1], kernel_dims[0]);
Dims4D d_out_dims(out_dims[0], out_dims[3], out_dims[2], out_dims[1]);
Dims4D d_paddings(1, paddings[2], paddings[1], paddings[0]);
Dims4D d_strides(1, strides[2], strides[1], strides[0]);
Dims4D d_dilations(1, dilations[2], dilations[1], dilations[0]);
// 1. product rule book
phi::funcs::SetConstant<Context, int> set_zero;
set_zero(dev_ctx, counter_per_kernel, 0);
auto config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1);
ProductRuleBookKernel<<<config.block_per_grid.x,
config.thread_per_block.x,
kernel_size * sizeof(int),
dev_ctx.stream()>>>(indices_ptr,
d_x_dims,
d_kernel_dims,
d_out_dims,
non_zero_num,
d_paddings,
d_dilations,
d_strides,
subm,
rulebook_ptr,
counter_ptr,
in_indexs.data<int>());
// 2. remove -1
#ifdef PADDLE_WITH_HIP
int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()),
#else
int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()),
#endif
rulebook_ptr,
rulebook_ptr + rulebook_rows * rulebook_cols,
-1);
DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>(
rulebook_ptr, last, rulebook_ptr + 3 * kernel_size * non_zero_num - 1);
int rulebook_len = 0;
phi::backends::gpu::GpuMemcpyAsync(
&rulebook_len,
rulebook_ptr + 3 * kernel_size * non_zero_num - 1,
sizeof(int),
#ifdef PADDLE_WITH_HIP
hipMemcpyDeviceToHost,
#else
cudaMemcpyDeviceToHost,
#endif
dev_ctx.stream());
rulebook_len /= 3;
dev_ctx.Wait();
if (subm) {
// At present, a hashtable is not used to map the input and output indexes.
// Instead, the intermediate output indexes are generated by a normal
// convolution, and then the set difference between the intermediate output
// indexes and the input indexes is taken to obtain the rulebook entries
// that must be removed.
// get difference
int32_t* A_key_ptr = rulebook_ptr + 2 * rulebook_len;
int32_t* B_key_ptr = in_indexs.data<int>();
DenseTensor A_val = phi::Empty<Context>(
dev_ctx,
DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW));
DenseTensor B_val = phi::Empty<Context>(
dev_ctx, DenseTensorMeta(DataType::INT32, {x.nnz()}, DataLayout::NCHW));
phi::IndexKernel<int, kps::IdentityFunctor<int>>(
dev_ctx, &A_val, kps::IdentityFunctor<int>());
phi::IndexKernel<int, kps::IdentityFunctor<int>>(
dev_ctx, &B_val, kps::IdentityFunctor<int>());
DenseTensor key_result = phi::Empty<Context>(
dev_ctx,
DenseTensorMeta(DataType::INT32, {rulebook_len + 1}, DataLayout::NCHW));
DenseTensor val_result = phi::Empty<Context>(
dev_ctx,
DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW));
#ifdef PADDLE_WITH_HIP
thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()),
#else
thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()),
#endif
counter_ptr,
counter_ptr + kernel_size,
offsets_ptr);
std::vector<int> offsets(kernel_size, 0);
// TODO(zhangkaihuo): use a unified memcpy interface
phi::backends::gpu::GpuMemcpyAsync(offsets.data(),
offsets_ptr,
kernel_size * sizeof(int),
#ifdef PADDLE_WITH_HIP
hipMemcpyDeviceToHost,
#else
cudaMemcpyDeviceToHost,
#endif
dev_ctx.stream());
dev_ctx.Wait();
thrust::pair<int*, int*> end;
// Because set_diff does not support duplicate data, set_diff is performed
// separately for each segment of data.
// TODO(zhangkaihuo): Using a hashtable here may give better performance;
// further tests are needed.
for (int i = 0; i < kernel_size; i++) {
int start = offsets[i];
int stop = i == kernel_size - 1 ? rulebook_len : offsets[i + 1];
int* key_result_start = (i == 0 ? key_result.data<int>() : end.first);
int* val_result_start = i == 0 ? val_result.data<int>() : end.second;
end =
#ifdef PADDLE_WITH_HIP
thrust::set_difference_by_key(thrust::hip::par.on(dev_ctx.stream()),
#else
thrust::set_difference_by_key(thrust::cuda::par.on(dev_ctx.stream()),
#endif
A_key_ptr + start,
A_key_ptr + stop,
B_key_ptr,
B_key_ptr + x.nnz(),
A_val.data<int>() + start,
B_val.data<int>(),
key_result_start,
val_result_start);
}
DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>(
key_result.data<int>(),
end.first,
key_result.data<int>() + rulebook_len);
int len = 0;
phi::backends::gpu::GpuMemcpyAsync(&len,
key_result.data<int>() + rulebook_len,
sizeof(int),
#ifdef PADDLE_WITH_HIP
hipMemcpyDeviceToHost,
#else
cudaMemcpyDeviceToHost,
#endif
dev_ctx.stream());
dev_ctx.Wait();
// set the diff value = -1, and update counter
auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, len, 1);
SetFlagAndUpdateCounterKernel<<<config.block_per_grid.x,
config.thread_per_block,
kernel_size * sizeof(int),
dev_ctx.stream()>>>(val_result.data<int>(),
len,
rulebook_len,
kernel_size,
rulebook_ptr,
counter_ptr);
// remove -1
#ifdef PADDLE_WITH_HIP
int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()),
#else
int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()),
#endif
rulebook_ptr,
rulebook_ptr + 3 * rulebook_len,
-1);
DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>(
rulebook_ptr, last, key_result.data<int>() + rulebook_len);
phi::backends::gpu::GpuMemcpyAsync(&rulebook_len,
key_result.data<int>() + rulebook_len,
sizeof(int),
#ifdef PADDLE_WITH_HIP
hipMemcpyDeviceToHost,
#else
cudaMemcpyDeviceToHost,
#endif
dev_ctx.stream());
dev_ctx.Wait();
rulebook_len /= 3;
}
#ifdef PADDLE_WITH_HIP
thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()),
#else
thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()),
#endif
counter_ptr,
counter_ptr + kernel_size,
offsets_ptr);
#ifdef PADDLE_WITH_HIP
phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0],
counter_ptr,
kernel_size * sizeof(int),
hipMemcpyDeviceToHost,
dev_ctx.stream());
phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0],
offsets_ptr,
kernel_size * sizeof(int),
hipMemcpyDeviceToHost,
dev_ctx.stream());
#else
phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0],
counter_ptr,
kernel_size * sizeof(int),
cudaMemcpyDeviceToHost,
dev_ctx.stream());
phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0],
offsets_ptr,
kernel_size * sizeof(int),
cudaMemcpyDeviceToHost,
dev_ctx.stream());
#endif
rulebook->Resize({rulebook_rows, rulebook_len});
// 3. sort or merge the out index
out_index->ResizeAndAllocate({rulebook_len});
unique_value->ResizeAndAllocate({rulebook_len});
unique_key->ResizeAndAllocate({rulebook_len});
int* out_index_ptr = out_index->data<int>();
int* unique_value_ptr = unique_value->data<int>();
int* unique_key_ptr = unique_key->data<int>();
int* new_end = SortedAndUniqueIndex(dev_ctx,
rulebook_ptr + 2 * rulebook_len,
rulebook_len,
out_index,
unique_key,
unique_value);
// thrust::distance doesn't support stream parameters
// const int out_non_zero_num = thrust::distance(unique_key_ptr,
// new_end.first);
DistanceKernel<<<1, 1>>>(unique_key_ptr,
new_end,
rulebook_ptr + rulebook_rows * rulebook_cols - 1);
int out_non_zero_num = 0;
#ifdef PADDLE_WITH_HIP
phi::backends::gpu::GpuMemcpyAsync(
&out_non_zero_num,
rulebook_ptr + rulebook_rows * rulebook_cols - 1,
sizeof(int),
hipMemcpyDeviceToHost,
dev_ctx.stream());
#else
phi::backends::gpu::GpuMemcpyAsync(
&out_non_zero_num,
rulebook_ptr + rulebook_rows * rulebook_cols - 1,
sizeof(int),
cudaMemcpyDeviceToHost,
dev_ctx.stream());
#endif
dev_ctx.Wait();
// 5. update out_indices and rulebook by unique_value_ptr
const int64_t sparse_dim = 4;
DenseTensorMeta indices_meta(
DataType::INT32, {sparse_dim, out_non_zero_num}, DataLayout::NCHW);
DenseTensorMeta values_meta(
x.dtype(), {out_non_zero_num, kernel_dims[4]}, x.layout());
phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta));
phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta));
int* out_indices_ptr = out_indices.data<int>();
config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_non_zero_num, 1);
UpdateIndexKernel<<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(unique_key_ptr,
unique_value_ptr,
out_index_ptr,
out_non_zero_num,
rulebook_len,
d_out_dims,
out_indices_ptr,
rulebook_ptr + 2 * rulebook_len);
out->SetMember(out_indices, out_values, out_dims, true);
return rulebook_len;
}
/** /**
* x: (N, D, H, W, C) * x: (N, D, H, W, C)
* kernel: (D, H, W, C, OC) * kernel: (D, H, W, C, OC)
...@@ -545,9 +46,12 @@ void Conv3dKernel(const Context& dev_ctx, ...@@ -545,9 +46,12 @@ void Conv3dKernel(const Context& dev_ctx,
const auto& kernel_dims = kernel.dims(); const auto& kernel_dims = kernel.dims();
int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2];
DDim out_dims = {1, 1, 1, 1, 1}; DDim out_dims = {1, 1, 1, 1, 1};
std::vector<int> kernel_sizes(kernel_dims.size());
for (int i = 0; i < kernel_dims.size(); i++) {
kernel_sizes[i] = kernel_dims[i];
}
phi::funcs::sparse::GetOutShape( phi::funcs::sparse::GetOutShape(
x_dims, kernel_dims, paddings, dilations, strides, &out_dims); x_dims, kernel_sizes, paddings, dilations, strides, &out_dims);
out->set_dims(out_dims);
const int in_channels = kernel_dims[3]; const int in_channels = kernel_dims[3];
const int out_channels = kernel_dims[4]; const int out_channels = kernel_dims[4];
std::vector<int> offsets(kernel_size + 1), h_counter(kernel_size); std::vector<int> offsets(kernel_size + 1), h_counter(kernel_size);
...@@ -574,7 +78,7 @@ void Conv3dKernel(const Context& dev_ctx, ...@@ -574,7 +78,7 @@ void Conv3dKernel(const Context& dev_ctx,
int n = ProductRuleBook<T, Context>(dev_ctx, int n = ProductRuleBook<T, Context>(dev_ctx,
x, x,
kernel, kernel_sizes,
subm_paddings, subm_paddings,
dilations, dilations,
subm_strides, subm_strides,
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void TrilTriuGradKernel(const Context& ctx,
const DenseTensor& out_grad,
int diagonal,
bool lower,
DenseTensor* x_grad);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void TrilTriuKernel(const Context& ctx,
const DenseTensor& x,
int diagonal,
bool lower,
DenseTensor* out);
} // namespace phi
...@@ -16,7 +16,7 @@ limitations under the License. */ ...@@ -16,7 +16,7 @@ limitations under the License. */
namespace phi { namespace phi {
#define DefineActGradDepXOpArgMap(func_name, op_name, attrs) \ #define DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(func_name, op_name, attrs) \
KernelSignature func_name##GradOpArgumentMapping( \ KernelSignature func_name##GradOpArgumentMapping( \
const ArgumentMappingContext& ctx) { \ const ArgumentMappingContext& ctx) { \
return KernelSignature(op_name "_grad", \ return KernelSignature(op_name "_grad", \
...@@ -25,7 +25,7 @@ namespace phi { ...@@ -25,7 +25,7 @@ namespace phi {
{GradVarName("X")}); \ {GradVarName("X")}); \
} }
#define DefineActGradDepOutOpArgMap(func_name, op_name, attrs) \ #define DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(func_name, op_name, attrs) \
KernelSignature func_name##GradOpArgumentMapping( \ KernelSignature func_name##GradOpArgumentMapping( \
const ArgumentMappingContext& ctx) { \ const ArgumentMappingContext& ctx) { \
return KernelSignature(op_name "_grad", \ return KernelSignature(op_name "_grad", \
...@@ -36,25 +36,29 @@ namespace phi { ...@@ -36,25 +36,29 @@ namespace phi {
#define comma , #define comma ,
DefineActGradDepXOpArgMap(Cos, "cos", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Cos, "cos", ); // NOLINT
DefineActGradDepXOpArgMap(Tan, "tan", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Tan, "tan", ); // NOLINT
DefineActGradDepXOpArgMap(Acos, "acos", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Acos, "acos", ); // NOLINT
DefineActGradDepXOpArgMap(Sin, "sin", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Sin, "sin", ); // NOLINT
DefineActGradDepXOpArgMap(Asin, "asin", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Asin, "asin", ); // NOLINT
DefineActGradDepXOpArgMap(Atan, "atan", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Atan, "atan", ); // NOLINT
DefineActGradDepXOpArgMap(Sinh, "sinh", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Sinh, "sinh", ); // NOLINT
DefineActGradDepXOpArgMap(Cosh, "cosh", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Cosh, "cosh", ); // NOLINT
DefineActGradDepXOpArgMap(Asinh, "asinh", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Asinh, "asinh", ); // NOLINT
DefineActGradDepXOpArgMap(Acosh, "acosh", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Acosh, "acosh", ); // NOLINT
DefineActGradDepXOpArgMap(Atanh, "atanh", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Atanh, "atanh", ); // NOLINT
DefineActGradDepXOpArgMap(BRelu, "brelu", "t_min" comma "t_max"); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(BRelu, "brelu", "t_min" comma "t_max");
DefineActGradDepXOpArgMap(LeakyRelu, "leaky_relu", "alpha"); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(LeakyRelu, "leaky_relu", "alpha");
DefineActGradDepXOpArgMap(ThresholdedRelu, DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(ThresholdedRelu,
"thresholded_relu", "thresholded_relu",
"threshold"); // NOLINT "threshold");
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(SoftShrink, "soft_shrink", "lambda");
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(HardShrink, "hard_shrink", "threshold");
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(TanhShrink, "tanh_shrink", ); // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Silu, "silu", ); // NOLINT
DefineActGradDepOutOpArgMap(Relu, "relu", ); // NOLINT DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu, "relu", ); // NOLINT
DefineActGradDepOutOpArgMap(Tanh, "tanh", ); // NOLINT DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Tanh, "tanh", ); // NOLINT
KernelSignature ReluDoubleGradOpArgumentMapping( KernelSignature ReluDoubleGradOpArgumentMapping(
const ArgumentMappingContext& ctx) { const ArgumentMappingContext& ctx) {
...@@ -85,11 +89,31 @@ KernelSignature LeakyReluOpArgumentMapping(const ArgumentMappingContext& ctx) { ...@@ -85,11 +89,31 @@ KernelSignature LeakyReluOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("leaky_relu", {"X"}, {"alpha"}, {"Out"}); return KernelSignature("leaky_relu", {"X"}, {"alpha"}, {"Out"});
} }
KernelSignature EluOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("elu", {"X"}, {"alpha"}, {"Out"});
}
KernelSignature EluGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("elu_grad",
{"X", "Out", GradVarName("Out")},
{"alpha"},
{GradVarName("X")});
}
KernelSignature EluDoubleGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature(
"elu_double_grad", {"X", "DOut", "DDX"}, {"alpha"}, {"DX", "DDOut"});
}
} // namespace phi } // namespace phi
PD_REGISTER_BASE_KERNEL_NAME(relu_grad_grad, relu_double_grad); PD_REGISTER_BASE_KERNEL_NAME(relu_grad_grad, relu_double_grad);
PD_REGISTER_BASE_KERNEL_NAME(tanh_grad_grad, tanh_double_grad); PD_REGISTER_BASE_KERNEL_NAME(tanh_grad_grad, tanh_double_grad);
PD_REGISTER_BASE_KERNEL_NAME(leaky_relu_grad_grad, leaky_relu_double_grad); PD_REGISTER_BASE_KERNEL_NAME(leaky_relu_grad_grad, leaky_relu_double_grad);
PD_REGISTER_BASE_KERNEL_NAME(softshrink, soft_shrink);
PD_REGISTER_BASE_KERNEL_NAME(softshrink_grad, soft_shrink_grad);
PD_REGISTER_BASE_KERNEL_NAME(elu_grad_grad, elu_double_grad);
PD_REGISTER_ARG_MAPPING_FN(cos_grad, phi::CosGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(cos_grad, phi::CosGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(tan_grad, phi::TanGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(tan_grad, phi::TanGradOpArgumentMapping);
...@@ -118,3 +142,13 @@ PD_REGISTER_ARG_MAPPING_FN(leaky_relu_grad_grad, ...@@ -118,3 +142,13 @@ PD_REGISTER_ARG_MAPPING_FN(leaky_relu_grad_grad,
phi::LeakyReluDoubleGradOpArgumentMapping); phi::LeakyReluDoubleGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(thresholded_relu_grad, PD_REGISTER_ARG_MAPPING_FN(thresholded_relu_grad,
phi::ThresholdedReluGradOpArgumentMapping); phi::ThresholdedReluGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(softshrink_grad,
phi::SoftShrinkGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(hard_shrink_grad,
phi::HardShrinkGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(tanh_shrink_grad,
phi::TanhShrinkGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(elu, phi::EluOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(elu_grad, phi::EluGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(elu_grad_grad, phi::EluDoubleGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(silu_grad, phi::SiluGradOpArgumentMapping);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -12,24 +12,32 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,24 +12,32 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/tril_triu_op.h" #include "paddle/phi/core/compat/op_utils.h"
namespace ops = paddle::operators; namespace phi {
namespace plat = paddle::platform;
KernelSignature GridSamplerOpArgumentMapping(
REGISTER_OP_CUDA_KERNEL( const ArgumentMappingContext& ctx) {
tril_triu, ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, bool>, return KernelSignature("grid_sample",
ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, float>, {"X", "Grid"},
ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, double>, {"mode", "padding_mode", "align_corners"},
ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, int>, {"Output"});
ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, int64_t>, }
ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL( KernelSignature GridSamplerGradOpArgumentMapping(
tril_triu_grad, const ArgumentMappingContext& ctx) {
ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext, bool>, return KernelSignature("grid_sample_grad",
ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext, float>, {"X", "Grid", GradVarName("Output")},
ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext, double>, {"mode", "padding_mode", "align_corners"},
ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext, int>, {GradVarName("X"), GradVarName("Grid")});
ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext, int64_t>, }
ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext,
plat::float16>); } // namespace phi
// use Python API name as kernel name
PD_REGISTER_BASE_KERNEL_NAME(grid_sampler, grid_sample);
PD_REGISTER_BASE_KERNEL_NAME(grid_sampler_grad, grid_sample_grad);
PD_REGISTER_ARG_MAPPING_FN(grid_sampler, phi::GridSamplerOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(grid_sampler_grad,
phi::GridSamplerGradOpArgumentMapping);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature IndexSelectGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("index_select_grad",
{"X", "Index", GradVarName("Out")},
{"dim"},
{GradVarName("X")});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(index_select_grad,
phi::IndexSelectGradOpArgumentMapping);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature MultiplexOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("multiplex", {"X", "Ids"}, {}, {"Out"});
}
KernelSignature MultiplexGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature(
"multiplex_grad", {"Ids", GradVarName("Out")}, {}, {GradVarName("X")});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(multiplex, phi::MultiplexOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(multiplex_grad, phi::MultiplexGradOpArgumentMapping);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature QrOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("qr", {"X"}, {"mode"}, {"Q", "R"});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(qr, phi::QrOpArgumentMapping);
...@@ -52,8 +52,19 @@ KernelSignature ReduceMeanOpArgumentMapping(const ArgumentMappingContext& ctx) { ...@@ -52,8 +52,19 @@ KernelSignature ReduceMeanOpArgumentMapping(const ArgumentMappingContext& ctx) {
} }
KernelSignature ReduceProdOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature ReduceProdOpArgumentMapping(const ArgumentMappingContext& ctx) {
if (ctx.IsDenseTensorInput("X")) {
bool reduce_all = paddle::any_cast<bool>(ctx.Attr("reduce_all"));
// When ctx is InferShapeArgumentMappingContext, the reduce_all is used in
// InferShape, so we must return the "max_raw" KernelSignature.
// And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with
// the "max_raw" KernelSignature
if (ctx.IsForInferShape() || reduce_all) {
return KernelSignature( return KernelSignature(
"reduce_prod", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); "prod_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"});
}
return KernelSignature("prod", {"X"}, {"dim", "keep_dim"}, {"Out"});
}
return KernelSignature("unregistered", {}, {}, {});
} }
KernelSignature ReduceMaxOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature ReduceMaxOpArgumentMapping(const ArgumentMappingContext& ctx) {
...@@ -107,10 +118,6 @@ KernelSignature ReduceAnyOpArgumentMapping(const ArgumentMappingContext& ctx) { ...@@ -107,10 +118,6 @@ KernelSignature ReduceAnyOpArgumentMapping(const ArgumentMappingContext& ctx) {
KernelSignature ReduceAllOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature ReduceAllOpArgumentMapping(const ArgumentMappingContext& ctx) {
if (ctx.IsDenseTensorInput("X")) { if (ctx.IsDenseTensorInput("X")) {
bool reduce_all = paddle::any_cast<bool>(ctx.Attr("reduce_all")); bool reduce_all = paddle::any_cast<bool>(ctx.Attr("reduce_all"));
// When ctx is InferShapeArgumentMappingContext, the reduce_all is used in
// InferShape, so we must return the "all_raw" KernelSignature.
// And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with
// the "all_raw" KernelSignature
if (ctx.IsForInferShape() || reduce_all) { if (ctx.IsForInferShape() || reduce_all) {
return KernelSignature( return KernelSignature(
"all_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); "all_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"});
...@@ -135,6 +142,7 @@ PD_REGISTER_BASE_KERNEL_NAME(reduce_sum, sum); ...@@ -135,6 +142,7 @@ PD_REGISTER_BASE_KERNEL_NAME(reduce_sum, sum);
PD_REGISTER_BASE_KERNEL_NAME(reduce_mean, mean); PD_REGISTER_BASE_KERNEL_NAME(reduce_mean, mean);
PD_REGISTER_BASE_KERNEL_NAME(reduce_max, max); PD_REGISTER_BASE_KERNEL_NAME(reduce_max, max);
PD_REGISTER_BASE_KERNEL_NAME(reduce_min, min); PD_REGISTER_BASE_KERNEL_NAME(reduce_min, min);
PD_REGISTER_BASE_KERNEL_NAME(reduce_prod, prod);
PD_REGISTER_BASE_KERNEL_NAME(reduce_all, all); PD_REGISTER_BASE_KERNEL_NAME(reduce_all, all);
PD_REGISTER_BASE_KERNEL_NAME(reduce_any, any); PD_REGISTER_BASE_KERNEL_NAME(reduce_any, any);
......
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
namespace phi { namespace phi {
KernelSignature ROIAlignOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature RoiAlignOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("roi_align", return KernelSignature("roi_align",
{"X", "ROIs", "RoisNum"}, {"X", "ROIs", "RoisNum"},
{"pooled_height", {"pooled_height",
...@@ -27,6 +27,19 @@ KernelSignature ROIAlignOpArgumentMapping(const ArgumentMappingContext& ctx) { ...@@ -27,6 +27,19 @@ KernelSignature ROIAlignOpArgumentMapping(const ArgumentMappingContext& ctx) {
{"Out"}); {"Out"});
} }
KernelSignature RoiAlignGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("roi_align_grad",
{"X", "ROIs", "RoisNum", GradVarName("Out")},
{"pooled_height",
"pooled_width",
"spatial_scale",
"sampling_ratio",
"aligned"},
{GradVarName("X")});
}
} // namespace phi } // namespace phi
PD_REGISTER_ARG_MAPPING_FN(roi_align, phi::ROIAlignOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(roi_align, phi::RoiAlignOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(roi_align_grad, phi::RoiAlignGradOpArgumentMapping);
...@@ -12,26 +12,25 @@ ...@@ -12,26 +12,25 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/phi/kernels/reduce_any_kernel.h" #include "paddle/phi/core/compat/op_utils.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi { namespace phi {
template <typename T, typename Context> KernelSignature RollOpArgumentMapping(const ArgumentMappingContext& ctx) {
void AnyKernel(const Context& dev_ctx, if (ctx.HasInput("ShiftsTensor")) {
const DenseTensor& x, return KernelSignature("roll", {"X"}, {"ShiftsTensor", "axis"}, {"Out"});
const std::vector<int64_t>& dims, }
bool keep_dim, return KernelSignature("roll", {"X"}, {"shifts", "axis"}, {"Out"});
DenseTensor* out) { }
bool reduce_all = false;
AnyRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out); KernelSignature RollGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("roll_grad",
{"X", GradVarName("Out")},
{"shifts", "axis"},
{GradVarName("X")});
} }
} // namespace phi } // namespace phi
PD_REGISTER_KERNEL(any, CPU, ALL_LAYOUT, phi::AnyKernel, bool) {} PD_REGISTER_ARG_MAPPING_FN(roll, phi::RollOpArgumentMapping);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_ARG_MAPPING_FN(roll_grad, phi::RollGradOpArgumentMapping);
PD_REGISTER_KERNEL(any, GPU, ALL_LAYOUT, phi::AnyKernel, bool) {}
#endif
...@@ -20,6 +20,11 @@ KernelSignature TileOpArgumentMapping(const ArgumentMappingContext& ctx) { ...@@ -20,6 +20,11 @@ KernelSignature TileOpArgumentMapping(const ArgumentMappingContext& ctx) {
if (ctx.HasInput("RepeatTimes")) { if (ctx.HasInput("RepeatTimes")) {
return KernelSignature("tile", {"X"}, {"RepeatTimes"}, {"Out"}); return KernelSignature("tile", {"X"}, {"RepeatTimes"}, {"Out"});
} else if (ctx.InputSize("repeat_times_tensor") > 0) { } else if (ctx.InputSize("repeat_times_tensor") > 0) {
const auto& repeat_times =
paddle::any_cast<std::vector<int>>(ctx.Attr("repeat_times"));
if (!ctx.IsRuntime() && !repeat_times.empty()) {
return KernelSignature("tile", {"X"}, {"repeat_times"}, {"Out"});
}
return KernelSignature("tile", {"X"}, {"repeat_times_tensor"}, {"Out"}); return KernelSignature("tile", {"X"}, {"repeat_times_tensor"}, {"Out"});
} else { } else {
return KernelSignature("tile", {"X"}, {"repeat_times"}, {"Out"}); return KernelSignature("tile", {"X"}, {"repeat_times"}, {"Out"});
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature TrilTriuOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("tril_triu", {"X"}, {"diagonal", "lower"}, {"Out"});
}
KernelSignature TrilTriuGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("tril_triu_grad",
{GradVarName("Out")},
{"diagonal", "lower"},
{GradVarName("X")});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(tril_triu, phi::TrilTriuOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(tril_triu_grad, phi::TrilTriuGradOpArgumentMapping);
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <memory> #include <memory>
#include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/reduce_kernel.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/api/lib/utils/allocator.h"
......
...@@ -132,11 +132,12 @@ void TestConv3dBase(const std::vector<int>& indices, ...@@ -132,11 +132,12 @@ void TestConv3dBase(const std::vector<int>& indices,
f_verify(out.non_zero_elements().data<T>(), correct_out_features); f_verify(out.non_zero_elements().data<T>(), correct_out_features);
if (backward) { if (backward) {
std::vector<DenseTensor> grads = sparse::Conv3dGrad<T>(dev_ctx_cpu, std::vector<DenseTensor> grads =
sparse::Conv3dGrad<T>(dev_ctx_cpu,
x_tensor, x_tensor,
rulebook, rulebook,
kernel_tensor, kernel_tensor,
out, out.non_zero_elements(),
paddings, paddings,
dilations, dilations,
strides, strides,
...@@ -231,11 +232,12 @@ void TestConv3dBase(const std::vector<int>& indices, ...@@ -231,11 +232,12 @@ void TestConv3dBase(const std::vector<int>& indices,
f_verify(h_features_tensor.data<T>(), correct_out_features); f_verify(h_features_tensor.data<T>(), correct_out_features);
if (backward) { if (backward) {
std::vector<DenseTensor> grads = sparse::Conv3dGrad<T>(dev_ctx_gpu, std::vector<DenseTensor> grads =
sparse::Conv3dGrad<T>(dev_ctx_gpu,
d_x_tensor, d_x_tensor,
d_rulebook, d_rulebook,
d_kernel_tensor, d_kernel_tensor,
d_out, d_out.non_zero_elements(),
paddings, paddings,
dilations, dilations,
strides, strides,
......
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <memory> #include <memory>
#include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/reduce_kernel.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/api/lib/utils/allocator.h"
......
...@@ -21,11 +21,12 @@ from paddle.fluid import framework ...@@ -21,11 +21,12 @@ from paddle.fluid import framework
from .utils import print_program_with_dist_attr from .utils import print_program_with_dist_attr
from .operators import find_best_compatible_distributed_operator_impl from .operators import find_best_compatible_distributed_operator_impl
from .dist_context import get_default_distributed_context from .dist_context import get_default_distributed_context, _node_id
from .dist_tensor import DistributedTensor from .dist_tensor import DistributedTensor
from .dist_op import DistributedOperator from .dist_op import DistributedOperator
from .dist_attribute import TensorDistributedAttribute from .dist_attribute import TensorDistributedAttribute
from .dist_attribute import OperatorDistributedAttribute from .dist_attribute import OperatorDistributedAttribute
from .process_mesh import ProcessMesh
from paddle.distributed.fleet.meta_optimizers.common import OpRole from paddle.distributed.fleet.meta_optimizers.common import OpRole
...@@ -108,6 +109,20 @@ def compute_compatible_dims_mapping(dims_mapping_list): ...@@ -108,6 +109,20 @@ def compute_compatible_dims_mapping(dims_mapping_list):
return compatible_result return compatible_result
def merge_process_mesh_two(pm1, pm2):
process_set1 = set()
process_set2 = set()
if pm1 is None and pm2 is None:
return None
if pm1 is not None:
process_set1 = set(pm1.processes)
if pm2 is not None:
process_set2 = set(pm2.processes)
merged_process_set = process_set1.union(process_set2)
merged_process_mesh = ProcessMesh(list(merged_process_set))
return merged_process_mesh
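The helper above treats a process mesh purely as its flat set of process ids and returns a mesh over their union. A minimal standalone sketch of that behaviour, using a stand-in ProcessMesh that only records a process list (the real ProcessMesh also carries a topology), is:

# Stand-in ProcessMesh for this sketch only: it records a flat process list,
# whereas the real ProcessMesh also carries a mesh topology.
class _SketchProcessMesh:
    def __init__(self, processes):
        self.processes = list(processes)

def merge_process_mesh_two_sketch(pm1, pm2):
    # Union of the two process sets; None means "no mesh yet".
    if pm1 is None and pm2 is None:
        return None
    s1 = set(pm1.processes) if pm1 is not None else set()
    s2 = set(pm2.processes) if pm2 is not None else set()
    return _SketchProcessMesh(sorted(s1 | s2))

merged = merge_process_mesh_two_sketch(_SketchProcessMesh([0, 1]),
                                        _SketchProcessMesh([1, 2, 3]))
assert merged.processes == [0, 1, 2, 3]

The sketch sorts the merged list for determinism; the patched helper keeps whatever ordering list(set(...)) yields.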
class Completer: class Completer:
def __init__(self, dist_context): def __init__(self, dist_context):
assert dist_context is not None assert dist_context is not None
...@@ -119,7 +134,9 @@ class Completer: ...@@ -119,7 +134,9 @@ class Completer:
return False return False
tensor_desc = tensor_node.var() tensor_desc = tensor_node.var()
# Skip reader tensor # Skip reader tensor
if tensor_desc.type() == core.VarDesc.VarType.READER: if tensor_desc.type() == core.VarDesc.VarType.READER \
or tensor_desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY \
or tensor_desc.type() == core.VarDesc.VarType.STEP_SCOPES:
return False return False
tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph( tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph(
tensor_node) tensor_node)
...@@ -185,7 +202,7 @@ class Completer: ...@@ -185,7 +202,7 @@ class Completer:
op_dist_attr = dist_op.dist_attr op_dist_attr = dist_op.dist_attr
if fwd: if fwd:
for tensor_node in op_node.inputs: for tensor_node in op_node.inputs:
if tensor_node.var() is not None: if tensor_node.is_var() and tensor_node.var() is not None:
if tensor_node.var().type() == core.VarDesc.VarType.READER: if tensor_node.var().type() == core.VarDesc.VarType.READER:
continue continue
tensor_desc = tensor_node.var() tensor_desc = tensor_node.var()
...@@ -208,7 +225,7 @@ class Completer: ...@@ -208,7 +225,7 @@ class Completer:
# Find the most compatible implementations from the distributed operator # Find the most compatible implementations from the distributed operator
op_dist_impl = find_best_compatible_distributed_operator_impl( op_dist_impl = find_best_compatible_distributed_operator_impl(
dist_op, fwd=True) dist_op, fwd=True)
assert op_dist_impl is not None, "Cannot find the dist op implementation." if op_dist_impl is not None:
dim_changed = op_dist_impl.update_dims_mapping(dist_op) dim_changed = op_dist_impl.update_dims_mapping(dist_op)
if dim_changed: if dim_changed:
changed = True changed = True
...@@ -220,7 +237,7 @@ class Completer: ...@@ -220,7 +237,7 @@ class Completer:
op_dist_attr.impl_idx = op_dist_impl.idx op_dist_attr.impl_idx = op_dist_impl.idx
else: else:
for tensor_node in op_node.outputs: for tensor_node in op_node.outputs:
if tensor_node.var() is not None: if tensor_node.is_var() and tensor_node.var() is not None:
if tensor_node.var().type() == core.VarDesc.VarType.READER: if tensor_node.var().type() == core.VarDesc.VarType.READER:
continue continue
tensor_desc = tensor_node.var() tensor_desc = tensor_node.var()
...@@ -243,7 +260,7 @@ class Completer: ...@@ -243,7 +260,7 @@ class Completer:
# Find the most compatible implementations from the distributed operator # Find the most compatible implementations from the distributed operator
op_dist_impl = find_best_compatible_distributed_operator_impl( op_dist_impl = find_best_compatible_distributed_operator_impl(
dist_op, fwd=False) dist_op, fwd=False)
assert op_dist_impl is not None, "Cannot find the dist op implementation." if op_dist_impl is not None:
dim_changed = op_dist_impl.update_dims_mapping(dist_op) dim_changed = op_dist_impl.update_dims_mapping(dist_op)
if dim_changed: if dim_changed:
changed = True changed = True
...@@ -255,49 +272,26 @@ class Completer: ...@@ -255,49 +272,26 @@ class Completer:
op_dist_attr.impl_idx = op_dist_impl.idx op_dist_attr.impl_idx = op_dist_impl.idx
return changed return changed
def _update_process_mesh(self): def _update_dims_mapping_between_graphs(self):
def _find_nearset_node(nodes, idx):
for node in reversed(nodes[:idx]):
node_dist_attr = self._dist_context.get_dist_attr_for_graph(
node)
if node_dist_attr.process_mesh is not None:
return node
total_reach_fix_point = False
while not total_reach_fix_point:
total_changed = False
for is_fwd in [True, False]:
all_nodes = self._dist_context.serial_ordered_nodes \
if is_fwd else reversed(self._dist_context.serial_ordered_nodes)
reach_fix_point = False
while not reach_fix_point:
changed = False changed = False
for idx, node in enumerate(all_nodes): for parent_node, child_node in self._node_pairs_between_graphs:
nearest_node = _find_nearset_node( parent_node_dist_attr = self._dist_context.get_dist_attr_for_graph(
self._dist_context.serial_ordered_nodes, idx) parent_node)
if nearest_node is None: child_node_dist_attr = self._dist_context.get_dist_attr_for_graph(
continue child_node)
nearest_node_dis_attr = self._dist_context.get_dist_attr_for_graph( parent_node_dims_mapping = parent_node_dist_attr.dims_mapping
nearest_node) child_node_dims_mapping = child_node_dist_attr.dims_mapping
nearest_process_mesh = nearest_node_dis_attr.process_mesh compatible_dims_mapping = compute_compatible_dims_mapping(
cur_node_dist_attr = self._dist_context.get_dist_attr_for_graph( [parent_node_dims_mapping, child_node_dims_mapping])
node) if (compatible_dims_mapping is not None) \
cur_process_mesh = cur_node_dist_attr.process_mesh and (compatible_dims_mapping != parent_node_dims_mapping):
compatible_process_mesh = compute_compatible_process_mesh( parent_node_dist_attr.dims_mapping = compatible_dims_mapping
[cur_process_mesh, nearest_process_mesh])
if compatible_process_mesh is not None \
and cur_process_mesh != compatible_process_mesh:
cur_node_dist_attr.process_mesh = compatible_process_mesh
changed = True changed = True
if changed: if (compatible_dims_mapping is not None) \
reach_fix_point = False and (compatible_dims_mapping != child_node_dims_mapping):
total_changed = True parent_node_dist_attr.dims_mapping = compatible_dims_mapping
else: changed = True
reach_fix_point = True return changed
if total_changed:
total_reach_fix_point = False
else:
total_reach_fix_point = True
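The new _update_dims_mapping_between_graphs relies on compute_compatible_dims_mapping, whose body is outside this excerpt. A hedged sketch of the rule it is assumed to implement (per dimension, -1 means replicated; two mappings agree when they are equal or one of them is -1, otherwise there is no compatible result) looks like this:

# Assumed semantics only; the real compute_compatible_dims_mapping is defined
# earlier in completion.py and is not shown in this diff.
def compute_compatible_dims_mapping_sketch(dims_mapping_list):
    mappings = [m for m in dims_mapping_list if m is not None]
    if not mappings or any(len(m) != len(mappings[0]) for m in mappings):
        return None
    compatible = []
    for dims in zip(*mappings):
        sharded = {d for d in dims if d != -1}
        if len(sharded) > 1:
            return None  # the same tensor dim is sharded along different mesh dims
        compatible.append(sharded.pop() if sharded else -1)
    return compatible

assert compute_compatible_dims_mapping_sketch([[0, -1], [-1, -1]]) == [0, -1]
assert compute_compatible_dims_mapping_sketch([[0, -1], [1, -1]]) is None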
def _update_dims_mapping(self): def _update_dims_mapping(self):
# Complete dims_mapping for each node # Complete dims_mapping for each node
...@@ -318,11 +312,314 @@ class Completer: ...@@ -318,11 +312,314 @@ class Completer:
node, fwd=is_fwd) node, fwd=is_fwd)
if op_changed: if op_changed:
changed = True changed = True
graph_changed = self._update_dims_mapping_between_graphs()
if graph_changed:
changed = True
if changed: if changed:
reach_fix_point = False reach_fix_point = False
else: else:
reach_fix_point = True reach_fix_point = True
def _update_process_mesh_by_nearest(self, op_node, nearest_op_node):
op_dist_attr = self._dist_context.get_dist_attr_for_graph(op_node)
# Set the process mesh of the op node by its nearest op node
if not op_dist_attr.is_annotated("process_mesh"):
process_mesh = op_dist_attr.process_mesh
nearest_op_dis_attr = self._dist_context.get_dist_attr_for_graph(
nearest_op_node)
nearest_process_mesh = nearest_op_dis_attr.process_mesh
compatible_process_mesh = compute_compatible_process_mesh(
[process_mesh, nearest_process_mesh])
if compatible_process_mesh is not None \
and process_mesh != compatible_process_mesh:
op_dist_attr.process_mesh = compatible_process_mesh
# Skip the process_mesh setting of inputs and outputs of while_op
if op_dist_attr.op_type == "while":
return
# Set the process mesh of the op node's leaf-inputs
for tensor_node in op_node.inputs:
if tensor_node.is_var() and tensor_node.var() is not None:
tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph(
tensor_node)
if tensor_dist_attr.is_annotated("process_mesh"):
continue
# Skip the non-leaf var node
if len(tensor_node.inputs) != 0:
continue
compatible_process_mesh = compute_compatible_process_mesh(
[tensor_dist_attr.process_mesh, op_dist_attr.process_mesh])
if compatible_process_mesh is not None \
and tensor_dist_attr.process_mesh != compatible_process_mesh:
tensor_dist_attr.process_mesh = compatible_process_mesh
# Set the process mesh of the op node's outputs
for tensor_node in op_node.outputs:
if tensor_node.is_var() and tensor_node.var() is not None:
tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph(
tensor_node)
if tensor_dist_attr.is_annotated("process_mesh"):
continue
compatible_process_mesh = compute_compatible_process_mesh(
[tensor_dist_attr.process_mesh, op_dist_attr.process_mesh])
if compatible_process_mesh is not None \
and tensor_dist_attr.process_mesh != compatible_process_mesh:
tensor_dist_attr.process_mesh = compatible_process_mesh
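_update_process_mesh_by_nearest leans on compute_compatible_process_mesh, which is also outside this excerpt. One plausible rule, assumed here only for illustration, is that None acts as "unset", equal meshes are compatible, and anything else is a conflict:

# Assumed rule for illustration; the real compute_compatible_process_mesh may
# resolve conflicts differently (e.g. by preferring one of the meshes).
def compute_compatible_process_mesh_sketch(process_mesh_list):
    compatible = None
    for mesh in process_mesh_list:
        if mesh is None:
            continue
        if compatible is None:
            compatible = mesh
        elif compatible != mesh:
            return None
    return compatible

# Plain lists stand in for process meshes in this sketch.
assert compute_compatible_process_mesh_sketch([None, [0, 1]]) == [0, 1]
assert compute_compatible_process_mesh_sketch([[0, 1], [2, 3]]) is None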
def _update_process_mesh_for_specials(self):
def _find_nearest_tensor_node_before(nodes, idx, var_name):
for node in reversed(nodes[:idx]):
if node.is_var() and node.var() is not None \
and node.var().name() == var_name:
return node
def _find_nearest_tensor_node_after(nodes, idx, var_name):
for node in nodes[idx + 1:]:
if node.is_var() and node.var() is not None \
and node.var().name() == var_name:
return node
def _find_nodes_related_to_cond(source_node):
related_nodes = []
visited = set()
frontier = list()
frontier.append(source_node)
# BFS
while len(frontier) != 0:
cur = frontier[0]
frontier = frontier[1:]
if _node_id(cur) in visited:
continue
# TODO: need more restrictions
for node in cur.inputs:
if node.is_var() and node.var() is not None:
if node.var().type() != core.VarDesc.VarType.READER \
and len(node.var().shape()) == 1:
frontier.append(node)
related_nodes.append(node)
if node.is_op() and node.op() is not None:
flag = True
if node.op().type() == "create_py_reader" \
or node.op().type() == "create_double_buffer_reader" \
or node.op().type() == "read":
flag = False
for tensor_node in node.inputs:
if tensor_node.is_var() and tensor_node.var(
) is not None:
if tensor_node.var().type() == core.VarDesc.VarType.READER \
or len(tensor_node.var().shape()) != 1:
flag = False
break
for tensor_node in node.outputs:
if tensor_node.is_var() and tensor_node.var(
) is not None:
if tensor_node.var().type() == core.VarDesc.VarType.READER \
or len(tensor_node.var().shape()) != 1:
flag = False
break
if flag:
frontier.append(node)
related_nodes.append(node)
visited.add(_node_id(cur))
return related_nodes
# Amend the process meshes related to while_op
for while_op_node, while_op_node_idx in self._while_op_nodes.values():
sub_graph_id = while_op_node.op()._block_attr_id("sub_block")
sub_graph = self._dist_context._serial_graph.get_sub_graph(
sub_graph_id)
sub_graph_nodes = list(sub_graph.all_nodes())
while_dist_op = self._dist_context.get_dist_op_for_graph(
while_op_node)
while_op_dist_attr = while_dist_op.dist_attr
# Step 1: set the process mesh of while_op to the merged process mesh of its subblock
merged_process_mesh = while_op_dist_attr.process_mesh
for node in sub_graph_nodes:
if (node.is_var() and node.var() is not None) \
or (node.is_op() and node.op() is not None):
dist_attr = self._dist_context.get_dist_attr_for_graph(node)
merged_process_mesh = merge_process_mesh_two(
merged_process_mesh, dist_attr.process_mesh)
while_op_dist_attr.process_mesh = merged_process_mesh
# Step 2: set the related nodes of while_op to the process mesh of while_op
# Step 2.1: Find related nodes of cond var in the graph of while_op
cond_tensor_related_nodes = []
cond_tensor_name = while_op_node.op().input("Condition")[0]
cond_tensor_node = None
for node in while_op_node.inputs:
if node.is_var() and node.var() is not None \
and node.var().name() == cond_tensor_name:
cond_tensor_node = node
cond_tensor_related_nodes.append(cond_tensor_node)
break
cond_tensor_related_nodes.extend(
_find_nodes_related_to_cond(cond_tensor_node))
# Step 2.2: Find related nodes of cond var in the subgraph of while_op
cond_tensor_node = None
for node in reversed(sub_graph_nodes):
if node.is_var() and node.var() is not None \
and node.var().name() == cond_tensor_name \
and len(node.outputs) == 0:
cond_tensor_node = node
break
cond_tensor_related_nodes.extend(
_find_nodes_related_to_cond(cond_tensor_node))
# Step 2.3: Add the StepScopes output of while_op
stepscopes_tensor_name = while_op_node.op().output("StepScopes")[0]
stepscopes_tensor_node = None
for output_node in while_op_node.outputs:
if output_node.is_var() and output_node.var() is not None \
and output_node.var().name() == stepscopes_tensor_name:
stepscopes_tensor_node = output_node
cond_tensor_related_nodes.append(stepscopes_tensor_node)
# Step 2.4: Set the process meshes of all nodes related to cond var to the process mesh of while op
for node in cond_tensor_related_nodes:
tensor_dist_attr = self._dist_context.get_dist_attr_for_graph(
node)
tensor_dist_attr.process_mesh = merged_process_mesh
# Step 3: set the process meshes of the inputs in while_op to the process meshes of the outside input nodes
while_op_inputs_dist_attrs = while_op_dist_attr.inputs_dist_attrs
for tensor_name, tensor_dist_attr in while_op_inputs_dist_attrs.items(
):
nearest_tensor_node = _find_nearest_tensor_node_before(
self._dist_context.serial_ordered_nodes, while_op_node_idx,
tensor_name)
nearest_tensor_dist_attr = self._dist_context.get_dist_attr_for_graph(
nearest_tensor_node)
tensor_dist_attr.process_mesh = nearest_tensor_dist_attr.process_mesh
# Step 4: set the process meshes of the outputs in while_op to the process meshes of the outside output nodes
while_op_outputs_dist_attrs = while_op_dist_attr.outputs_dist_attrs
for tensor_name, tensor_dist_attr in while_op_outputs_dist_attrs.items(
):
nearest_tensor_node = _find_nearest_tensor_node_before(
self._dist_context.serial_ordered_nodes, while_op_node_idx,
tensor_name)
if nearest_tensor_node is None:
nearest_tensor_node = _find_nearest_tensor_node_after(
self._dist_context.serial_ordered_nodes,
while_op_node_idx, tensor_name)
nearest_tensor_dist_attr = self._dist_context.get_dist_attr_for_graph(
nearest_tensor_node)
tensor_dist_attr.process_mesh = nearest_tensor_dist_attr.process_mesh
# Amend the process meshes related to array
for array_node_list in self._array_nodes.values():
merged_process_mesh = None
for array_node in array_node_list:
dist_attr = self._dist_context.get_dist_attr_for_graph(
array_node)
merged_process_mesh = merge_process_mesh_two(
merged_process_mesh, dist_attr.process_mesh)
for array_node in array_node_list:
dist_attr = self._dist_context.get_dist_attr_for_graph(
array_node)
dist_attr.process_mesh = merged_process_mesh
def _update_process_mesh(self):
ordered_op_nodes = self._dist_context._serial_ordered_op_nodes
# Step 1: Set the annotated process meshes from tensors to the first ops using them
ordered_tensor_nodes = self._dist_context._serial_ordered_tensor_nodes
for tensor_node in ordered_tensor_nodes:
tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph(
tensor_node)
if not tensor_dist_attr.is_annotated("process_mesh"):
continue
first_op_node = None
for op_node in ordered_op_nodes:
# TODO: Need a better rule for the control flow ops.
# For now, do not set the process mesh of while_op from its inputs
if op_node.op().type() == "while":
continue
for input_tensor_node in op_node.inputs:
if _node_id(tensor_node) == _node_id(input_tensor_node):
first_op_node = op_node
break
if first_op_node is not None:
break
if first_op_node is None:
continue
op_dist_attr = self._dist_context.get_dist_attr_for_graph(
first_op_node)
if op_dist_attr is not None and not op_dist_attr.is_annotated(
"process_mesh"):
compatible_process_mesh = compute_compatible_process_mesh(
[tensor_dist_attr.process_mesh, op_dist_attr.process_mesh])
if compatible_process_mesh is not None \
and op_dist_attr.process_mesh != compatible_process_mesh:
op_dist_attr.process_mesh = compatible_process_mesh
# Step 2: set the process meshes of ops with the nearest op before them
# Step 2.1: find the first op node which has the process mesh
idx_of_first_op_node_has_process_mesh = -1
for idx, op_node in enumerate(ordered_op_nodes):
op_dist_attr = self._dist_context.get_dist_attr_for_graph(op_node)
if op_dist_attr.process_mesh is not None \
and idx_of_first_op_node_has_process_mesh == -1:
idx_of_first_op_node_has_process_mesh = idx
# Reuse the following method to set the related tensors for the same op node
self._update_process_mesh_by_nearest(op_node, op_node)
# Step 2.2: set the process meshes of ops by the nearest op node after the first op node
if idx_of_first_op_node_has_process_mesh + 1 > len(ordered_op_nodes):
return None
for idx, op_node in enumerate(ordered_op_nodes[
idx_of_first_op_node_has_process_mesh + 1:]):
original_idx = idx_of_first_op_node_has_process_mesh + idx + 1
nearest_op_node = ordered_op_nodes[original_idx - 1]
nearest_op_dist_attr = self._dist_context.get_dist_attr_for_graph(
nearest_op_node)
op_dist_attr = self._dist_context.get_dist_attr_for_graph(op_node)
assert nearest_op_dist_attr.process_mesh is not None
self._update_process_mesh_by_nearest(op_node, nearest_op_node)
# Step 2.3: set the process meshes of ops by the nearest op node before the first op node
nearest_op_node = ordered_op_nodes[
idx_of_first_op_node_has_process_mesh]
for op_node in ordered_op_nodes[:idx_of_first_op_node_has_process_mesh]:
self._update_process_mesh_by_nearest(op_node, nearest_op_node)
# Step 3: adjust the process meshes for special ops
self._update_process_mesh_for_specials()
def _prepare(self):
self._while_op_nodes = {}
self._array_nodes = {}
self._node_pairs_between_graphs = []
all_nodes = self._dist_context.serial_ordered_nodes
for idx, node in enumerate(all_nodes):
if node.is_op():
if node.op().type() == "while":
self._while_op_nodes[_node_id(node)] = (node, idx)
if node.op().type() == "read_from_array":
array_var_name = node.op().input("X")[0]
if self._array_nodes.get(array_var_name, None) is None:
self._array_nodes[array_var_name] = []
self._array_nodes[array_var_name].append(node)
if node.op().type() == "write_to_array":
array_var_name = node.op().output("Out")[0]
if self._array_nodes.get(array_var_name, None) is None:
self._array_nodes[array_var_name] = []
self._array_nodes[array_var_name].append(node)
self._array_nodes[array_var_name].append(node.outputs[0])
if node.is_var() and node.var() is not None:
if node.node.graph_id() != 0:
for before_node in reversed(all_nodes[:idx]):
if before_node.is_var() and before_node.var() is not None \
and before_node.node.graph_id() == node.node.graph_id() - 1 \
and before_node.var().name() == node.var().name():
self._node_pairs_between_graphs.append(
(before_node, node))
for after_node in all_nodes[idx + 1:]:
if after_node.is_var() and after_node.var() is not None \
and after_node.node.graph_id() == node.node.graph_id() - 1 \
and after_node.var().name() == node.var().name():
self._node_pairs_between_graphs.append(
(after_node, node))
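The cross-graph bookkeeping at the end of _prepare pairs a var node in a sub-graph with the same-named var node in the enclosing graph (the one whose graph id is exactly one smaller). A simplified restatement with stand-in nodes, kept only to make the pairing rule concrete:

# Stand-in var nodes; the real objects are IR graph nodes with .var() descs.
class _ToyVarNode:
    def __init__(self, name, graph_id):
        self.name, self.graph_id = name, graph_id

def pair_between_graphs(ordered_nodes):
    pairs = []
    for idx, node in enumerate(ordered_nodes):
        if node.graph_id == 0:
            continue  # vars of the main graph have no enclosing graph
        for other in ordered_nodes[:idx] + ordered_nodes[idx + 1:]:
            if other.graph_id == node.graph_id - 1 and other.name == node.name:
                pairs.append((other, node))
    return pairs

nodes = [_ToyVarNode("x", 0), _ToyVarNode("x", 1), _ToyVarNode("y", 1)]
assert pair_between_graphs(nodes) == [(nodes[0], nodes[1])]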
def complete_forward_annotation(self, serial_main_program): def complete_forward_annotation(self, serial_main_program):
""" Complete annotation for the partial annotated serial_main_program. """ Complete annotation for the partial annotated serial_main_program.
Arguments: Arguments:
...@@ -336,24 +633,24 @@ class Completer: ...@@ -336,24 +633,24 @@ class Completer:
# Initialize distributed attributes for all var and op node in serial_main_program # Initialize distributed attributes for all var and op node in serial_main_program
self._dist_context.init_dist_attr_for_program() self._dist_context.init_dist_attr_for_program()
# print_program_with_dist_attr(serial_main_program, self._dist_context)
# Initialize distributed attributes for all var and op node in graph # Initialize distributed attributes for all var and op node in graph
self._dist_context.init_dist_attr_for_graph() self._dist_context.init_dist_attr_for_graph()
self._prepare()
self._update_process_mesh() self._update_process_mesh()
# Complete dims_mapping for each node
self._update_dims_mapping() self._update_dims_mapping()
# Copy the corresponding distributed attribute from graph to serial_main_program # Copy the corresponding distributed attribute from graph to serial_main_program
self._dist_context.copy_dist_attr_from_graph_to_program() self._dist_context.copy_dist_attr_from_graph_to_program()
self._dist_context.clear_dist_info_for_graph() self._dist_context.clear_dist_info_for_graph()
# print_serial_main_program_with_dist_attr(serial_main_program, self._dist_context)
# Do the validation check and amend some completion # Do the validation check and amend some completion
self._dist_context.amend_dist_attr_for_program() self._dist_context.amend_dist_attr_for_program()
# print_serial_main_program_with_dist_attr(serial_main_program, self._dist_context)
self._dist_context.validate_dist_attr_for_program() self._dist_context.validate_dist_attr_for_program()
return serial_main_program return serial_main_program
......
...@@ -175,6 +175,7 @@ class TensorDistributedAttribute: ...@@ -175,6 +175,7 @@ class TensorDistributedAttribute:
class OperatorDistributedAttribute: class OperatorDistributedAttribute:
def __init__(self): def __init__(self):
self._process_mesh = None self._process_mesh = None
self._op_type = None
self._impl_type = None self._impl_type = None
self._impl_idx = None self._impl_idx = None
self._inputs_dist_attrs = {} self._inputs_dist_attrs = {}
...@@ -194,11 +195,23 @@ class OperatorDistributedAttribute: ...@@ -194,11 +195,23 @@ class OperatorDistributedAttribute:
if isinstance(process_mesh, list): if isinstance(process_mesh, list):
process_mesh = ProcessMesh(process_mesh) process_mesh = ProcessMesh(process_mesh)
self._process_mesh = copy.deepcopy(process_mesh) self._process_mesh = copy.deepcopy(process_mesh)
# In the while op, the process mesh is not shared by all inputs and outputs
if self._op_type == "while":
return None
for dist_attr in self._inputs_dist_attrs.values(): for dist_attr in self._inputs_dist_attrs.values():
dist_attr.process_mesh = process_mesh dist_attr.process_mesh = process_mesh
for dist_attr in self._outputs_dist_attrs.values(): for dist_attr in self._outputs_dist_attrs.values():
dist_attr.process_mesh = process_mesh dist_attr.process_mesh = process_mesh
@property
def op_type(self):
return self._op_type
@op_type.setter
def op_type(self, op_type):
if op_type is not None:
self._op_type = op_type
@property @property
def impl_type(self): def impl_type(self):
return self._impl_type return self._impl_type
...@@ -326,6 +339,8 @@ class OperatorDistributedAttribute: ...@@ -326,6 +339,8 @@ class OperatorDistributedAttribute:
assert False, "No setter for {} in args {}.".format( assert False, "No setter for {} in args {}.".format(
key, dist_attr) key, dist_attr)
# Make sure the process_meshes in the dist op are the same # Make sure the process_meshes in the dist op are the same
if self.op_type == "while":
return None
process_meshes = [] process_meshes = []
process_meshes.append(self.process_mesh) process_meshes.append(self.process_mesh)
for tensor_dist_attr in self.inputs_dist_attrs.values(): for tensor_dist_attr in self.inputs_dist_attrs.values():
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
import copy import copy
from collections import defaultdict from collections import defaultdict
from paddle.fluid import framework from paddle.fluid import framework
from paddle.fluid.framework import get_flags, set_flags
from paddle.fluid import core from paddle.fluid import core
from .dist_attribute import TensorDistributedAttribute from .dist_attribute import TensorDistributedAttribute
from .dist_attribute import OperatorDistributedAttribute from .dist_attribute import OperatorDistributedAttribute
...@@ -39,6 +40,10 @@ def set_default_distributed_context(dist_context): ...@@ -39,6 +40,10 @@ def set_default_distributed_context(dist_context):
_g_default_distributed_context = dist_context _g_default_distributed_context = dist_context
def _node_id(node):
return (node.node.graph_id(), node.node.id())
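The rest of this file switches every dictionary key from node.id() to _node_id(node): once the program is converted with all of its blocks, raw node ids are not guaranteed to be unique across sub-graphs, so pairing them with the graph id keeps the keys unique. A toy illustration (the _Toy* classes exist only for this sketch):

def _node_id_sketch(node):
    return (node.node.graph_id(), node.node.id())

class _ToyCoreNode:
    def __init__(self, graph_id, node_id):
        self._graph_id, self._node_id = graph_id, node_id
    def graph_id(self):
        return self._graph_id
    def id(self):
        return self._node_id

class _ToyIrNode:
    def __init__(self, graph_id, node_id):
        self.node = _ToyCoreNode(graph_id, node_id)

main_var = _ToyIrNode(graph_id=0, node_id=7)
sub_var = _ToyIrNode(graph_id=1, node_id=7)  # same raw id in a sub-graph
assert main_var.node.id() == sub_var.node.id()                # raw ids would collide
assert _node_id_sketch(main_var) != _node_id_sketch(sub_var)  # composite keys do not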
class DistributedContext: class DistributedContext:
""" """
DistributedContext is used to collect related distributed information for program and graph. DistributedContext is used to collect related distributed information for program and graph.
...@@ -146,7 +151,7 @@ class DistributedContext: ...@@ -146,7 +151,7 @@ class DistributedContext:
return None return None
def get_dist_tensor_for_graph(self, serial_tensor_node): def get_dist_tensor_for_graph(self, serial_tensor_node):
serial_tensor_node_id = serial_tensor_node.id() serial_tensor_node_id = _node_id(serial_tensor_node)
return self._dist_tensors_for_graph.get(serial_tensor_node_id, None) return self._dist_tensors_for_graph.get(serial_tensor_node_id, None)
def get_dist_op_for_program(self, serial_op): def get_dist_op_for_program(self, serial_op):
...@@ -168,7 +173,7 @@ class DistributedContext: ...@@ -168,7 +173,7 @@ class DistributedContext:
del self._dist_ops_for_program[serial_tensor_id] del self._dist_ops_for_program[serial_tensor_id]
def get_dist_op_for_graph(self, serial_op_node): def get_dist_op_for_graph(self, serial_op_node):
serial_op_node_id = serial_op_node.id() serial_op_node_id = _node_id(serial_op_node)
return self._dist_ops_for_graph.get(serial_op_node_id, None) return self._dist_ops_for_graph.get(serial_op_node_id, None)
def get_tensor_dist_attr_for_program(self, serial_tensor): def get_tensor_dist_attr_for_program(self, serial_tensor):
...@@ -197,7 +202,7 @@ class DistributedContext: ...@@ -197,7 +202,7 @@ class DistributedContext:
self.add_dist_tensor_for_program(dist_tensor) self.add_dist_tensor_for_program(dist_tensor)
def get_tensor_dist_attr_for_graph(self, serial_tensor_node): def get_tensor_dist_attr_for_graph(self, serial_tensor_node):
serial_tensor_node_id = serial_tensor_node.id() serial_tensor_node_id = _node_id(serial_tensor_node)
dist_tensor = self._dist_tensors_for_graph.get(serial_tensor_node_id, dist_tensor = self._dist_tensors_for_graph.get(serial_tensor_node_id,
None) None)
if dist_tensor: if dist_tensor:
...@@ -242,7 +247,7 @@ class DistributedContext: ...@@ -242,7 +247,7 @@ class DistributedContext:
self.add_dist_op_for_program(dist_op) self.add_dist_op_for_program(dist_op)
def get_op_dist_attr_for_graph(self, serial_op_node): def get_op_dist_attr_for_graph(self, serial_op_node):
serial_op_node_id = serial_op_node.id() serial_op_node_id = _node_id(serial_op_node)
dist_op = self._dist_ops_for_graph.get(serial_op_node_id, None) dist_op = self._dist_ops_for_graph.get(serial_op_node_id, None)
if dist_op: if dist_op:
return dist_op.dist_attr return dist_op.dist_attr
...@@ -262,7 +267,7 @@ class DistributedContext: ...@@ -262,7 +267,7 @@ class DistributedContext:
def get_dist_attr_for_graph(self, serial_node): def get_dist_attr_for_graph(self, serial_node):
if serial_node.is_var() and serial_node.var() is not None: if serial_node.is_var() and serial_node.var() is not None:
serial_tensor_node_id = serial_node.id() serial_tensor_node_id = _node_id(serial_node)
dist_tensor = self._dist_tensors_for_graph.get( dist_tensor = self._dist_tensors_for_graph.get(
serial_tensor_node_id, None) serial_tensor_node_id, None)
if dist_tensor: if dist_tensor:
...@@ -270,7 +275,7 @@ class DistributedContext: ...@@ -270,7 +275,7 @@ class DistributedContext:
else: else:
return None return None
if serial_node.is_op() and serial_node.op() is not None: if serial_node.is_op() and serial_node.op() is not None:
serial_op_node_id = serial_node.id() serial_op_node_id = _node_id(serial_node)
dist_op = self._dist_ops_for_graph.get(serial_op_node_id, None) dist_op = self._dist_ops_for_graph.get(serial_op_node_id, None)
if dist_op: if dist_op:
return dist_op.dist_attr return dist_op.dist_attr
...@@ -311,40 +316,69 @@ class DistributedContext: ...@@ -311,40 +316,69 @@ class DistributedContext:
def order_nodes_by_program_order(self): def order_nodes_by_program_order(self):
def _contains(nodes, target_node): def _contains(nodes, target_node):
for node in nodes: for node in nodes:
if node.id() == target_node.id(): if _node_id(node) == _node_id(target_node):
return True return True
return False return False
ordered_tensor_nodes = [] serial_ordered_tensor_nodes = []
ordered_op_nodes = [] serial_ordered_op_nodes = []
all_nodes = self._serial_graph.all_nodes() all_nodes = []
# for idx, graph in enumerate(self._serial_graph.all_sub_graphs()):
for idx, graph in enumerate(self._serial_graph.all_sub_graphs()):
for node in graph.all_nodes():
all_nodes.append(node)
for node in all_nodes: for node in all_nodes:
if node.is_var() and node.var() is not None: if node.is_var() and node.var() is not None:
ordered_tensor_nodes.append(node) serial_ordered_tensor_nodes.append(node)
if node.is_op() and node.op() is not None: if node.is_op() and node.op() is not None:
ordered_op_nodes.append(node) serial_ordered_op_nodes.append(node)
ordered_tensor_nodes.sort(key=lambda node: node.node.original_desc_id()) serial_ordered_tensor_nodes.sort(
ordered_op_nodes.sort(key=lambda node: node.node.original_desc_id()) key=lambda node: node.node.original_desc_id())
for op_node in ordered_op_nodes: serial_ordered_op_nodes.sort(
key=lambda node: node.node.original_desc_id())
num_nodes_before = len(serial_ordered_tensor_nodes) + len(
serial_ordered_op_nodes)
new_serial_ordered_tensor_nodes = []
new_serial_ordered_op_nodes = []
for op_node in serial_ordered_op_nodes:
tensor_nodes = [] tensor_nodes = []
for tensor_node in op_node.inputs: for tensor_node in op_node.inputs:
if tensor_node.is_var() \ if tensor_node.is_var() \
and tensor_node.var() is not None \ and tensor_node.var() is not None \
and not _contains(self._serial_ordered_nodes, tensor_node): and not _contains(self._serial_ordered_nodes, tensor_node):
tensor_nodes.append(tensor_node) tensor_nodes.append(tensor_node)
new_serial_ordered_tensor_nodes.append(tensor_node)
tensor_nodes.sort(key=lambda node: node.node.original_desc_id()) tensor_nodes.sort(key=lambda node: node.node.original_desc_id())
self._serial_ordered_nodes.extend(tensor_nodes) self._serial_ordered_nodes.extend(tensor_nodes)
self._serial_ordered_nodes.append(op_node) self._serial_ordered_nodes.append(op_node)
new_serial_ordered_op_nodes.append(op_node)
tensor_nodes = [] tensor_nodes = []
for tensor_node in op_node.outputs: for tensor_node in op_node.outputs:
if tensor_node.is_var() \ if tensor_node.is_var() \
and tensor_node.var() is not None \ and tensor_node.var() is not None \
and not _contains(self._serial_ordered_nodes, tensor_node): and not _contains(self._serial_ordered_nodes, tensor_node):
tensor_nodes.append(tensor_node) tensor_nodes.append(tensor_node)
new_serial_ordered_tensor_nodes.append(tensor_node)
tensor_nodes.sort(key=lambda node: node.node.original_desc_id())
self._serial_ordered_nodes.extend(tensor_nodes) self._serial_ordered_nodes.extend(tensor_nodes)
num_nodes_before = len(ordered_tensor_nodes) + len(ordered_op_nodes) new_serial_ordered_tensor_nodes.sort(
assert len(self._serial_ordered_nodes) == num_nodes_before, \ key=lambda node: node.node.original_desc_id())
"The number of nodes before ordering is not the same after ordering." new_serial_ordered_op_nodes.sort(
key=lambda node: node.node.original_desc_id())
self._serial_ordered_tensor_nodes = new_serial_ordered_tensor_nodes
self._serial_ordered_op_nodes = new_serial_ordered_op_nodes
assert len(self._serial_ordered_nodes) == len(
self._serial_ordered_tensor_nodes) + len(
self._serial_ordered_op_nodes)
self._serial_orphan_tensor_nodes = []
for tensor_node in serial_ordered_tensor_nodes:
if not _contains(self._serial_ordered_tensor_nodes, tensor_node):
self._serial_orphan_tensor_nodes.append(tensor_node)
if len(self._serial_ordered_nodes) != num_nodes_before:
print(
"WARNING: there are some orphan tensors or ops which are not used in the execution."
)
def init_dist_attr_for_graph(self): def init_dist_attr_for_graph(self):
assert self._is_initialized_for_program, \ assert self._is_initialized_for_program, \
...@@ -352,9 +386,9 @@ class DistributedContext: ...@@ -352,9 +386,9 @@ class DistributedContext:
if self._is_initialized_for_graph: if self._is_initialized_for_graph:
return return
# Convert program to graph # Convert program to graph
set_flags({"FLAGS_convert_all_blocks": True})
self._serial_graph = framework.IrGraph( self._serial_graph = framework.IrGraph(
core.Graph(self._serial_program.desc)) core.Graph(self._serial_program.desc))
all_nodes = self._serial_graph.all_nodes()
self.order_nodes_by_program_order() self.order_nodes_by_program_order()
for node in self.serial_ordered_nodes: for node in self.serial_ordered_nodes:
if node.is_var() and node.var() is not None: if node.is_var() and node.var() is not None:
...@@ -365,10 +399,11 @@ class DistributedContext: ...@@ -365,10 +399,11 @@ class DistributedContext:
if tensor_id == cur_tensor_id \ if tensor_id == cur_tensor_id \
or tensor_id == cur_dist_tensor.serial_tensor.desc.original_id(): or tensor_id == cur_dist_tensor.serial_tensor.desc.original_id():
dist_tensor = cur_dist_tensor dist_tensor = cur_dist_tensor
self._node_id_to_tensor_id[node.id()] = cur_tensor_id self._node_id_to_tensor_id[_node_id(
node)] = cur_tensor_id
assert dist_tensor is not None, \ assert dist_tensor is not None, \
"Tensor must have a distributed tensor after the initialization for program." "Tensor must have a distributed tensor after the initialization for program."
serial_tensor_node_id = node.id() serial_tensor_node_id = _node_id(node)
new_dist_tensor = DistributedTensor(dist_tensor.serial_tensor, new_dist_tensor = DistributedTensor(dist_tensor.serial_tensor,
dist_tensor.dist_attr) dist_tensor.dist_attr)
self._dist_tensors_for_graph[ self._dist_tensors_for_graph[
...@@ -381,10 +416,10 @@ class DistributedContext: ...@@ -381,10 +416,10 @@ class DistributedContext:
if op_id == cur_op_id \ if op_id == cur_op_id \
or op_id == cur_dist_op.serial_op.desc.original_id(): or op_id == cur_dist_op.serial_op.desc.original_id():
dist_op = cur_dist_op dist_op = cur_dist_op
self._node_id_to_op_id[node.id()] = cur_op_id self._node_id_to_op_id[_node_id(node)] = cur_op_id
assert dist_op is not None, \ assert dist_op is not None, \
"Operator must have a distributed operator after the initialization for program." "Operator must have a distributed operator after the initialization for program."
serial_op_node_id = node.id() serial_op_node_id = _node_id(node)
new_dist_op = DistributedOperator(dist_op.serial_op, new_dist_op = DistributedOperator(dist_op.serial_op,
dist_op.dist_attr) dist_op.dist_attr)
self._dist_ops_for_graph[serial_op_node_id] = new_dist_op self._dist_ops_for_graph[serial_op_node_id] = new_dist_op
...@@ -402,10 +437,11 @@ class DistributedContext: ...@@ -402,10 +437,11 @@ class DistributedContext:
assert self._is_initialized_for_program and self._is_initialized_for_graph, \ assert self._is_initialized_for_program and self._is_initialized_for_graph, \
"Both program and graph must be initialized." "Both program and graph must be initialized."
updated_tensors = {} updated_tensors = {}
all_nodes = self._serial_graph.all_nodes() # all_nodes = self._serial_graph.all_nodes()
all_nodes = self._serial_ordered_nodes
for node in all_nodes: for node in all_nodes:
if node.is_var() and node.var() is not None: if node.is_var() and node.var() is not None:
tensor_id = self._node_id_to_tensor_id[node.id()] tensor_id = self._node_id_to_tensor_id[_node_id(node)]
updated = updated_tensors.get(tensor_id, False) updated = updated_tensors.get(tensor_id, False)
# If a var has multiples var nodes in graph, only use the first one for now # If a var has multiples var nodes in graph, only use the first one for now
if not updated: if not updated:
...@@ -416,16 +452,31 @@ class DistributedContext: ...@@ -416,16 +452,31 @@ class DistributedContext:
dist_tensor_for_program.dist_attr = tensor_dist_attr_for_graph dist_tensor_for_program.dist_attr = tensor_dist_attr_for_graph
updated_tensors[tensor_id] = True updated_tensors[tensor_id] = True
if node.is_op() and node.op() is not None: if node.is_op() and node.op() is not None:
op_id = self._node_id_to_op_id[node.id()] op_id = self._node_id_to_op_id[_node_id(node)]
op_dist_attr_for_graph = self.get_op_dist_attr_for_graph(node) op_dist_attr_for_graph = self.get_op_dist_attr_for_graph(node)
dist_op_for_program = self._dist_ops_for_program[op_id] dist_op_for_program = self._dist_ops_for_program[op_id]
dist_op_for_program.dist_attr = op_dist_attr_for_graph dist_op_for_program.dist_attr = op_dist_attr_for_graph
# TODO: the completion algorithm will skip orphan tensors,
# here we just set their process_mesh to the first one.
for orphan_node in self._serial_orphan_tensor_nodes:
serial_tensor_id = orphan_node.var().id()
dist_tensor = self._dist_tensors_for_program.get(serial_tensor_id,
None)
if dist_tensor:
dist_tensor.dist_attr.process_mesh = self._process_meshes[0]
else:
serial_tensor_id = orphan_node.var().original_id()
dist_tensor = self._dist_tensors_for_program.get(
serial_tensor_id, None)
dist_tensor.dist_attr.process_mesh = self._process_meshes[0]
def amend_dist_attr_for_program(self): def amend_dist_attr_for_program(self):
for dist_tensor in self._dist_tensors_for_program.values(): for dist_tensor in self._dist_tensors_for_program.values():
serial_tensor = dist_tensor.serial_tensor serial_tensor = dist_tensor.serial_tensor
dist_attr = dist_tensor.dist_attr dist_attr = dist_tensor.dist_attr
if serial_tensor.type == core.VarDesc.VarType.READER: if serial_tensor.type == core.VarDesc.VarType.READER \
or serial_tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \
or serial_tensor.type == core.VarDesc.VarType.STEP_SCOPES:
tensor_shape = [] tensor_shape = []
else: else:
tensor_shape = serial_tensor.shape tensor_shape = serial_tensor.shape
...@@ -446,6 +497,7 @@ class DistributedContext: ...@@ -446,6 +497,7 @@ class DistributedContext:
tensor_shape = [] tensor_shape = []
else: else:
if dist_op.get_serial_input(arg_name).type == core.VarDesc.VarType.READER \ if dist_op.get_serial_input(arg_name).type == core.VarDesc.VarType.READER \
or dist_op.get_serial_input(arg_name).type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \
or dist_op.serial_op.type == "create_py_reader": or dist_op.serial_op.type == "create_py_reader":
tensor_shape = [] tensor_shape = []
else: else:
...@@ -459,8 +511,9 @@ class DistributedContext: ...@@ -459,8 +511,9 @@ class DistributedContext:
and process_mesh_shape[dims_mapping[i]] > tensor_shape[i]: and process_mesh_shape[dims_mapping[i]] > tensor_shape[i]:
dims_mapping[i] = -1 dims_mapping[i] = -1
for arg_name in serial_op.output_arg_names: for arg_name in serial_op.output_arg_names:
if dist_op.get_serial_output( if dist_op.get_serial_output(arg_name).type == core.VarDesc.VarType.READER \
arg_name).type == core.VarDesc.VarType.READER: or dist_op.get_serial_output(arg_name).type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \
or dist_op.get_serial_output(arg_name).type == core.VarDesc.VarType.STEP_SCOPES:
tensor_shape = [] tensor_shape = []
else: else:
tensor_shape = dist_op.get_serial_output(arg_name).shape tensor_shape = dist_op.get_serial_output(arg_name).shape
...@@ -498,7 +551,8 @@ class DistributedContext: ...@@ -498,7 +551,8 @@ class DistributedContext:
for k, v in self.__dict__.items(): for k, v in self.__dict__.items():
if k == "_serial_program" or k == "_serial_graph" \ if k == "_serial_program" or k == "_serial_graph" \
or k == "_dist_main_programs" or k == "_dist_startup_programs" \ or k == "_dist_main_programs" or k == "_dist_startup_programs" \
or k == "_serial_ordered_nodes": or k == "_serial_ordered_nodes" or k == "_serial_ordered_tensor_nodes" \
or k == "_serial_ordered_op_nodes":
setattr(result, k, v) setattr(result, k, v)
else: else:
setattr(result, k, copy.deepcopy(v, memo)) setattr(result, k, copy.deepcopy(v, memo))
......
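The __deepcopy__ hunk above widens the set of members that are shared by reference instead of deep-copied. A minimal sketch of that selective-copy pattern on a toy class (not DistributedContext itself):

import copy

class SelectiveCopy:
    # Heavyweight or graph-bound members are shared; everything else is copied.
    _SHALLOW_KEYS = {"_serial_graph", "_serial_ordered_nodes"}

    def __init__(self, graph, data):
        self._serial_graph = graph
        self._serial_ordered_nodes = []
        self._data = data

    def __deepcopy__(self, memo):
        cls = self.__class__
        result = cls.__new__(cls)
        memo[id(self)] = result
        for k, v in self.__dict__.items():
            if k in self._SHALLOW_KEYS:
                setattr(result, k, v)                  # share by reference
            else:
                setattr(result, k, copy.deepcopy(v, memo))
        return result

orig = SelectiveCopy(graph=object(), data=[1, 2, 3])
dup = copy.deepcopy(orig)
assert dup._serial_graph is orig._serial_graph
assert dup._data == orig._data and dup._data is not orig._data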
...@@ -76,7 +76,8 @@ class DistributedOperator: ...@@ -76,7 +76,8 @@ class DistributedOperator:
if tensor is None: if tensor is None:
tensor_shape = [] tensor_shape = []
else: else:
if tensor.type == core.VarDesc.VarType.READER: if tensor.type == core.VarDesc.VarType.READER \
or tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY:
tensor_shape = [] tensor_shape = []
else: else:
tensor_shape = tensor.shape tensor_shape = tensor.shape
...@@ -86,7 +87,9 @@ class DistributedOperator: ...@@ -86,7 +87,9 @@ class DistributedOperator:
tensor_dims_mapping) tensor_dims_mapping)
for tensor_name in self._serial_op.output_arg_names: for tensor_name in self._serial_op.output_arg_names:
tensor = self._serial_op.block._var_recursive(tensor_name) tensor = self._serial_op.block._var_recursive(tensor_name)
if tensor.type == core.VarDesc.VarType.READER or tensor.type == core.VarDesc.VarType.STEP_SCOPES: if tensor.type == core.VarDesc.VarType.READER \
or tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \
or tensor.type == core.VarDesc.VarType.STEP_SCOPES:
tensor_shape = [] tensor_shape = []
else: else:
tensor_shape = tensor.shape tensor_shape = tensor.shape
...@@ -95,6 +98,8 @@ class DistributedOperator: ...@@ -95,6 +98,8 @@ class DistributedOperator:
tensor_dims_mapping = [-1 for _ in range(len(tensor_shape))] tensor_dims_mapping = [-1 for _ in range(len(tensor_shape))]
self._dist_attr.set_output_dims_mapping(tensor_name, self._dist_attr.set_output_dims_mapping(tensor_name,
tensor_dims_mapping) tensor_dims_mapping)
if self._dist_attr.op_type is None:
self._dist_attr.op_type = self.serial_op.type
if self._dist_attr.impl_type is None: if self._dist_attr.impl_type is None:
self._dist_attr.impl_type = "default" self._dist_attr.impl_type = "default"
if self._dist_attr.impl_idx is None: if self._dist_attr.impl_idx is None:
...@@ -134,11 +139,15 @@ class DistributedOperator: ...@@ -134,11 +139,15 @@ class DistributedOperator:
return new_dist_attr return new_dist_attr
def validate_dist_attr(self): def validate_dist_attr(self):
if "read" in self.serial_op.type: if "read" in self.serial_op.type or "while" == self.serial_op.type:
return True return True
for name in self.serial_op.input_arg_names: for name in self.serial_op.input_arg_names:
input_dist_attr = self.dist_attr.get_input_dist_attr(name) input_dist_attr = self.dist_attr.get_input_dist_attr(name)
dims_mapping = input_dist_attr.dims_mapping dims_mapping = input_dist_attr.dims_mapping
if self.get_serial_input(
name).type == core.VarDesc.VarType.LOD_TENSOR_ARRAY:
shape = []
else:
shape = self.get_serial_input(name).shape shape = self.get_serial_input(name).shape
if len(shape) != len(dims_mapping): if len(shape) != len(dims_mapping):
return False return False
...@@ -155,6 +164,10 @@ class DistributedOperator: ...@@ -155,6 +164,10 @@ class DistributedOperator:
for name in self.serial_op.output_arg_names: for name in self.serial_op.output_arg_names:
output_dist_attr = self.dist_attr.get_output_dist_attr(name) output_dist_attr = self.dist_attr.get_output_dist_attr(name)
dims_mapping = output_dist_attr.dims_mapping dims_mapping = output_dist_attr.dims_mapping
if self.get_serial_output(name).type == core.VarDesc.VarType.LOD_TENSOR_ARRAY\
or self.get_serial_output(name).type == core.VarDesc.VarType.STEP_SCOPES:
shape = []
else:
shape = self.get_serial_output(name).shape shape = self.get_serial_output(name).shape
if len(shape) != len(dims_mapping): if len(shape) != len(dims_mapping):
return False return False
...@@ -241,14 +254,14 @@ class DistributedModule: ...@@ -241,14 +254,14 @@ class DistributedModule:
def __call__(self, *args, **kwargs): def __call__(self, *args, **kwargs):
from .dist_context import get_default_distributed_context from .dist_context import get_default_distributed_context
main_prog = paddle.fluid.default_main_program() default_prog = paddle.fluid.default_main_program()
main_block = main_prog.global_block() cur_block = default_prog.current_block()
op_size = len(main_block.ops) op_size = len(cur_block.ops)
output = self._serial_module(*args, **kwargs) output = self._serial_module(*args, **kwargs)
new_op_size = len(main_block.ops) new_op_size = len(cur_block.ops)
default_dist_ctx = get_default_distributed_context() default_dist_ctx = get_default_distributed_context()
for idx in range(op_size, new_op_size): for idx in range(op_size, new_op_size):
op = main_block.ops[idx] op = cur_block.ops[idx]
dist_op = DistributedOperator(op, self._dist_attr) dist_op = DistributedOperator(op, self._dist_attr)
dist_op.dist_attr.mark_annotated_as(self._dist_attr) dist_op.dist_attr.mark_annotated_as(self._dist_attr)
default_dist_ctx.add_dist_op_for_program(dist_op) default_dist_ctx.add_dist_op_for_program(dist_op)
......
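The DistributedModule change above swaps default_main_program().global_block() for current_block(), so ops created while a control-flow sub-block is active are still captured by the before/after op count. The capture pattern itself, on a toy block with an append-only ops list:

class _ToyBlock:
    def __init__(self):
        self.ops = []

def call_and_capture(block, fn, *args, **kwargs):
    # Record how many ops the block had, run the callable, and return the
    # ops that were appended while it ran.
    op_size = len(block.ops)
    output = fn(*args, **kwargs)
    return output, block.ops[op_size:]

block = _ToyBlock()
_, new_ops = call_and_capture(block, lambda: block.ops.append("matmul_v2"))
assert new_ops == ["matmul_v2"]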
...@@ -184,7 +184,9 @@ class DistributedTensor: ...@@ -184,7 +184,9 @@ class DistributedTensor:
def _init_default_dist_attr(self): def _init_default_dist_attr(self):
if self._dist_attr.dims_mapping is None: if self._dist_attr.dims_mapping is None:
if self.serial_tensor.type == core.VarDesc.VarType.READER: if self.serial_tensor.type == core.VarDesc.VarType.READER \
or self.serial_tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \
or self.serial_tensor.type == core.VarDesc.VarType.STEP_SCOPES:
tensor_shape = [] tensor_shape = []
else: else:
tensor_shape = self._serial_tensor.shape tensor_shape = self._serial_tensor.shape
...@@ -192,7 +194,9 @@ class DistributedTensor: ...@@ -192,7 +194,9 @@ class DistributedTensor:
self._dist_attr.dims_mapping = tensor_dims_mapping self._dist_attr.dims_mapping = tensor_dims_mapping
def validate_dist_attr(self): def validate_dist_attr(self):
if self.serial_tensor.type == core.VarDesc.VarType.READER: if self.serial_tensor.type == core.VarDesc.VarType.READER \
or self.serial_tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \
or self.serial_tensor.type == core.VarDesc.VarType.STEP_SCOPES:
return True return True
tensor_shape = self.serial_tensor.shape tensor_shape = self.serial_tensor.shape
if len(tensor_shape) != len(self.dist_attr.dims_mapping): if len(tensor_shape) != len(self.dist_attr.dims_mapping):
......
...@@ -259,7 +259,7 @@ class Engine: ...@@ -259,7 +259,7 @@ class Engine:
"train_" + name: val "train_" + name: val
for name, val in logs.items() for name, val in logs.items()
} }
self._logger.info(logs) self._logger.info(train_logs)
def _train_step(self, data): def _train_step(self, data):
logs = {} logs = {}
......
...@@ -17,7 +17,9 @@ from ..dist_attribute import OperatorDistributedAttribute ...@@ -17,7 +17,9 @@ from ..dist_attribute import OperatorDistributedAttribute
_g_distributed_operator_impl_containers = {} _g_distributed_operator_impl_containers = {}
_g_elementwise_ops = ["elementwise_add", "gelu", "dropout", "cast"] _g_elementwise_ops = [
"elementwise_add", "gelu", "dropout", "cast", "gather", "concat"
]
BACKWARD_ONLY_DIST_OPS = {'check_finite_and_unscale', 'update_loss_scaling'} BACKWARD_ONLY_DIST_OPS = {'check_finite_and_unscale', 'update_loss_scaling'}
......
...@@ -55,9 +55,14 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): ...@@ -55,9 +55,14 @@ class DistributedDefaultImpl0(DistributedOperatorImpl):
op_dist_attr = dist_op.dist_attr op_dist_attr = dist_op.dist_attr
for arg_name in op_desc.input_arg_names(): for arg_name in op_desc.input_arg_names():
serial_tensor = dist_op.get_serial_input(arg_name) serial_tensor = dist_op.get_serial_input(arg_name)
if serial_tensor.is_parameter:
continue
dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name)
if serial_tensor.is_parameter:
for mapping in dims_mapping:
if mapping != -1:
return False
# continue
# if len(dims_mapping) < 1:
# continue
if len(dims_mapping) > 1: if len(dims_mapping) > 1:
for mapping in dims_mapping[1:]: for mapping in dims_mapping[1:]:
if mapping != -1: if mapping != -1:
...@@ -73,9 +78,14 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): ...@@ -73,9 +78,14 @@ class DistributedDefaultImpl0(DistributedOperatorImpl):
xshape_arg_names = op_desc.output("XShape") xshape_arg_names = op_desc.output("XShape")
for arg_name in op_desc.output_arg_names(): for arg_name in op_desc.output_arg_names():
serial_tensor = dist_op.get_serial_output(arg_name) serial_tensor = dist_op.get_serial_output(arg_name)
if serial_tensor.is_parameter:
continue
dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name)
if serial_tensor.is_parameter:
for mapping in dims_mapping:
if mapping != -1:
return False
# continue
# if len(dims_mapping) < 1:
# continue
if arg_name not in xshape_arg_names: if arg_name not in xshape_arg_names:
if len(dims_mapping) > 1: if len(dims_mapping) > 1:
for mapping in dims_mapping[1:]: for mapping in dims_mapping[1:]:
...@@ -104,6 +114,7 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): ...@@ -104,6 +114,7 @@ class DistributedDefaultImpl0(DistributedOperatorImpl):
for mapping in dims_mapping[1:]: for mapping in dims_mapping[1:]:
if mapping != -1: if mapping != -1:
return False return False
if len(dims_mapping) >= 1:
batch_dim_mappings.append(dims_mapping[0]) batch_dim_mappings.append(dims_mapping[0])
# Check output compatibility # Check output compatibility
...@@ -121,6 +132,7 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): ...@@ -121,6 +132,7 @@ class DistributedDefaultImpl0(DistributedOperatorImpl):
for mapping in dims_mapping[1:]: for mapping in dims_mapping[1:]:
if mapping != -1: if mapping != -1:
return False return False
if len(dims_mapping) >= 1:
batch_dim_mappings.append(dims_mapping[0]) batch_dim_mappings.append(dims_mapping[0])
else: else:
if dims_mapping[0] != -1: if dims_mapping[0] != -1:
...@@ -129,6 +141,7 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): ...@@ -129,6 +141,7 @@ class DistributedDefaultImpl0(DistributedOperatorImpl):
for mapping in dims_mapping[2:]: for mapping in dims_mapping[2:]:
if mapping != -1: if mapping != -1:
return False return False
if len(dims_mapping) >= 2:
batch_dim_mappings.append(dims_mapping[1]) batch_dim_mappings.append(dims_mapping[1])
# Check batch dim mapping compatibility # Check batch dim mapping compatibility
...@@ -143,7 +156,9 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): ...@@ -143,7 +156,9 @@ class DistributedDefaultImpl0(DistributedOperatorImpl):
op_desc = dist_op.serial_op.desc op_desc = dist_op.serial_op.desc
op_dist_attr = dist_op.dist_attr op_dist_attr = dist_op.dist_attr
# The following statement will be replaced by a more elegant way # The following statement will be replaced by a more elegant way
if op_desc.type() == "shape" or op_desc.type() == "slice": if op_desc.type() == "shape" \
or op_desc.type() == "slice" \
or op_desc.type() == "while":
return False return False
output_names = op_desc.output_names() output_names = op_desc.output_names()
xshape_arg_names = [] xshape_arg_names = []
...@@ -155,6 +170,7 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): ...@@ -155,6 +170,7 @@ class DistributedDefaultImpl0(DistributedOperatorImpl):
if serial_tensor.is_parameter: if serial_tensor.is_parameter:
continue continue
dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name)
if len(dims_mapping) >= 1:
batch_dim_mappings.append(dims_mapping[0]) batch_dim_mappings.append(dims_mapping[0])
for arg_name in op_desc.output_arg_names(): for arg_name in op_desc.output_arg_names():
serial_tensor = dist_op.get_serial_output(arg_name) serial_tensor = dist_op.get_serial_output(arg_name)
...@@ -162,10 +178,14 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): ...@@ -162,10 +178,14 @@ class DistributedDefaultImpl0(DistributedOperatorImpl):
continue continue
dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name)
if arg_name not in xshape_arg_names: if arg_name not in xshape_arg_names:
if len(dims_mapping) >= 1:
batch_dim_mappings.append(dims_mapping[0]) batch_dim_mappings.append(dims_mapping[0])
else: else:
batch_dim_mappings.append(dims_mapping[1]) batch_dim_mappings.append(dims_mapping[1])
if not batch_dim_mappings:
return changed
compatible_dim_mapping = compute_compatible_dim_mapping( compatible_dim_mapping = compute_compatible_dim_mapping(
batch_dim_mappings) batch_dim_mappings)
assert compatible_dim_mapping is not None, "There is no compatible dim mapping." assert compatible_dim_mapping is not None, "There is no compatible dim mapping."
...@@ -174,7 +194,8 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): ...@@ -174,7 +194,8 @@ class DistributedDefaultImpl0(DistributedOperatorImpl):
if serial_tensor.is_parameter: if serial_tensor.is_parameter:
continue continue
dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name)
if compatible_dim_mapping != dims_mapping[0]: if len(dims_mapping
) >= 1 and compatible_dim_mapping != dims_mapping[0]:
dims_mapping[0] = compatible_dim_mapping dims_mapping[0] = compatible_dim_mapping
changed = True changed = True
for arg_name in op_desc.output_arg_names(): for arg_name in op_desc.output_arg_names():
...@@ -183,11 +204,13 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): ...@@ -183,11 +204,13 @@ class DistributedDefaultImpl0(DistributedOperatorImpl):
continue continue
dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name)
if arg_name not in xshape_arg_names: if arg_name not in xshape_arg_names:
if compatible_dim_mapping != dims_mapping[0]: if len(dims_mapping
) >= 1 and compatible_dim_mapping != dims_mapping[0]:
dims_mapping[0] = compatible_dim_mapping dims_mapping[0] = compatible_dim_mapping
changed = True changed = True
else: else:
if compatible_dim_mapping != dims_mapping[1]: if len(dims_mapping
) >= 2 and compatible_dim_mapping != dims_mapping[1]:
dims_mapping[1] = compatible_dim_mapping dims_mapping[1] = compatible_dim_mapping
changed = True changed = True
......
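update_dims_mapping in DistributedDefaultImpl0 feeds the collected batch-dimension mappings into compute_compatible_dim_mapping before writing the result back, and the new guard returns early when the list is empty. The single-dimension analog of the compatibility rule sketched earlier, again an assumption since the function body is not in this diff:

# Assumed single-dimension rule: -1 means replicated; any two sharded values
# must agree, otherwise there is no compatible mapping.
def compute_compatible_dim_mapping_sketch(dim_mappings):
    compatible = -1
    for dim in dim_mappings:
        if dim == -1:
            continue
        if compatible == -1:
            compatible = dim
        elif compatible != dim:
            return None
    return compatible

assert compute_compatible_dim_mapping_sketch([-1, 0, -1]) == 0
assert compute_compatible_dim_mapping_sketch([0, 1]) is None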
...@@ -1432,7 +1432,6 @@ class DistributedMatmulV2Impl2(DistributedOperatorImpl): ...@@ -1432,7 +1432,6 @@ class DistributedMatmulV2Impl2(DistributedOperatorImpl):
if is_valid_list_index(y_dims_mapping, if is_valid_list_index(y_dims_mapping,
-2) and is_dim_shard(y_dims_mapping[-2]): -2) and is_dim_shard(y_dims_mapping[-2]):
return False return False
return True return True
def is_output_compatible(self, dist_op): def is_output_compatible(self, dist_op):
......
...@@ -1271,7 +1271,6 @@ def get_all_distributed_main_program(serial_program_info, dist_context, ...@@ -1271,7 +1271,6 @@ def get_all_distributed_main_program(serial_program_info, dist_context,
used_dist_context._dist_op_context = DistributedOperatorContext() used_dist_context._dist_op_context = DistributedOperatorContext()
_, _, dist_startup_program, dist_main_program, _ = copied_parallelizer._get_dist_program( _, _, dist_startup_program, dist_main_program, _ = copied_parallelizer._get_dist_program(
rank_id, used_dist_context) rank_id, used_dist_context)
# print("dist_main_program: ", dist_main_program)
all_dist_main_program.append(dist_main_program) all_dist_main_program.append(dist_main_program)
return all_dist_main_program return all_dist_main_program
......
...@@ -228,3 +228,5 @@ if core.is_compiled_with_npu(): ...@@ -228,3 +228,5 @@ if core.is_compiled_with_npu():
atexit.register(core.clear_executor_cache) atexit.register(core.clear_executor_cache)
# NOTE(Aganlengzi): clean up KernelFactory in advance manually. # NOTE(Aganlengzi): clean up KernelFactory in advance manually.
atexit.register(core.clear_kernel_factory) atexit.register(core.clear_kernel_factory)
# NOTE(wangran16): clean up DeviceManager in advance manually.
atexit.register(core.clear_device_manager)
...@@ -173,6 +173,9 @@ if core.is_compiled_with_xpu(): ...@@ -173,6 +173,9 @@ if core.is_compiled_with_xpu():
elif core.is_compiled_with_npu(): elif core.is_compiled_with_npu():
_, _, _sys_unsupported_fp16_list = core.op_supported_infos( _, _, _sys_unsupported_fp16_list = core.op_supported_infos(
'NPU', core.VarDesc.VarType.FP16) 'NPU', core.VarDesc.VarType.FP16)
elif core.is_compiled_with_mlu():
_, _, _sys_unsupported_fp16_list = core.op_supported_infos(
'MLU', core.VarDesc.VarType.FP16)
else: else:
_, _, _sys_unsupported_fp16_list = core.op_supported_infos( _, _, _sys_unsupported_fp16_list = core.op_supported_infos(
'GPU', core.VarDesc.VarType.FP16) 'GPU', core.VarDesc.VarType.FP16)
......
...@@ -29,10 +29,11 @@ from .asp import decorate ...@@ -29,10 +29,11 @@ from .asp import decorate
from .asp import prune_model from .asp import prune_model
from .asp import set_excluded_layers from .asp import set_excluded_layers
from .asp import reset_excluded_layers from .asp import reset_excluded_layers
from .supported_layer_list import add_supported_layer
__all__ = [ __all__ = [
'calculate_density', 'check_mask_1d', 'get_mask_1d', 'check_mask_2d', 'calculate_density', 'check_mask_1d', 'get_mask_1d', 'check_mask_2d',
'get_mask_2d_greedy', 'get_mask_2d_best', 'create_mask', 'check_sparsity', 'get_mask_2d_greedy', 'get_mask_2d_best', 'create_mask', 'check_sparsity',
'MaskAlgo', 'CheckMethod', 'decorate', 'prune_model', 'set_excluded_layers', 'MaskAlgo', 'CheckMethod', 'decorate', 'prune_model', 'set_excluded_layers',
'reset_excluded_layers' 'reset_excluded_layers', 'add_supported_layer'
] ]
...@@ -23,6 +23,8 @@ import paddle ...@@ -23,6 +23,8 @@ import paddle
from paddle.fluid import global_scope, program_guard, layers from paddle.fluid import global_scope, program_guard, layers
from paddle.fluid.initializer import ConstantInitializer from paddle.fluid.initializer import ConstantInitializer
from paddle.fluid.contrib import sparsity from paddle.fluid.contrib import sparsity
from paddle.fluid.contrib.sparsity.supported_layer_list import supported_layers_and_prune_func_map
from paddle.fluid.contrib.sparsity.supported_layer_list import _default_pruning
from paddle.fluid import core from paddle.fluid import core
OpRole = core.op_proto_and_checker_maker.OpRole OpRole = core.op_proto_and_checker_maker.OpRole
...@@ -292,8 +294,8 @@ class ASPHelper(object): ...@@ -292,8 +294,8 @@ class ASPHelper(object):
2. pruning well-trained models into 2:4 sparse pattern on FP16 or 1:2 sparse pattern on FP32 for fine-tuning. 2. pruning well-trained models into 2:4 sparse pattern on FP16 or 1:2 sparse pattern on FP32 for fine-tuning.
""" """
MASK_APPENDDED_NAME = '_asp_mask' MASK_APPENDDED_NAME = 'asp_mask'
SUPPORTED_LAYERS = {'fc': 'w_0', 'linear': 'w_0', 'conv2d': 'w_0'} PADDLE_WEIGHT_SUFFIX = "w_"
__asp_info = {} __asp_info = {}
...@@ -334,7 +336,6 @@ class ASPHelper(object): ...@@ -334,7 +336,6 @@ class ASPHelper(object):
r""" r"""
This is the implementation of `sparsity.prune_model`, for details please see explanation in `sparsity.prune_model`. This is the implementation of `sparsity.prune_model`, for details please see explanation in `sparsity.prune_model`.
""" """
checked_func_name = sparsity.CheckMethod.get_checking_method(mask_algo)
if main_program is None: if main_program is None:
main_program = paddle.static.default_main_program() main_program = paddle.static.default_main_program()
...@@ -345,33 +346,27 @@ class ASPHelper(object): ...@@ -345,33 +346,27 @@ class ASPHelper(object):
weight_tensor = global_scope().find_var(param.name).get_tensor() weight_tensor = global_scope().find_var(param.name).get_tensor()
weight_nparray = np.array(weight_tensor) weight_nparray = np.array(weight_tensor)
# The double transpose ops here make sure pruning direction consistent with cuSparseLt. prune_func = ASPHelper._get_prune_func_by_name(param.name)
# SPMMA in cuSparseLt: D = (AxB) + C, where matrix A (mxk) is sparse matrix.
# cuSparseLt would prune matrix A along k dimension. weight_pruned_nparray, weight_sparse_mask = \
# In sparse training, layer weight matriices is viewed sparse matrix A, so prune_func(weight_nparray, m, n, mask_algo, param.name)
# the math fomula should be 'Act(WX + b)'. However, default fomula in PaddlePaddle weight_pruned_nparray = weight_pruned_nparray.astype(
# is 'Act(XW + b)'. For enabling SPMMA, weights and inputs should be transposed weight_nparray.dtype)
# for computing, Act( (W^T X^T)^T + b). Therefore, we have to prune alog k dimension
# of W^T, which is m dimension of W. Moreove, all mask generating functions in
# sparsity/utils is row-major pruning. That is the reason we have to transpose weight
# matrices beforce invoking create_mask. Then we transpose the result maks to make
# sure its shape to be the same as the input weight.
weight_sparse_mask = sparsity.create_mask(
weight_nparray.T, func_name=mask_algo, n=n, m=m).T
weight_pruned_nparray = np.multiply(weight_nparray,
weight_sparse_mask)
weight_tensor.set(weight_pruned_nparray, place) weight_tensor.set(weight_pruned_nparray, place)
assert sparsity.check_sparsity(weight_pruned_nparray.T, n=n, m=m, func_name=checked_func_name), \
'Pruning {} weight matrix failure!!!'.format(param.name)
if with_mask: if with_mask:
weight_mask_param = global_scope().find_var( weight_mask_param = global_scope().find_var(
ASPHelper._get_mask_name(param.name)) ASPHelper._get_mask_name(param.name))
assert weight_mask_param is not None, \ assert weight_mask_param is not None, \
'Cannot find {} variable, please call ASPHelper.minimize' \ 'Cannot find {} variable, please call optimizer.minimize (' \
'paddle.sparsity.decorate(optimizer).minimize(loss)' \
' and initialization (exe.run(startup_program)) first!'.format(ASPHelper._get_mask_name(param.name)) ' and initialization (exe.run(startup_program)) first!'.format(ASPHelper._get_mask_name(param.name))
weight_mask_tensor = weight_mask_param.get_tensor() weight_mask_tensor = weight_mask_param.get_tensor()
weight_sparse_mask = weight_sparse_mask.astype(
np.array(weight_mask_tensor).dtype)
weight_mask_tensor.set(weight_sparse_mask, place) weight_mask_tensor.set(weight_sparse_mask, place)
asp_info.update_masks(param.name, weight_sparse_mask) asp_info.update_masks(param.name, weight_sparse_mask)
return asp_info.masks.copy() return asp_info.masks.copy()
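For orientation, here is a hedged sketch of how this per-parameter pruning loop is reached from user code; it mirrors the static-graph usage exercised by the tests later in this diff, and the tiny network is a made-up example:

import paddle
import paddle.fluid as fluid
from paddle.fluid.contrib import sparsity

paddle.enable_static()
main_program = fluid.Program()
startup_program = fluid.Program()
with fluid.program_guard(main_program, startup_program):
    img = fluid.data(name='img', shape=[None, 32], dtype='float32')
    hidden = fluid.layers.fc(input=img, size=32, act='relu')

exe = fluid.Executor(paddle.CPUPlace())
exe.run(startup_program)
# prune_model dispatches to ASPHelper, which runs the loop above and
# calls the per-layer prune_func for every supported parameter.
sparsity.prune_model(main_program, mask_algo="mask_1d", with_mask=False)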
@staticmethod @staticmethod
...@@ -384,7 +379,7 @@ class ASPHelper(object): ...@@ -384,7 +379,7 @@ class ASPHelper(object):
Returns: Returns:
string: The mask name of :attr:`param_name`. string: The mask name of :attr:`param_name`.
""" """
return param_name + ASPHelper.MASK_APPENDDED_NAME return param_name + "." + ASPHelper.MASK_APPENDDED_NAME
@staticmethod @staticmethod
def _get_not_ASP_relevant_vars(main_program): def _get_not_ASP_relevant_vars(main_program):
...@@ -434,19 +429,46 @@ class ASPHelper(object): ...@@ -434,19 +429,46 @@ class ASPHelper(object):
# fc_0.w_0 -> True # fc_0.w_0 -> True
# fc_0.b_0 -> False # fc_0.b_0 -> False
""" """
if ASPHelper.MASK_APPENDDED_NAME in param_name: param_name_list = param_name.split('.')
if ASPHelper.MASK_APPENDDED_NAME in param_name_list:
return False return False
for layer in cls._get_program_asp_info(main_program).excluded_layers: for layer in cls._get_program_asp_info(main_program).excluded_layers:
if layer in param_name: if layer in param_name:
return False return False
for name in ASPHelper.SUPPORTED_LAYERS: if param_name in supported_layers_and_prune_func_map:
if name in param_name and \
ASPHelper.SUPPORTED_LAYERS[name] in param_name:
return True return True
param_name_no_weight_suffix = param_name_list[0]
param_type_suffix = param_name_list[1]
layer_name = param_name_no_weight_suffix[:param_name_no_weight_suffix.
rfind('_')]
if ASPHelper.PADDLE_WEIGHT_SUFFIX not in param_type_suffix:
return False return False
if param_name_no_weight_suffix in supported_layers_and_prune_func_map or \
layer_name in supported_layers_and_prune_func_map:
return True
return False
@classmethod
def _get_prune_func_by_name(cls, param_name):
func = supported_layers_and_prune_func_map.get(param_name, None)
param_name_no_weight_suffix = param_name.split('.')[0]
if func is None:
func = supported_layers_and_prune_func_map.get(
param_name_no_weight_suffix, None)
if func is None:
layer_name = param_name_no_weight_suffix[:
param_name_no_weight_suffix.
rfind('_')]
func = supported_layers_and_prune_func_map.get(layer_name,
_default_pruning)
return func
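To make the fallback order above concrete, a minimal sketch of the name parsing (the parameter name "fc_0.w_0" echoes the docstring comment above and is only illustrative):

param_name = "fc_0.w_0"                    # illustrative parameter name
prefix = param_name.split('.')[0]          # "fc_0"
layer_name = prefix[:prefix.rfind('_')]    # "fc"
# _get_prune_func_by_name tries the map with "fc_0.w_0", then "fc_0",
# then "fc", and finally falls back to _default_pruning.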
@classmethod @classmethod
def _minimize(cls, def _minimize(cls,
optimizer, optimizer,
...@@ -509,8 +531,7 @@ class ASPHelper(object): ...@@ -509,8 +531,7 @@ class ASPHelper(object):
if ASPHelper._is_supported_layer(main_program, if ASPHelper._is_supported_layer(main_program,
param_and_grad[0].name): param_and_grad[0].name):
mask_param = layers.create_parameter( mask_param = layers.create_parameter(
name=param_and_grad[0].name + name=ASPHelper._get_mask_name(param_and_grad[0].name),
ASPHelper.MASK_APPENDDED_NAME,
shape=param_and_grad[0].shape, shape=param_and_grad[0].shape,
dtype=param_and_grad[0].dtype, dtype=param_and_grad[0].dtype,
default_initializer=ConstantInitializer(value=1.0)) default_initializer=ConstantInitializer(value=1.0))
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2022 NVIDIA Corporation. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle
from paddle.fluid.contrib import sparsity
import threading
__all__ = ['add_supported_layer']
def _default_pruning(weight_nparray, m, n, func_name, param_name):
checked_func_name = sparsity.CheckMethod.get_checking_method(func_name)
# The double transpose ops here make sure the pruning direction is consistent with cuSparseLt.
# SPMMA in cuSparseLt: D = (AxB) + C, where matrix A (mxk) is the sparse matrix.
# cuSparseLt would prune matrix A along the k dimension.
# In sparse training, a layer's weight matrix is viewed as the sparse matrix A, so
# the math formula should be 'Act(WX + b)'. However, the default formula in PaddlePaddle
# is 'Act(XW + b)'. To enable SPMMA, weights and inputs should be transposed
# for computing, Act( (W^T X^T)^T + b). Therefore, we have to prune along the k dimension
# of W^T, which is the m dimension of W. Moreover, all mask generating functions in
# sparsity/utils do row-major pruning. That is the reason we have to transpose the weight
# matrices before invoking create_mask. Then we transpose the resulting mask to make
# sure its shape is the same as the input weight.
weight_sparse_mask = sparsity.create_mask(
weight_nparray.T, func_name=func_name, n=n, m=m).T
weight_pruned_nparray = np.multiply(weight_nparray, weight_sparse_mask)
assert sparsity.check_sparsity(weight_pruned_nparray.T, n=n, m=m, func_name=checked_func_name), \
'Pruning {} weight matrix failure!!!'.format(param_name)
return weight_pruned_nparray, weight_sparse_mask
# When the value of a given key in this dict is None,
# ASP will call the default pruning function in the pruning stage.
_supported_layers_and_prune_func_map_lock = threading.Lock()
supported_layers_and_prune_func_map = {}
def add_supported_layer(layer, pruning_func=None):
r"""
Add a supported layer and its corresponding pruning function.
Args:
layer (string|Layer): The name or type of the layer to support. If `layer` is a `Layer`, it is
converted to a string internally. ASP uses this name to match parameter names and to call
the corresponding pruning function.
pruning_func (function, optional): a function that receives five arguments (weight_nparray,
m, n, func_name, param_name); weight_nparray is the weight as a numpy ndarray and param_name is
the weight's name; for m, n, and func_name, please see `prune_model` for details.
"""
name = None
if isinstance(layer, str):
name = layer
elif isinstance(layer, paddle.fluid.dygraph.layers.Layer):
name = paddle.fluid.dygraph.layers._convert_camel_to_snake(
type(layer).__name__)
elif issubclass(layer, paddle.fluid.dygraph.layers.Layer):
name = paddle.fluid.dygraph.layers._convert_camel_to_snake(
layer.__name__)
else:
assert "The type of layer should be string of Layer, but got {}!".format(
type(layer))
if pruning_func is None:
pruning_func = _default_pruning
_supported_layers_and_prune_func_map_lock.acquire()
supported_layers_and_prune_func_map.update({name: pruning_func})
_supported_layers_and_prune_func_map_lock.release()
add_supported_layer('fc')
add_supported_layer('linear')
add_supported_layer('conv2d')
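As a hedged usage sketch of the registration API defined above (the layer name "my_layer" and the function my_pruning are made-up; omitting pruning_func falls back to _default_pruning):

import numpy as np
from paddle.fluid.contrib import sparsity

def my_pruning(weight_nparray, m, n, func_name, param_name):
    # Toy pruning function with the required five-argument signature:
    # keep the weight unchanged and return an all-ones mask of the same shape.
    mask = np.ones_like(weight_nparray)
    return weight_nparray, mask

# Parameters whose names match "my_layer" will be pruned with my_pruning;
# registering a name without a function would use _default_pruning instead.
sparsity.add_supported_layer("my_layer", my_pruning)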
...@@ -564,6 +564,14 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): ...@@ -564,6 +564,14 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
self._rcvd_idx += 1 self._rcvd_idx += 1
self._batches_outstanding -= 1 self._batches_outstanding -= 1
else: else:
# NOTE: when _rcvd_idx catches up with _send_idx, it means
# one of the following:
# 1. all 2 * num_workers batches have been loaded
# and stored in _blocking_queue
# 2. all data has been drained
# In either case we let _thread block on _data_queue
# get_data so it does not busy-wait and take CPU time
# away from model execution
# NOTE: in persistent workers mode, do not check data # NOTE: in persistent workers mode, do not check data
# drained here, simply let it go to _data_queue # drained here, simply let it go to _data_queue
# reading to get _ResumeIteration # reading to get _ResumeIteration
...@@ -573,7 +581,6 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): ...@@ -573,7 +581,6 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
# may also be data in blocking queue # may also be data in blocking queue
if self._batches_outstanding < len(self._places): if self._batches_outstanding < len(self._places):
return None return None
continue
if self._rcvd_idx in self._task_infos and \ if self._rcvd_idx in self._task_infos and \
len(self._task_infos[self._rcvd_idx]) == 3: len(self._task_infos[self._rcvd_idx]) == 3:
......
...@@ -271,13 +271,14 @@ def amp_guard(enable=True, ...@@ -271,13 +271,14 @@ def amp_guard(enable=True,
"current_tracer is None, maybe it is not in imperative mode.") "current_tracer is None, maybe it is not in imperative mode.")
# check device_type: # check device_type:
# NOTE: Now, amp only support gpu for float16 and bfloat16, xpu for float16, npu for float16. # NOTE: Now, amp only support gpu for float16 and bfloat16, xpu for float16, mlu for float16, npu for float16.
# Maybe we will support cpu for bfloat16. # Maybe we will support cpu for bfloat16.
if enable and not (tracer._expected_place.is_gpu_place() or if enable and not (tracer._expected_place.is_gpu_place() or
tracer._expected_place.is_xpu_place() or tracer._expected_place.is_xpu_place() or
tracer._expected_place.is_mlu_place() or
tracer._expected_place.is_npu_place()): tracer._expected_place.is_npu_place()):
warnings.warn( warnings.warn(
'amp_guard can only be enabled on CUDAPlace, XPUPlace, and NPUPlace, current place is %s, so it makes no effect.' 'amp_guard can only be enabled on CUDAPlace, XPUPlace, MLUPlace, and NPUPlace, current place is %s, so it makes no effect.'
% tracer._expected_place) % tracer._expected_place)
enable = False enable = False
# For npu: # For npu:
...@@ -288,6 +289,10 @@ def amp_guard(enable=True, ...@@ -288,6 +289,10 @@ def amp_guard(enable=True,
if tracer._expected_place.is_xpu_place() and (dtype == 'bfloat16'): if tracer._expected_place.is_xpu_place() and (dtype == 'bfloat16'):
warnings.warn('XPUPlace only support float16 amp.') warnings.warn('XPUPlace only support float16 amp.')
enable = False enable = False
# For mlu:
if tracer._expected_place.is_mlu_place() and (dtype == 'bfloat16'):
warnings.warn('MLUPlace only support float16 amp.')
enable = False
# For gpu float16: Compute Capability should >= 7. # For gpu float16: Compute Capability should >= 7.
# For gpu bfloat16: Compute Capability should >= 8 & CUDA Version should >= 11. # For gpu bfloat16: Compute Capability should >= 8 & CUDA Version should >= 11.
if tracer._expected_place.is_gpu_place(): if tracer._expected_place.is_gpu_place():
......
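As a hedged illustration of the device/dtype restriction this hunk adds (a minimal sketch using the public paddle.amp.auto_cast entry point, which wraps amp_guard; the model and input are made up):

import paddle

model = paddle.nn.Linear(4, 4)
x = paddle.rand([2, 4])

# On an MLU build only float16 AMP is supported; requesting bfloat16 there
# would emit the warning added above and silently disable AMP.
with paddle.amp.auto_cast(enable=True, dtype='float16'):
    out = model(x)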
...@@ -106,9 +106,10 @@ class AmpScaler(object): ...@@ -106,9 +106,10 @@ class AmpScaler(object):
if enable and not (tracer._expected_place.is_gpu_place() or if enable and not (tracer._expected_place.is_gpu_place() or
tracer._expected_place.is_xpu_place() or tracer._expected_place.is_xpu_place() or
tracer._expected_place.is_mlu_place() or
tracer._expected_place.is_npu_place()): tracer._expected_place.is_npu_place()):
warnings.warn( warnings.warn(
'AmpScaler can only be enabled on CUDAPlace, XPUPlace and NPUPlace, current place is %s, so it makes no effect.' 'AmpScaler can only be enabled on CUDAPlace, XPUPlace, MLUPlace and NPUPlace, current place is %s, so it makes no effect.'
% tracer._expected_place) % tracer._expected_place)
enable = False enable = False
......
...@@ -6299,7 +6299,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): ...@@ -6299,7 +6299,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None):
if dim_size == -1: if dim_size == -1:
assert unk_dim_idx == -1, ( assert unk_dim_idx == -1, (
"Only one dimension value of 'shape' in reshape can " "Only one dimension value of 'shape' in reshape can "
"be -1. But received shape[%d] is also -1." % dim_idx) "be -1. But received shape[%d] is also -1.\n"
"\n\t# N = x.shape()[2]\t\t# N is an int. "
"(NOT recommend under @to_static)\n\tN = paddle.shape(x)[2]\t\t"
"# N is a Tensor. (Recommend)\n\tz = paddle.reshape([N, -1, 4])"
"\t# z.shape is [-1, -1, 4]\n\n"
" If your target shape in Reshape represents dynamic shape, "
"please turn it into a Tensor under @to_static. See above example for details."
% dim_idx)
unk_dim_idx = dim_idx unk_dim_idx = dim_idx
elif dim_size == 0: elif dim_size == 0:
assert dim_idx < len(x.shape), ( assert dim_idx < len(x.shape), (
......
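The guidance embedded in this longer error message can be written out as a small sketch (assuming `x` is a 4-D tensor and that the function runs under @paddle.jit.to_static, as the message recommends):

import paddle

@paddle.jit.to_static
def dynamic_reshape(x):
    # N = x.shape[2]           # N is a Python int (NOT recommended under @to_static)
    N = paddle.shape(x)[2]     # N is a Tensor (recommended)
    z = paddle.reshape(x, [N, -1, 4])   # static shape is inferred as [-1, -1, 4]
    return z

out = dynamic_reshape(paddle.rand([2, 3, 8, 4]))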
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2022 NVIDIA Corporation. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.contrib import sparsity
from paddle.fluid.contrib.sparsity.supported_layer_list import supported_layers_and_prune_func_map
from paddle.fluid.dygraph.layers import Layer, _convert_camel_to_snake
class MyOwnLayer(Layer):
def __init__(self):
super(MyOwnLayer, self).__init__()
def forward(self, x):
return x
static_tensor = None
static_tensor_mask = None
def my_own_pruning(tensor, m, n, mask_algo, param_name):
global static_tensor
global static_tensor_mask
if static_tensor is None:
static_tensor = np.random.rand(*tensor.shape).astype(np.float32)
if static_tensor_mask is None:
static_tensor_mask = np.random.rand(*tensor.shape).astype(np.float32)
return static_tensor, static_tensor_mask
class TestASPAddSupportedLayer(unittest.TestCase):
def test_add_supported_layer_via_name(self):
sparsity.add_supported_layer("test_supported_1")
sparsity.add_supported_layer("test_supported_2", my_own_pruning)
sparsity.add_supported_layer(MyOwnLayer)
my_own_layer_name = _convert_camel_to_snake(MyOwnLayer.__name__)
self.assertTrue(
"test_supported_1" in supported_layers_and_prune_func_map)
self.assertTrue(
"test_supported_2" in supported_layers_and_prune_func_map)
self.assertTrue(
"test_supported_2" in supported_layers_and_prune_func_map)
self.assertTrue(supported_layers_and_prune_func_map["test_supported_2"]
== my_own_pruning)
self.assertTrue(
my_own_layer_name in supported_layers_and_prune_func_map)
class TestASPStaticCustomerizedPruneFunc(unittest.TestCase):
def setUp(self):
paddle.enable_static()
self.main_program = fluid.Program()
self.startup_program = fluid.Program()
self.customer_prefix = "customer_layer"
def build_model():
img = fluid.data(
name='img', shape=[None, 3, 32, 32], dtype='float32')
label = fluid.data(name='label', shape=[None, 1], dtype='int64')
hidden = fluid.layers.conv2d(
input=img, num_filters=4, filter_size=3, padding=2, act="relu")
hidden = fluid.layers.fc(input=hidden,
size=32,
act='relu',
name=self.customer_prefix)
hidden = fluid.layers.fc(input=hidden,
size=32,
act='relu',
name=self.customer_prefix)
hidden = fluid.layers.fc(input=hidden, size=32, act='relu')
prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
return img, label, prediction
with fluid.program_guard(self.main_program, self.startup_program):
self.img, self.label, self.predict = build_model()
self.supported_layer_count_ref = 5
self.place = paddle.CPUPlace()
if core.is_compiled_with_cuda():
self.place = paddle.CUDAPlace(0)
self.exe = fluid.Executor(self.place)
sparsity.add_supported_layer(self.customer_prefix, my_own_pruning)
def test_inference_pruning(self):
self.exe.run(self.startup_program)
sparsity.prune_model(
self.main_program, mask_algo="mask_1d", with_mask=False)
supported_layer_count = 0
for param in self.main_program.global_block().all_parameters():
mat = np.array(fluid.global_scope().find_var(param.name).get_tensor(
))
if sparsity.asp.ASPHelper._is_supported_layer(self.main_program,
param.name):
supported_layer_count += 1
if (self.customer_prefix in param.name):
self.assertLessEqual(
np.sum(mat.flatten() - static_tensor.flatten()), 1e-4)
else:
self.assertTrue(
sparsity.check_sparsity(
mat.T,
func_name=sparsity.CheckMethod.CHECK_1D,
n=2,
m=4))
self.assertEqual(supported_layer_count, self.supported_layer_count_ref)
def test_training_pruning(self):
with fluid.program_guard(self.main_program, self.startup_program):
loss = fluid.layers.mean(
fluid.layers.cross_entropy(
input=self.predict, label=self.label))
optimizer = sparsity.decorate(
fluid.optimizer.SGD(learning_rate=0.01))
optimizer.minimize(loss, self.startup_program)
self.exe.run(self.startup_program)
sparsity.prune_model(
self.main_program, mask_algo="mask_1d", with_mask=True)
supported_layer_count = 0
for param in self.main_program.global_block().all_parameters():
mat = np.array(fluid.global_scope().find_var(param.name).get_tensor(
))
if sparsity.asp.ASPHelper._is_supported_layer(self.main_program,
param.name):
mat_mask = np.array(fluid.global_scope().find_var(
sparsity.asp.ASPHelper._get_mask_name(param.name))
.get_tensor())
supported_layer_count += 1
if (self.customer_prefix in param.name):
self.assertLessEqual(
np.sum(mat.flatten() - static_tensor.flatten()), 1e-4)
self.assertLessEqual(
np.sum(mat_mask.flatten() - static_tensor_mask.flatten(
)), 1e-4)
else:
self.assertTrue(
sparsity.check_sparsity(
mat.T,
func_name=sparsity.CheckMethod.CHECK_1D,
n=2,
m=4))
self.assertTrue(
sparsity.check_sparsity(
mat_mask.T,
func_name=sparsity.CheckMethod.CHECK_1D,
n=2,
m=4))
self.assertEqual(supported_layer_count, self.supported_layer_count_ref)
if __name__ == '__main__':
unittest.main()
...@@ -9,6 +9,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU) ...@@ -9,6 +9,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
set_tests_properties(test_relaunch_with_gpt_planner PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 240) set_tests_properties(test_relaunch_with_gpt_planner PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 240)
py_test_modules(test_engine_api MODULES test_engine_api ENVS ${dist_ENVS}) py_test_modules(test_engine_api MODULES test_engine_api ENVS ${dist_ENVS})
set_tests_properties(test_engine_api PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 80) set_tests_properties(test_engine_api PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 80)
py_test_modules(test_while_op_completion MODULES test_while_op_completion ENVS ${dist_ENVS})
py_test_modules(test_converter MODULES test_converter ENVS ${dist_ENVS}) py_test_modules(test_converter MODULES test_converter ENVS ${dist_ENVS})
set_tests_properties(test_converter PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) set_tests_properties(test_converter PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50)
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle
import numpy as np
import paddle.nn as nn
import paddle.utils as utils
import paddle.static as static
import paddle.nn.functional as F
import paddle.distributed.auto_parallel as auto
from paddle.distributed import fleet
from paddle.distributed.auto_parallel.completion import Completer
from paddle.distributed.auto_parallel.partitioner import Partitioner
from paddle.distributed.auto_parallel.utils import make_data_unshard
from paddle.distributed.auto_parallel.dist_attribute import OperatorDistributedAttribute, TensorDistributedAttribute
from paddle.distributed.auto_parallel.dist_context import DistributedContext, get_default_distributed_context
from paddle.distributed.auto_parallel.operators import find_best_compatible_distributed_operator_impl
from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr
paddle.enable_static()
batch_size = 4
epoch_num = 10
hidden_size = 1024
sequence_len = 512
_g_process_mesh = [[0, 1], [2, 3]]
def get_random_inputs_and_labels(input_shape, label_shape):
input = np.random.random(size=input_shape).astype('float32')
label = np.random.random(size=label_shape).astype('float32')
return input, label
def batch_generator_creator():
def __reader__():
for _ in range(batch_size):
batch_input, batch_label = get_random_inputs_and_labels(
[batch_size, sequence_len, hidden_size],
[batch_size, sequence_len, 1])
yield batch_input, batch_label
return __reader__
class MLPLayer(nn.Layer):
def __init__(self,
hidden_size=1024,
intermediate_size=4 * 1024,
dropout_ratio=0.1,
initializer_range=0.02):
super(MLPLayer, self).__init__()
d_model = hidden_size
dim_feedforward = intermediate_size
param_initializer = nn.initializer.Normal(
mean=0.0, std=initializer_range)
self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
self.linear0 = nn.Linear(
d_model,
dim_feedforward,
weight_attr=paddle.ParamAttr(initializer=param_initializer),
bias_attr=None)
self.linear1 = nn.Linear(
dim_feedforward,
d_model,
weight_attr=paddle.ParamAttr(initializer=param_initializer),
bias_attr=None)
def forward(self, input):
out = self.norm(input)
auto.shard_tensor(
self.linear0.weight,
dist_attr={
"process_mesh": _g_process_mesh[0],
"dims_mapping": [-1, 0]
})
out = self.linear0(out)
out = F.gelu(out, approximate=True)
auto.shard_tensor(
self.linear1.weight,
dist_attr={
"process_mesh": _g_process_mesh[1],
"dims_mapping": [0, -1]
})
out = self.linear1(out)
return out
def loop_cond(i, loop_len, input_array):
return i < loop_len
def loop_body(i, loop_len, input_array):
pre_input = paddle.tensor.array_read(array=input_array, i=i)
mlp_while0 = MLPLayer(
hidden_size=hidden_size,
intermediate_size=4 * hidden_size,
dropout_ratio=0.1,
initializer_range=0.02)
mlp_while1 = MLPLayer(
hidden_size=hidden_size,
intermediate_size=4 * hidden_size,
dropout_ratio=0.1,
initializer_range=0.02)
output = mlp_while0(pre_input)
cur_pred = mlp_while1(output)
# update the loop condition
i = paddle.increment(x=i, value=1)
paddle.tensor.array_write(cur_pred, array=input_array, i=i)
return i, loop_len, input_array
def get_program():
dist_strategy = fleet.DistributedStrategy()
dist_strategy.semi_auto = True
# fleet.init(is_collective=True, strategy=dist_strategy)
train_program = static.Program()
start_program = static.Program()
with static.program_guard(train_program, start_program):
# loop counter
i = paddle.full(shape=[1], fill_value=0, dtype='int64')
# number of loop iterations
loop_len = paddle.full(shape=[1], fill_value=epoch_num, dtype='int64')
# input
input = static.data(
name="input",
shape=[batch_size, sequence_len, hidden_size],
dtype='float32')
label = static.data(
name="label", shape=[batch_size, sequence_len, 1], dtype='float32')
data_holder = [input, label]
# dataloader
dataloader = paddle.io.DataLoader.from_generator(
feed_list=data_holder, capacity=4 * batch_size, iterable=False)
dataloader.set_batch_generator(
batch_generator_creator(), places=paddle.static.cuda_places())
# data dist_attr
auto.shard_tensor(
input,
dist_attr={
"process_mesh": _g_process_mesh[0],
"dims_mapping": [-1, -1, -1]
})
auto.shard_tensor(
label,
dist_attr={
"process_mesh": _g_process_mesh[0],
"dims_mapping": [-1, -1, -1]
})
mlp_start = MLPLayer(
hidden_size=hidden_size,
intermediate_size=4 * hidden_size,
dropout_ratio=0.1,
initializer_range=0.02)
pred = mlp_start(input)
input_array = paddle.tensor.array_write(pred, i)
i, loop_len, input_array = static.nn.while_loop(
cond=loop_cond,
body=loop_body,
loop_vars=[i, loop_len, input_array])
end_pred = paddle.tensor.array_read(array=input_array, i=i)
mlp_end = MLPLayer(
hidden_size=hidden_size,
intermediate_size=4 * hidden_size,
dropout_ratio=0.1,
initializer_range=0.02)
pred = mlp_end(end_pred)
error_cost = paddle.nn.functional.square_error_cost(pred, label)
loss = paddle.mean(error_cost)
return train_program, start_program, dataloader, i, loss
class TestMLP(unittest.TestCase):
def test_completer(self):
train_program, start_program, dataloader, i, loss = get_program()
dist_context = DistributedContext()
completer = Completer(dist_context)
complete_train_program = completer.complete_forward_annotation(
train_program)
# print_program_with_dist_attr(complete_train_program, dist_context)
if __name__ == "__main__":
unittest.main()
...@@ -123,17 +123,26 @@ class XPUOpTest(OpTest): ...@@ -123,17 +123,26 @@ class XPUOpTest(OpTest):
return super().check_grad_with_place( return super().check_grad_with_place(
place, inputs_to_check, output_names, no_grad_set, place, inputs_to_check, output_names, no_grad_set,
numeric_grad_delta, in_place, max_relative_error, numeric_grad_delta, in_place, max_relative_error,
user_defined_grads, user_defined_grads, check_dygraph) user_defined_grads, user_defined_grad_outputs, check_dygraph)
a1 = self.get_grad_with_place( a1 = self.get_grad_with_place(
place, inputs_to_check, output_names, no_grad_set=no_grad_set) place,
inputs_to_check,
output_names,
no_grad_set=no_grad_set,
user_defined_grad_outputs=user_defined_grad_outputs)
a2 = self.get_grad_with_place( a2 = self.get_grad_with_place(
place, inputs_to_check, output_names, no_grad_set=no_grad_set) place,
inputs_to_check,
output_names,
no_grad_set=no_grad_set,
user_defined_grad_outputs=user_defined_grad_outputs)
a3 = self.get_grad_with_place( a3 = self.get_grad_with_place(
paddle.CPUPlace(), paddle.CPUPlace(),
inputs_to_check, inputs_to_check,
output_names, output_names,
no_grad_set=no_grad_set) no_grad_set=no_grad_set,
user_defined_grad_outputs=user_defined_grad_outputs)
self._assert_is_close(a1, a2, inputs_to_check, 0.00000001, self._assert_is_close(a1, a2, inputs_to_check, 0.00000001,
"Gradient Check On two xpu") "Gradient Check On two xpu")
self._assert_is_close(a1, a3, inputs_to_check, max_relative_error, self._assert_is_close(a1, a3, inputs_to_check, max_relative_error,
...@@ -147,7 +156,7 @@ class XPUOpTest(OpTest): ...@@ -147,7 +156,7 @@ class XPUOpTest(OpTest):
numeric_grad_delta=0.005, numeric_grad_delta=0.005,
in_place=False, in_place=False,
max_relative_error=0.005, max_relative_error=0.005,
user_defined_grads=None, user_defined_grad_outputs=None,
check_dygraph=True): check_dygraph=True):
self.scope = core.Scope() self.scope = core.Scope()
op_inputs = self.inputs if hasattr(self, "inputs") else dict() op_inputs = self.inputs if hasattr(self, "inputs") else dict()
...@@ -197,6 +206,10 @@ class XPUOpTest(OpTest): ...@@ -197,6 +206,10 @@ class XPUOpTest(OpTest):
if not type(output_names) is list: if not type(output_names) is list:
output_names = [output_names] output_names = [output_names]
analytic_grads = self._get_gradient(inputs_to_check, place, analytic_grads = self._get_gradient(
output_names, no_grad_set) inputs_to_check,
place,
output_names,
no_grad_set,
user_defined_grad_outputs=user_defined_grad_outputs)
return analytic_grads return analytic_grads
...@@ -213,9 +213,9 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer): ...@@ -213,9 +213,9 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer):
set(parameters), set(parameters),
set([ set([
'fc_2.b_0', 'num_good_steps_0', 'fc_2.w_0', 'loss_scaling_0', 'fc_2.b_0', 'num_good_steps_0', 'fc_2.w_0', 'loss_scaling_0',
'num_bad_steps_0', 'fc_2.w_0_velocity_0', 'fc_2.w_0_asp_mask', 'num_bad_steps_0', 'fc_2.w_0_velocity_0', 'fc_2.w_0.asp_mask',
'learning_rate_0', 'fc_1.b_0', 'fc_1.w_0_asp_mask', 'learning_rate_0', 'fc_1.b_0', 'fc_1.w_0.asp_mask',
'fc_0.w_0_asp_mask', 'fc_1.b_0_velocity_0', 'fc_0.w_0.asp_mask', 'fc_1.b_0_velocity_0',
'fc_2.b_0_velocity_0' 'fc_2.b_0_velocity_0'
])) ]))
self.assertEqual(ops, [ self.assertEqual(ops, [
......
...@@ -333,6 +333,7 @@ class TestVariable(unittest.TestCase): ...@@ -333,6 +333,7 @@ class TestVariable(unittest.TestCase):
with self.assertRaises(IndexError): with self.assertRaises(IndexError):
res = x[[True, False, False]] res = x[[True, False, False]]
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
with paddle.static.program_guard(prog):
res = x[[False, False]] res = x[[False, False]]
def test_slice(self): def test_slice(self):
......
...@@ -59,16 +59,14 @@ class SGD(Optimizer): ...@@ -59,16 +59,14 @@ class SGD(Optimizer):
.. code-block:: python .. code-block:: python
import paddle import paddle
import numpy as np
inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") inp = paddle.uniform(min=-0.1, max=0.1, shape=[10, 10], dtype='float32')
linear = paddle.nn.Linear(10, 10) linear = paddle.nn.Linear(10, 10)
inp = paddle.to_tensor(inp) inp = paddle.to_tensor(inp)
out = linear(inp) out = linear(inp)
loss = paddle.mean(out) loss = paddle.mean(out)
beta1 = paddle.to_tensor([0.9], dtype="float32")
beta2 = paddle.to_tensor([0.99], dtype="float32")
sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), weight_decay=0.01) sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), weight_decay=0.01)
back = out.backward() out.backward()
sgd.step() sgd.step()
sgd.clear_grad() sgd.clear_grad()
......