Commit 184fa04c authored by: W wangruting

fix_conflict

......@@ -10,6 +10,7 @@ English | [简体中文](./README_cn.md)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://paddlepaddle.org.cn/documentation/docs/zh/guides/index_cn.html)
[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
[![Twitter](https://img.shields.io/badge/Twitter-1ca0f1.svg?logo=twitter&logoColor=white)](https://twitter.com/PaddlePaddle_)
Welcome to the PaddlePaddle GitHub.
......
......@@ -23,7 +23,7 @@ set(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
# in case of low internet speed
#set(WARPCTC_REPOSITORY https://gitee.com/tianjianhe/warp-ctc.git)
set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git)
set(WARPCTC_TAG 37ece0e1bbe8a0019a63ac7e6462c36591c66a5b)
set(WARPCTC_TAG bdc2b4550453e0ef2d3b5190f9c6103a84eff184)
set(WARPCTC_INCLUDE_DIR
"${WARPCTC_INSTALL_DIR}/include"
......
......@@ -26,7 +26,7 @@ function(find_register FILENAME PATTERN OUTPUT)
PARENT_SCOPE)
endfunction()
function(find_phi_register FILENAME ADD_PATH)
function(find_phi_register FILENAME ADD_PATH PATTERN)
# set op_name to OUTPUT
set(options "")
set(oneValueArgs "")
......@@ -36,11 +36,11 @@ function(find_phi_register FILENAME ADD_PATH)
string(
REGEX
MATCH
"PD_REGISTER_KERNEL\\([ \t\r\n]*[a-z0-9_]*,[[ \\\t\r\n\/]*[a-z0-9_]*]?[ \\\t\r\n]*[a-zA-Z]*,[ \\\t\r\n]*[A-Z_]*"
"${PATTERN}\\([ \t\r\n]*[a-z0-9_]*,[[ \\\t\r\n\/]*[a-z0-9_]*]?[ \\\t\r\n]*[a-zA-Z]*,[ \\\t\r\n]*[A-Z_]*"
register
"${CONTENT}")
if(NOT register STREQUAL "")
string(REPLACE "PD_REGISTER_KERNEL(" "" register "${register}")
string(REPLACE "${PATTERN}(" "" register "${register}")
string(REPLACE "," ";" register "${register}")
string(REGEX REPLACE "[ \\\t\r\n]+" "" register "${register}")
string(REGEX REPLACE "//cuda_only" "" register "${register}")
......@@ -401,7 +401,8 @@ function(op_library TARGET)
# pybind USE_OP_ITSELF
set(op_name "")
# Add PHI Kernel Registry Message
find_phi_register(${cc_src} ${pybind_file})
find_phi_register(${cc_src} ${pybind_file} "PD_REGISTER_KERNEL")
find_phi_register(${cc_src} ${pybind_file} "PD_REGISTER_GENERAL_KERNEL")
find_register(${cc_src} "REGISTER_OPERATOR" op_name)
if(NOT ${op_name} EQUAL "")
file(APPEND ${pybind_file} "USE_OP_ITSELF(${op_name});\n")
......@@ -440,7 +441,8 @@ function(op_library TARGET)
foreach(cu_src ${cu_srcs})
set(op_name "")
# Add PHI Kernel Registry Message
find_phi_register(${cu_src} ${pybind_file})
find_phi_register(${cu_src} ${pybind_file} "PD_REGISTER_KERNEL")
find_phi_register(${cu_src} ${pybind_file} "PD_REGISTER_GENERAL_KERNEL")
find_register(${cu_src} "REGISTER_OP_CUDA_KERNEL" op_name)
if(NOT ${op_name} EQUAL "")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, CUDA);\n")
......
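For context, here is a minimal standalone C++ sketch (not part of this commit) of the kind of matching `find_phi_register` performs now that it takes a `PATTERN` argument: it scans a source file for `PD_REGISTER_KERNEL` / `PD_REGISTER_GENERAL_KERNEL` registrations and extracts the kernel name. The regular expression below is a simplification of the one in operators.cmake, and the registration string is just an example.

```cpp
// Illustrative sketch only: a simplified std::regex version of the
// registration-scanning pattern used by find_phi_register above.
#include <iostream>
#include <regex>
#include <string>

int main() {
  const std::string content =
      "PD_REGISTER_GENERAL_KERNEL(feed_dense_tensor, CPU, ALL_LAYOUT, ...)";
  // Group 1: the macro name; group 2: the first argument (the kernel name).
  const std::regex re(R"((PD_REGISTER(?:_GENERAL)?_KERNEL)\(\s*([a-z0-9_]+)\s*,)");
  std::smatch m;
  if (std::regex_search(content, m, re)) {
    std::cout << "macro:  " << m[1] << "\n";   // PD_REGISTER_GENERAL_KERNEL
    std::cout << "kernel: " << m[2] << "\n";   // feed_dense_tensor
  }
  return 0;
}
```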
......@@ -446,7 +446,8 @@ endif()
if(WITH_DISTRIBUTE
AND NOT WITH_PSLIB
AND NOT WITH_PSCORE)
AND NOT WITH_PSCORE
AND NOT WITH_RPC)
include(external/snappy)
list(APPEND third_party_deps extern_snappy)
......
......@@ -16,6 +16,7 @@
#include "paddle/fluid/distributed/collective/bkcl_tools.h"
#include "paddle/fluid/distributed/collective/common.h"
#include "paddle/fluid/distributed/collective/utils.h"
#include "paddle/fluid/platform/device/xpu/bkcl_helper.h"
#include "paddle/fluid/platform/device/xpu/xpu_info.h"
#include "paddle/phi/core/device_context.h"
......@@ -87,6 +88,73 @@ void ProcessGroupBKCL::GroupEnd() {
PADDLE_ENFORCE_XPU_SUCCESS(bkcl_group_end());
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Recv(
phi::DenseTensor* tensor,
int src_rank,
int64_t offset,
int64_t numel,
bool sync_op,
bool use_calc_stream) {
// numel > 0 indicates that the tensor needs to be sliced
phi::DenseTensor partial_tensor;
if (numel > 0) {
partial_tensor = GetPartialTensor(*tensor, offset, numel);
tensor = &partial_tensor;
}
return Collective(
tensor,
// have to pass a tensor here
// TODO(zhangxiaoci) catch up with nccl's api
*tensor,
[&](phi::DenseTensor* output,
const phi::DenseTensor& input,
BKCLContext_t comm,
const XPUStream& stream) {
return bkcl_recv(comm,
output->data(),
output->numel(),
src_rank,
platform::ToBKCLDataType(
framework::TransToProtoVarType(output->type())),
stream);
},
CommType::RECV,
sync_op,
use_calc_stream);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Send(
const phi::DenseTensor& tensor,
int dst_rank,
int64_t offset,
int64_t numel,
bool sync_op,
bool use_calc_stream) {
// numel > 0 indicates that the tensor needs to be sliced
const phi::DenseTensor& tensor_maybe_partial =
numel > 0 ? GetPartialTensor(tensor, offset, numel) : tensor;
return Collective(
nullptr,
tensor_maybe_partial,
[&](phi::DenseTensor* output,
const phi::DenseTensor& input,
BKCLContext_t comm,
const XPUStream& stream) {
return bkcl_send(comm,
input.data(),
input.numel(),
dst_rank,
platform::ToBKCLDataType(
framework::TransToProtoVarType(input.type())),
stream);
},
CommType::SEND,
sync_op,
use_calc_stream);
}
std::shared_ptr<ProcessGroupBKCL::BKCLTask> ProcessGroupBKCL::CreateTask(
const Place& place,
int rank,
......@@ -136,6 +204,8 @@ void ProcessGroupBKCL::CreateBKCLEnvCache(const Place& place,
BKCLContext_t bkcl_comm;
BKCLCHECK(bkcl_init_rank(&bkcl_comm, GetRank(), GetSize(), &bkcl_id));
comm_ctx->SetBkclContext(bkcl_comm);
// comm context creates a separate XPU stream for communication
comm_ctx->CreateStream();
place_to_calc_ctx_[place_key] = calc_ctx;
place_to_comm_ctx_[place_key] = std::move(comm_ctx);
......
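As a side note, the new Send/Recv paths share an offset/numel convention: `numel > 0` means "operate on the slice `[offset, offset + numel)`", otherwise the whole tensor is used. The following standalone sketch (not from the commit; `std::vector` stands in for `phi::DenseTensor` and `GetPartialTensor`) illustrates that convention.

```cpp
// Standalone illustration of the "numel > 0 means sliced" convention.
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

struct Span {
  const float* data;
  int64_t numel;
};

Span MaybePartial(const std::vector<float>& buf, int64_t offset, int64_t numel) {
  if (numel > 0) {  // slice [offset, offset + numel)
    assert(offset >= 0 && offset + numel <= static_cast<int64_t>(buf.size()));
    return {buf.data() + offset, numel};
  }
  return {buf.data(), static_cast<int64_t>(buf.size())};  // whole tensor
}

int main() {
  std::vector<float> tensor(8, 1.0f);
  std::cout << MaybePartial(tensor, 0, 0).numel << "\n";  // 8 (whole tensor)
  std::cout << MaybePartial(tensor, 2, 4).numel << "\n";  // 4 (slice)
  return 0;
}
```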
......@@ -87,25 +87,25 @@ class ProcessGroupBKCL : public ProcessGroupWithStream {
phi::DeviceContext* GetDeviceContext(const Place& place,
bool use_calc_stream) const override;
std::shared_ptr<ProcessGroup::Task> AllReduce(
std::shared_ptr<ProcessGroup::Task> AllGather(
phi::DenseTensor* out_tensor,
const phi::DenseTensor& in_tensor,
const AllreduceOptions& opts,
int64_t offset, // for compatibility, no use now
int64_t numel, // for compatibility, no use now
bool sync_op,
bool use_calc_stream) override;
std::shared_ptr<ProcessGroup::Task> Broadcast(
std::shared_ptr<ProcessGroup::Task> AllReduce(
phi::DenseTensor* out_tensor,
const phi::DenseTensor& in_tensor,
const BroadcastOptions& opts,
const AllreduceOptions& opts,
bool sync_op,
bool use_calc_stream) override;
std::shared_ptr<ProcessGroup::Task> AllGather(
std::shared_ptr<ProcessGroup::Task> Broadcast(
phi::DenseTensor* out_tensor,
const phi::DenseTensor& in_tensor,
int64_t offset, // for compatibility, no use now
int64_t numel, // for compatibility, no use now
const BroadcastOptions& opts,
bool sync_op,
bool use_calc_stream) override;
......@@ -115,6 +115,20 @@ class ProcessGroupBKCL : public ProcessGroupWithStream {
bool sync_op,
bool use_calc_stream) override;
std::shared_ptr<ProcessGroup::Task> Recv(phi::DenseTensor* tensor,
int src_rank,
int64_t offset,
int64_t numel,
bool sync_op,
bool use_calc_stream) override;
std::shared_ptr<ProcessGroup::Task> Send(const phi::DenseTensor& tensor,
int dst_rank,
int64_t offset,
int64_t numel,
bool sync_op,
bool use_calc_stream) override;
std::shared_ptr<ProcessGroup::Task> Barrier(
const BarrierOptions& = BarrierOptions()) override;
......
......@@ -33,7 +33,7 @@ void AmplifierInterceptor::RunOps() {
// run_per_steps_, run_at_offset_
// 4, 0 --> run at step 0, 4, 8, 12
// 4, 3 --> run at step 3, 7, 11, 15
if ((step_ % run_per_steps_) == run_at_offset_) {
if ((cur_scope_id_ % run_per_steps_) == run_at_offset_) {
ComputeInterceptor::RunOps();
}
}
......@@ -41,7 +41,7 @@ void AmplifierInterceptor::RunOps() {
void AmplifierInterceptor::SendDataReadyToDownStream() {
// runs multiple times but sends data_is_ready to downstream only once;
// that is, multiple inputs produce a single output
if (step_ % send_down_per_steps_ == 0) {
if (cur_scope_id_ % send_down_per_steps_ == 0) {
ComputeInterceptor::SendDataReadyToDownStream();
}
}
......@@ -49,7 +49,7 @@ void AmplifierInterceptor::SendDataReadyToDownStream() {
void AmplifierInterceptor::ReplyCompletedToUpStream() {
// runs multiple times but replies to upstream only once; that is,
// one input, multiple outputs
if (step_ % reply_up_per_steps_ == 0) {
if (cur_scope_id_ % reply_up_per_steps_ == 0) {
ComputeInterceptor::ReplyCompletedToUpStream();
}
}
......
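The scheduling predicate above is now keyed on `cur_scope_id_` instead of a running step counter. A standalone sketch (not part of the commit) of that predicate: with `run_per_steps = 4`, offset 0 fires at scope ids 0, 4, 8, 12 and offset 3 fires at 3, 7, 11, 15, matching the comment in `AmplifierInterceptor::RunOps`.

```cpp
// Standalone restatement of the amplifier scheduling predicate.
#include <cstdint>
#include <iostream>

bool ShouldRun(int64_t cur_scope_id, int64_t run_per_steps, int64_t run_at_offset) {
  return (cur_scope_id % run_per_steps) == run_at_offset;
}

int main() {
  for (int64_t scope = 0; scope < 16; ++scope) {
    if (ShouldRun(scope, /*run_per_steps=*/4, /*run_at_offset=*/3)) {
      std::cout << "run at scope " << scope << "\n";  // prints 3, 7, 11, 15
    }
  }
  return 0;
}
```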
......@@ -21,7 +21,7 @@
namespace paddle {
namespace distributed {
class AmplifierInterceptor : public ComputeInterceptor {
class AmplifierInterceptor final : public ComputeInterceptor {
public:
AmplifierInterceptor(int64_t interceptor_id, TaskNode* node);
......
......@@ -71,6 +71,9 @@ void Carrier::Init(
microbatch_scopes_[i] = &minibatch_scope_->NewScope();
CopyParameters(i, program, inference_root_scope_vars);
}
// Add source and sink interceptor id to rank
interceptor_id_to_rank_.emplace(SOURCE_ID, rank);
interceptor_id_to_rank_.emplace(SINK_ID, rank);
// TODO(fleet_exe dev): thread pool
thread_num_ = 1;
......@@ -159,16 +162,10 @@ void Carrier::Start() {
true,
platform::errors::PreconditionNotMet(
"Using carrier before initialized."));
for (int64_t id : source_interceptor_ids_) {
VLOG(3) << "Carrier Start is sending start to source interceptor " << id
<< ".";
InterceptorMessage start_msg;
// source node data_is_ready is send by carrier, so set src_id=-1
start_msg.set_src_id(-1);
start_msg.set_dst_id(id);
start_msg.set_message_type(DATA_IS_READY);
start_msg.set_dst_id(SOURCE_ID);
start_msg.set_message_type(START);
Send(start_msg);
}
// TODO(wangxi): async step
Wait();
dev_ctx_->Wait();
......@@ -270,6 +267,38 @@ void Carrier::CreateInterceptors() {
auto gc = GetGC(place_);
// create source and sink task node
auto max_run_times = microbatch_scopes_.size();
TaskNode* source = new TaskNode(
rank_, SOURCE_ID, max_run_times); // rank, task_id, max_run_times
TaskNode* sink = new TaskNode(rank_, SINK_ID, max_run_times);
// find nodes without upstreams or without downstreams
std::vector<TaskNode*> origin_sources, origin_sinks;
for (const auto& item : interceptor_id_to_node_) {
TaskNode* task_node = item.second;
if (task_node->upstream().empty()) {
origin_sources.emplace_back(task_node);
}
if (task_node->downstream().empty()) {
origin_sinks.emplace_back(task_node);
}
}
// link source node with origin source
for (const auto& node : origin_sources) {
source->AddDownstreamTask(node->task_id(),
std::numeric_limits<int64_t>::max());
node->AddUpstreamTask(SOURCE_ID, std::numeric_limits<int64_t>::max());
}
// link sink node with origin sink
for (const auto& node : origin_sinks) {
sink->AddUpstreamTask(node->task_id(), std::numeric_limits<int64_t>::max());
node->AddDownstreamTask(SINK_ID, std::numeric_limits<int64_t>::max());
}
// create source and sink interceptor
SetInterceptor(SOURCE_ID,
InterceptorFactory::Create("Source", SOURCE_ID, source));
SetInterceptor(SINK_ID, InterceptorFactory::Create("Sink", SINK_ID, sink));
// create each Interceptor
// no auto init since there is no config
for (const auto& item : interceptor_id_to_node_) {
......@@ -303,9 +332,15 @@ void Carrier::CreateInterceptors() {
VLOG(3) << "Create Interceptor with interceptor id: " << interceptor_id
<< " with type: " << task_node->type() << ".";
if (task_node->upstream().empty()) {
source_interceptor_ids_.emplace_back(interceptor_id);
}
PADDLE_ENFORCE_EQ(
task_node->upstream().empty(),
false,
platform::errors::PreconditionNotMet(
"There should not have normal nodes as source nodes"));
PADDLE_ENFORCE_EQ(task_node->downstream().empty(),
false,
platform::errors::PreconditionNotMet(
"There should not have normal nodes as sink nodes"));
}
}
......
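The wiring added in `Carrier::CreateInterceptors` can be summarized with a standalone toy graph, shown below (not from the commit): every node without an upstream is hung below a synthetic source and every node without a downstream is hung above a synthetic sink. The real code uses `TaskNode`, the `SOURCE_ID`/`SINK_ID` constants, and a buffer size of `std::numeric_limits<int64_t>::max()`; the ids here are hypothetical.

```cpp
// Toy task graph: node id -> upstream ids / downstream ids.
#include <cstdint>
#include <iostream>
#include <map>
#include <set>
#include <vector>

struct Graph {
  std::map<int64_t, std::set<int64_t>> up, down;
  void AddNode(int64_t id) { up.try_emplace(id); down.try_emplace(id); }
  void Link(int64_t from, int64_t to) { down[from].insert(to); up[to].insert(from); }
};

int main() {
  const int64_t kSourceId = 1000001, kSinkId = 1000002;  // hypothetical ids
  Graph g;
  for (int64_t id : {0, 1, 2}) g.AddNode(id);
  g.Link(0, 1);
  g.Link(1, 2);  // task graph: 0 -> 1 -> 2

  // Find nodes without upstreams / without downstreams (origin sources/sinks).
  std::vector<int64_t> origin_sources, origin_sinks;
  for (const auto& [id, ups] : g.up) {
    if (ups.empty()) origin_sources.push_back(id);
  }
  for (const auto& [id, downs] : g.down) {
    if (downs.empty()) origin_sinks.push_back(id);
  }

  // Hang them below a synthetic source and above a synthetic sink.
  g.AddNode(kSourceId);
  g.AddNode(kSinkId);
  for (int64_t id : origin_sources) g.Link(kSourceId, id);  // source -> 0
  for (int64_t id : origin_sinks) g.Link(id, kSinkId);      // 2 -> sink

  std::cout << "linked " << origin_sources.size() << " source(s) and "
            << origin_sinks.size() << " sink(s)\n";  // 1 source, 1 sink
  return 0;
}
```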
......@@ -100,8 +100,6 @@ class Carrier final {
std::unordered_map<int64_t, std::unique_ptr<Interceptor>>
interceptor_idx_to_interceptor_;
std::vector<int64_t> source_interceptor_ids_;
bool is_init_{false};
std::mutex running_mutex_;
......
......@@ -34,29 +34,10 @@ void ComputeInterceptor::PrepareDeps() {
for (auto up : upstream) {
in_readys_.emplace(up.first, std::make_pair(up.second, 0));
in_stops_.emplace(up.first, false);
}
for (auto down : downstream) {
out_buffs_.emplace(down.first, std::make_pair(down.second, 0));
}
// source compute node, should we add a new SourceInterceptor?
if (upstream.empty()) {
is_source_ = true;
PADDLE_ENFORCE_GT(node_->max_run_times(),
0,
platform::errors::InvalidArgument(
"Source ComputeInterceptor must run at least one "
"times, but now max_run_times=%ld",
node_->max_run_times()));
in_readys_.emplace(-1,
std::make_pair(std::numeric_limits<int64_t>::max(), 0));
}
// If there is no downstream or every downstream is in different rank,
// then this interceptor is the last one for current rank.
// This can be get during init, can be cached for later use.
is_last_ = downstream.empty();
}
void ComputeInterceptor::IncreaseReady(int64_t up_id) {
......@@ -66,12 +47,6 @@ void ComputeInterceptor::IncreaseReady(int64_t up_id) {
platform::errors::NotFound(
"Cannot find upstream=%lld in in_readys.", up_id));
// source node has no upstream, data_is_ready is send by carrier or others
if (is_source_ && up_id == -1) {
it->second.second += GetTaskNode()->max_run_times();
return;
}
auto max_ready_size = it->second.first;
auto ready_size = it->second.second;
ready_size += 1;
......@@ -152,7 +127,7 @@ void ComputeInterceptor::SendDataReadyToDownStream() {
ready_msg.set_message_type(DATA_IS_READY);
VLOG(3) << "ComputeInterceptor " << interceptor_id_
<< " Send data_is_ready msg to " << down_id
<< " for step: " << step_;
<< " in scope: " << cur_scope_id_;
Send(down_id, ready_msg);
}
}
......@@ -173,8 +148,7 @@ void ComputeInterceptor::ReplyCompletedToUpStream() {
VLOG(3) << "ComputeInterceptor " << interceptor_id_
<< " Reply data_is_useless msg to " << up_id
<< " for step: " << step_;
if (is_source_ && up_id == -1) return;
<< " in scope: " << cur_scope_id_;
InterceptorMessage reply_msg;
reply_msg.set_message_type(DATA_IS_USELESS);
......@@ -183,13 +157,17 @@ void ComputeInterceptor::ReplyCompletedToUpStream() {
}
void ComputeInterceptor::RunOps() {
VLOG(3) << "ComputeInterceptor " << interceptor_id_ << " running ops for the "
<< step_ + 1 << " time.";
for (auto op : node_->ops()) {
op->Run(*microbatch_scopes_[step_ % node_->max_run_times()], place_);
PADDLE_ENFORCE_LT(cur_scope_id_,
microbatch_scopes_.size(),
platform::errors::InvalidArgument(
"Step out of range. There are %ld "
"microbatch_scopes, but recevice scope index %ld",
microbatch_scopes_.size(),
cur_scope_id_));
op->Run(*microbatch_scopes_[cur_scope_id_], place_);
if (gc_) {
framework::DeleteUnusedTensors(
*microbatch_scopes_[step_ % node_->max_run_times()],
framework::DeleteUnusedTensors(*microbatch_scopes_[cur_scope_id_],
op,
node_->unused_vars(),
gc_.get());
......@@ -201,77 +179,28 @@ void ComputeInterceptor::Run() {
while (IsInputReady() && CanWriteOutput()) {
VLOG(3) << "id=" << GetInterceptorId() << " ComputeInterceptor running";
// get the ready scope id from queue
cur_scope_id_ = ready_queue_.front();
ready_queue_.pop();
RunOps();
++step_;
// send to downstream and increase buff used
SendDataReadyToDownStream();
// reply to upstream and decrease ready data
ReplyCompletedToUpStream();
// Try to stop Carrier
if (is_last_ && (step_ % node_->max_run_times() == 0)) {
VLOG(3) << "Interceptor " << GetInterceptorId()
<< " is stopping carrier.";
// FIXME(wangxi): with multi sink interceptor
StopCarrier();
}
}
}
void ComputeInterceptor::ReceivedStop(int64_t up_id) {
received_stop_ = true;
// source node has no upstream, stop is send by carrier or others
if (is_source_ && up_id == -1) return;
auto it = in_stops_.find(up_id);
PADDLE_ENFORCE_NE(it,
in_stops_.end(),
platform::errors::NotFound(
"Cannot find upstream=%lld in in_stops.", up_id));
PADDLE_ENFORCE_EQ(
it->second,
false,
platform::errors::AlreadyExists("Already received stop from %lld, stop "
"cannot be send more than once."));
it->second = true;
}
void ComputeInterceptor::TryStop() {
if (!received_stop_) return;
// can stop only when all upstream is stop and
// downstream complete
for (auto& in_stop : in_stops_) {
if (!in_stop.second) return;
}
for (auto& out_buff : out_buffs_) {
auto used_size = out_buff.second.second;
if (used_size != 0) return;
}
// send stop to downstream
for (auto& out : out_buffs_) {
auto down_id = out.first;
InterceptorMessage stop;
stop.set_message_type(STOP);
Send(down_id, stop);
}
stop_ = true;
}
void ComputeInterceptor::Compute(const InterceptorMessage& msg) {
if (msg.message_type() == DATA_IS_READY) {
IncreaseReady(msg.src_id());
ready_queue_.push(msg.scope_idx());
Run();
} else if (msg.message_type() == DATA_IS_USELESS) {
DecreaseBuff(msg.src_id());
Run();
} else if (msg.message_type() == STOP) {
ReceivedStop(msg.src_id());
}
TryStop();
}
REGISTER_INTERCEPTOR(Compute, ComputeInterceptor);
......
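A minimal standalone sketch (not from the commit) of the new scope-id queue: each `DATA_IS_READY` message carries a scope index that is pushed onto `ready_queue_`, and `Run()` pops one id per iteration and executes the ops bound to that scope, here replaced by a print. Message handling and the upstream/downstream bookkeeping are omitted.

```cpp
// Toy version of the ready-queue-driven compute loop.
#include <cstdint>
#include <iostream>
#include <queue>

class ToyComputeInterceptor {
 public:
  void OnDataIsReady(int64_t scope_idx) {
    ready_queue_.push(scope_idx);  // msg.scope_idx() in the real code
    Run();
  }

 private:
  void Run() {
    while (!ready_queue_.empty()) {
      cur_scope_id_ = ready_queue_.front();
      ready_queue_.pop();
      std::cout << "run ops in scope " << cur_scope_id_ << "\n";
    }
  }
  std::queue<int64_t> ready_queue_;
  int64_t cur_scope_id_ = 0;
};

int main() {
  ToyComputeInterceptor interceptor;
  interceptor.OnDataIsReady(0);
  interceptor.OnDataIsReady(1);
  interceptor.OnDataIsReady(2);
  return 0;
}
```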
......@@ -14,6 +14,7 @@
#pragma once
#include <queue>
#include <utility>
#include "paddle/fluid/distributed/fleet_executor/interceptor.h"
......@@ -30,7 +31,8 @@ class ComputeInterceptor : public Interceptor {
virtual void SendDataReadyToDownStream();
virtual void ReplyCompletedToUpStream();
int64_t step_{0};
std::queue<int64_t> ready_queue_;
int64_t cur_scope_id_;
private:
void PrepareDeps();
......@@ -43,19 +45,10 @@ class ComputeInterceptor : public Interceptor {
void Run();
void Compute(const InterceptorMessage& msg);
void ReceivedStop(int64_t up_id);
void TryStop();
bool is_source_{false};
bool is_last_{false};
// upstream_id-->(max_ready_size, ready_size)
std::map<int64_t, std::pair<int64_t, int64_t>> in_readys_{};
// downstream_id-->(max_buffer_size, used_size)
std::map<int64_t, std::pair<int64_t, int64_t>> out_buffs_{};
bool received_stop_{false};
std::map<int64_t, bool> in_stops_{};
};
} // namespace distributed
......
......@@ -93,7 +93,6 @@ class Interceptor {
TaskNode* node_;
// for stop
bool stop_{false};
void StopCarrier();
// for runtime
......@@ -114,9 +113,6 @@ class Interceptor {
std::mutex mutex_;
std::deque<InterceptorMessage> messages_;
int64_t already_run_times_{0};
int64_t used_slot_nums_{0};
};
class InterceptorFactory {
......
......@@ -25,7 +25,7 @@ namespace distributed {
* 1. record the num of micro-step
* 2. check whether to notify carrier the current step is finished
*/
class SinkInterceptor : public Interceptor {
class SinkInterceptor final : public Interceptor {
public:
SinkInterceptor(int64_t interceptor_id, TaskNode* node);
......
......@@ -25,7 +25,7 @@ namespace distributed {
* 1. receive `start` message from carrier
* 2. send num_of_steps `data_is_ready` message to downstream
*/
class SourceInterceptor : public Interceptor {
class SourceInterceptor final : public Interceptor {
public:
SourceInterceptor(int64_t interceptor_id, TaskNode* node);
......
......@@ -25,57 +25,42 @@ limitations under the License. */
namespace paddle {
namespace distributed {
class StartInterceptor : public Interceptor {
public:
StartInterceptor(int64_t interceptor_id, TaskNode* node)
: Interceptor(interceptor_id, node) {
RegisterMsgHandle([this](const InterceptorMessage& msg) { NOP(msg); });
}
void NOP(const InterceptorMessage& msg) {
if (msg.message_type() == STOP) {
stop_ = true;
InterceptorMessage stop;
stop.set_message_type(STOP);
Send(1, stop); // stop 1, compute
return;
}
std::cout << GetInterceptorId() << " recv msg from " << msg.src_id()
<< std::endl;
}
};
TEST(ComputeInterceptor, Compute) {
std::string carrier_id = "0";
Carrier* carrier =
GlobalMap<std::string, Carrier>::Create(carrier_id, carrier_id);
carrier->Init(0, {{0, 0}, {1, 0}, {2, 0}});
carrier->Init(0, {{SOURCE_ID, 0}, {0, 0}, {1, 0}, {SINK_ID, 0}});
MessageBus* msg_bus = GlobalVal<MessageBus>::Create();
msg_bus->Init(0, {{0, "127.0.0.0:0"}}, "");
// NOTE: don't delete, otherwise interceptor will use undefined node
TaskNode* node_a = new TaskNode(0, 0, 0, 3, 0); // role, rank, task_id
TaskNode* source =
new TaskNode(0, SOURCE_ID, 3); // rank, task_id, max_run_times
TaskNode* node_a = new TaskNode(0, 0, 0, 3, 0);
TaskNode* node_b = new TaskNode(0, 0, 1, 3, 0);
TaskNode* node_c = new TaskNode(0, 0, 2, 3, 0);
TaskNode* sink = new TaskNode(0, SINK_ID, 3);
// a->b->c
// source->a->b->sink
source->AddDownstreamTask(0);
node_a->AddUpstreamTask(SOURCE_ID);
node_a->AddDownstreamTask(1, 3);
node_b->AddUpstreamTask(0, 3);
node_b->AddDownstreamTask(2);
node_c->AddUpstreamTask(1);
node_b->AddDownstreamTask(SINK_ID);
sink->AddUpstreamTask(1);
Interceptor* a =
carrier->SetInterceptor(0, std::make_unique<StartInterceptor>(0, node_a));
carrier->SetInterceptor(
SOURCE_ID, InterceptorFactory::Create("Source", SOURCE_ID, source));
carrier->SetInterceptor(0, InterceptorFactory::Create("Compute", 0, node_a));
carrier->SetInterceptor(1, InterceptorFactory::Create("Compute", 1, node_b));
carrier->SetInterceptor(2, InterceptorFactory::Create("Compute", 2, node_c));
carrier->SetInterceptor(SINK_ID,
InterceptorFactory::Create("Sink", SINK_ID, sink));
// start
InterceptorMessage msg;
msg.set_message_type(DATA_IS_READY);
// test run three times
a->Send(1, msg);
a->Send(1, msg);
a->Send(1, msg);
msg.set_message_type(START);
msg.set_dst_id(SOURCE_ID);
carrier->EnqueueInterceptorMessage(msg);
carrier->Wait();
carrier->Release();
......
......@@ -33,7 +33,6 @@ class PingPongInterceptor : public Interceptor {
void PingPong(const InterceptorMessage& msg) {
if (msg.message_type() == STOP) {
stop_ = true;
return;
}
std::cout << GetInterceptorId() << " recv msg, count=" << count_
......
......@@ -36,7 +36,6 @@ class PingPongInterceptor : public Interceptor {
void PingPong(const InterceptorMessage& msg) {
if (msg.message_type() == STOP) {
stop_ = true;
StopCarrier();
return;
}
......
......@@ -22,6 +22,10 @@ if(WITH_ROCM)
target_link_libraries(eager_generator ${ROCM_HIPRTC_LIB})
endif()
if(WITH_CINN)
target_link_libraries(eager_generator ${PYTHON_LIBRARIES})
endif()
# Prepare file structure
message(
"Generate dygraph file structure at path: ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/generated"
......
......@@ -47,7 +47,7 @@ void SetFeedVariable(Scope* scope,
}
void SetFeedVariable(Scope* scope,
const Strings& input,
const std::vector<std::string>& input,
const std::string& var_name,
size_t index) {
// If var_name Variable is not found in GlobalScope, a new variable will
......@@ -59,7 +59,7 @@ void SetFeedVariable(Scope* scope,
feed_inputs.resize(index + 1);
}
// shared data with input tensor
feed_inputs[index] = input;
feed_inputs[index] = Strings(input);
}
FetchType& GetFetchVariable(const Scope& scope,
......
......@@ -35,7 +35,7 @@ void SetFeedVariable(Scope* scope,
size_t index);
void SetFeedVariable(Scope* scope,
const Strings& input,
const std::vector<std::string>& input,
const std::string& var_name,
size_t index);
......
......@@ -19,12 +19,14 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/string_array.h"
#include "paddle/phi/core/extended_tensor.h"
namespace paddle {
namespace framework {
using FeedType =
paddle::variant<phi::DenseTensor, Strings, phi::SparseCooTensor>;
using FeedList = std::vector<FeedType>;
using FeedList = paddle::framework::PhiVector<FeedType>;
using FetchType = paddle::variant<phi::DenseTensor,
LoDTensorArray,
......
......@@ -117,6 +117,15 @@ class InferShapeArgumentMappingContext : public phi::ArgumentMappingContext {
return var_type == proto::VarType::SPARSE_COO;
}
bool IsSparseCooTensorOutput(const std::string& name) const override {
auto var_types = ctx_.GetOutputsVarType(name);
return std::all_of(var_types.begin(),
var_types.end(),
[](const proto::VarType::Type& type) {
return type == proto::VarType::SPARSE_COO;
});
}
bool IsSparseCsrTensorInput(const std::string& name) const override {
auto var_type = ctx_.GetInputVarType(name);
return var_type == proto::VarType::SPARSE_CSR;
......
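The new `IsSparseCooTensorOutput` override follows a simple all-of rule: the output counts as sparse COO only when every variable bound to that name has the `SPARSE_COO` type. A standalone illustration (with a toy `VarType` enum in place of `proto::VarType::Type`):

```cpp
// Standalone illustration of the std::all_of check used above.
#include <algorithm>
#include <iostream>
#include <vector>

enum class VarType { LOD_TENSOR, SPARSE_COO, SPARSE_CSR };

bool IsSparseCooTensorOutput(const std::vector<VarType>& var_types) {
  return std::all_of(var_types.begin(), var_types.end(),
                     [](VarType t) { return t == VarType::SPARSE_COO; });
}

int main() {
  std::cout << std::boolalpha
            << IsSparseCooTensorOutput({VarType::SPARSE_COO}) << "\n"  // true
            << IsSparseCooTensorOutput(
                   {VarType::SPARSE_COO, VarType::LOD_TENSOR})
            << "\n";  // false
  return 0;
}
```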
......@@ -126,6 +126,45 @@ void InferShapeUtilsTestKernel(const Context& dev_ctx,
VLOG(6) << "Come into InferShapeUtilsTestKernel";
}
void TestOutputInferMeta(const phi::MetaTensor& x, phi::MetaTensor* out) {
ASSERT_EQ(x.dtype(), phi::DataType::FLOAT32);
}
class InferShapeUtilsTestOutputOpMaker : public OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "input of test op");
AddOutput("Out", "output of test op");
AddComment("This is test op");
}
};
class InferShapeUtilsTestOutputOp : public OperatorWithKernel {
public:
using OperatorWithKernel::OperatorWithKernel;
phi::KernelKey GetExpectedKernelType(
const ExecutionContext& ctx) const override {
return phi::KernelKey(proto::VarType::FP32, ctx.GetPlace());
}
};
phi::KernelSignature TestSparseOutputOpArgumentMapping(
const phi::ArgumentMappingContext& ctx) {
if (ctx.IsSparseCooTensorOutput("Out")) {
return phi::KernelSignature(
"test_sparse_coo_tensor_output", {"X"}, {}, {"Out"});
}
return phi::KernelSignature("test_output", {"X"}, {}, {"Out"});
}
template <typename T, typename Context>
void InferShapeUtilsTestOutputKernel(const Context& dev_ctx,
const phi::DenseTensor& x,
phi::SparseCooTensor* out) {
VLOG(6) << "Come into InferShapeUtilsTestOutputKernel";
}
} // namespace framework
} // namespace paddle
......@@ -143,6 +182,21 @@ PD_REGISTER_KERNEL(infer_shape_utils_test,
paddle::framework::InferShapeUtilsTestKernel,
int) {}
DECLARE_INFER_SHAPE_FUNCTOR(
infer_shape_utils_test_output,
InferShapeUtilsTestOutputInferShapeFunctor,
PD_INFER_META(paddle::framework::TestOutputInferMeta));
REGISTER_OPERATOR(infer_shape_utils_test_output,
paddle::framework::InferShapeUtilsTestOutputOp,
paddle::framework::InferShapeUtilsTestOutputOpMaker,
InferShapeUtilsTestOutputInferShapeFunctor);
PD_REGISTER_KERNEL(test_sparse_coo_tensor_output,
CPU,
ALL_LAYOUT,
paddle::framework::InferShapeUtilsTestOutputKernel,
int) {}
TEST(InferShapeUtilsTest, ALL) {
paddle::framework::ProgramDesc prog;
paddle::framework::proto::BlockDesc proto_block;
......@@ -200,3 +254,27 @@ TEST(InferShapeUtilsTest, ALL) {
op->InferShape(block_desc);
}
TEST(InferShapeUtilsTestOutput, ALL) {
paddle::framework::ProgramDesc prog;
paddle::framework::proto::BlockDesc proto_block;
paddle::framework::BlockDesc block_desc(&prog, &proto_block);
auto* op = block_desc.AppendOp();
op->SetType("infer_shape_utils_test_output");
auto* x = block_desc.Var("x");
x->SetType(paddle::framework::proto::VarType::LOD_TENSOR);
x->SetDataType(paddle::framework::proto::VarType::FP32);
op->SetInput("X", {"x"});
auto* out = block_desc.Var("out");
out->SetType(paddle::framework::proto::VarType::SPARSE_COO);
op->SetOutput("Out", {"out"});
phi::OpUtilsMap::Instance().InsertArgumentMappingFn(
"infer_shape_utils_test_output",
paddle::framework::TestSparseOutputOpArgumentMapping);
op->InferShape(block_desc);
}
......@@ -460,14 +460,6 @@ if(WITH_MKLDNN)
test_cpu_quantize_squash_pass
SRCS mkldnn/cpu_quantize_squash_pass_tester.cc
DEPS cpu_quantize_squash_pass naive_executor)
cc_test(
test_reshape_transpose_matmul_mkldnn_fuse_pass
SRCS mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc
DEPS reshape_transpose_matmul_mkldnn_fuse_pass)
cc_test(
test_matmul_transpose_reshape_fuse_pass
SRCS mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass_tester.cc
DEPS matmul_transpose_reshape_mkldnn_fuse_pass)
cc_test(
test_shuffle_channel_mkldnn_detect_pass
SRCS mkldnn/shuffle_channel_mkldnn_detect_pass_tester.cc
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include "paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.h"
namespace paddle {
namespace framework {
namespace ir {
void SetOp(ProgramDesc *prog,
const std::string &type,
const std::vector<std::string> &inputs,
const std::vector<std::string> &outputs) {
auto *op = prog->MutableBlock(0)->AppendOp();
op->SetType(type);
op->SetInput("X", {inputs[0]});
op->SetOutput("Out", {outputs[0]});
if (type == "transpose2") {
op->SetAttr("axis", std::vector<int>({0, 2, 1, 3}));
op->SetOutput("XShape", {outputs[1]});
}
if (type == "reshape2") {
op->SetAttr("shape", std::vector<int>({4, 5, 6}));
op->SetOutput("XShape", {outputs[1]});
}
if (type == "matmul") {
op->SetInput("Y", {inputs[1]});
op->SetAttr("use_mkldnn", true);
op->SetAttr("alpha", 1.0f);
op->SetAttr("transpose_X", true);
op->SetAttr("transpose_Y", true);
}
if (type == "matmul_v2") {
op->SetInput("Y", {inputs[1]});
op->SetAttr("use_mkldnn", true);
op->SetAttr("trans_x", true);
op->SetAttr("trans_y", true);
}
}
ProgramDesc BuildProgramDesc(const std::string &op_name) {
ProgramDesc prog;
for (auto &v : std::initializer_list<std::string>(
{"a1", "a2", "b", "c", "cx", "d", "dx", "e"})) {
auto *var = prog.MutableBlock(0)->Var(v);
var->SetType(proto::VarType::SELECTED_ROWS);
}
SetOp(&prog, op_name, {"a1", "a2"}, {"b"});
SetOp(&prog, "transpose2", {"b"}, {"c", "cx"});
SetOp(&prog, "reshape2", {"c"}, {"d", "dx"});
SetOp(&prog, "fc", {"d"}, {"e"});
return prog;
}
void MainTest(const ProgramDesc &prog, const std::string &op_name) {
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
int original_nodes_num = graph->Nodes().size();
auto pass =
PassRegistry::Instance().Get("matmul_transpose_reshape_mkldnn_fuse_pass");
graph.reset(pass->Apply(graph.release()));
int current_nodes_num = graph->Nodes().size();
EXPECT_EQ(original_nodes_num - 6, current_nodes_num);
for (auto *node : graph->Nodes()) {
if (node->IsOp()) {
auto *op = node->Op();
if (op->Type() == op_name) {
EXPECT_EQ(op->GetAttrIfExists<std::vector<int>>("fused_reshape_Out"),
std::vector<int>({4, 5, 6}));
EXPECT_EQ(op->GetAttrIfExists<std::vector<int>>("fused_transpose_Out"),
std::vector<int>({0, 2, 1, 3}));
}
}
}
}
TEST(MatmulTransposeReshapeFusePass, matmul_fuse_pass) {
auto prog = BuildProgramDesc("matmul");
MainTest(prog, "matmul");
}
TEST(MatmulTransposeReshapeFusePass, matmul_v2_fuse_pass) {
auto prog = BuildProgramDesc("matmul_v2");
MainTest(prog, "matmul_v2");
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(matmul_transpose_reshape_mkldnn_fuse_pass);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include "paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
namespace paddle {
namespace framework {
namespace ir {
void AddVarToScope(Scope* param_scope,
const std::string& name,
const DDim& dims) {
auto* tensor = param_scope->Var(name)->GetMutable<phi::DenseTensor>();
tensor->Resize(dims);
tensor->mutable_data<float>(phi::CPUPlace());
}
Scope* CreateParamScope() {
auto param_scope = new Scope();
AddVarToScope(param_scope, "w1", {768, 768});
AddVarToScope(param_scope, "bias1", {768});
AddVarToScope(param_scope, "w2", {768, 768});
AddVarToScope(param_scope, "bias2", {768});
return param_scope;
}
void TestMain(const std::string& op_name, bool with_xshapes) {
// inputs operator output
// -----------------------------------------------
// a1,w1,bias1 fc -> b1
// b1 reshape -> c1
// c1 transpose -> d1
// a2,w2,bias2 fc -> b2
// b2 reshape -> c2
// c2 transpose -> d2
// (d1, d2) matmul(_v2) -> (...)
Layers layers;
auto* a1 = layers.data("a1", {-1, 128, 768});
auto* w1 = layers.data("w1", {768, 768}, true);
auto* bias1 = layers.data("bias1", {768}, true);
auto* b1 = layers.fc(a1, w1, bias1, 2);
b1->SetShape({-1, 128, 768});
auto* c1 = layers.reshape2(b1, {0, 0, 12, 64}, with_xshapes);
c1->SetShape({-1, 128, 12, 64});
auto* d1 = layers.transpose2(c1, {0, 2, 1, 3}, with_xshapes);
d1->SetShape({-1, 12, 128, 64});
auto* a2 = layers.data("a2", {-1, 128, 768});
auto* w2 = layers.data("w2", {768, 768}, true);
auto* bias2 = layers.data("bias2", {768}, true);
auto* b2 = layers.fc(a2, w2, bias2, 2);
b2->SetShape({-1, 128, 768});
auto* c2 = layers.reshape2(b2, {0, 0, 12, 64});
c2->SetShape({-1, 128, 12, 64});
auto* d2 = layers.transpose2(c2, {0, 2, 1, 3});
d2->SetShape({-1, 12, 128, 64});
if (op_name == "matmul_v2") {
layers.matmul_v2(d1, d2);
} else {
layers.matmul(d1, d2);
}
std::unique_ptr<ir::Graph> graph(new ir::Graph(layers.main_program()));
graph->Set("__param_scope__", CreateParamScope());
int num_reshape_nodes_before = GetNumOpNodes(graph, "reshape2");
int num_transpose_nodes_before = GetNumOpNodes(graph, "transpose2");
int total_nodes_before = graph->Nodes().size();
VLOG(3) << DebugString(graph);
auto pass =
PassRegistry::Instance().Get("reshape_transpose_matmul_mkldnn_fuse_pass");
graph.reset(pass->Apply(graph.release()));
int num_reshape_nodes_after = GetNumOpNodes(graph, "reshape2");
int num_transpose_nodes_after = GetNumOpNodes(graph, "transpose2");
int total_nodes_after = graph->Nodes().size();
VLOG(3) << DebugString(graph);
EXPECT_EQ(num_reshape_nodes_before, 2);
EXPECT_EQ(num_reshape_nodes_after, 0);
EXPECT_EQ(num_transpose_nodes_before, 2);
EXPECT_EQ(num_transpose_nodes_after, 0);
int removed = 8; // 2* reshape, reshape_out, transpose, transpose_out
if (with_xshapes) removed += 2; // transpose_xshape, reshape_xshape
EXPECT_EQ(total_nodes_before - removed, total_nodes_after);
auto* matmul_op_desc = GetOpNodes(graph, op_name).at(0)->Op();
auto check = [&matmul_op_desc](std::string a) {
std::string shape_str = "fused_reshape_" + a;
auto shape = matmul_op_desc->GetAttrIfExists<std::vector<int>>(shape_str);
EXPECT_EQ(shape, (std::vector<int>{0, 0, 12, 64}));
std::string axis_str = "fused_transpose_" + a;
auto axis = matmul_op_desc->GetAttrIfExists<std::vector<int>>(axis_str);
EXPECT_EQ(axis, (std::vector<int>{0, 2, 1, 3}));
};
check("X");
check("Y");
}
TEST(ReshapeTransposeMatmulMkldnnFusePass,
both_matmul_inputs_reshape_transpose) {
TestMain("matmul", false);
}
TEST(ReshapeTransposeMatmulMkldnnFusePass,
both_matmul_inputs_reshape_transpose_one_with_xshapes) {
TestMain("matmul", true);
}
TEST(ReshapeTransposeMatmulV2MkldnnFusePass,
both_matmulv2_inputs_reshape_transpose) {
TestMain("matmul_v2", false);
}
TEST(ReshapeTransposeMatmulV2MkldnnFusePass,
both_matmulv2_inputs_reshape_transpose_one_with_xshapes) {
TestMain("matmul_v2", true);
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(reshape_transpose_matmul_mkldnn_fuse_pass);
......@@ -19,14 +19,27 @@
#include "paddle/fluid/framework/new_executor/garbage_collector/no_event_garbage_collector.h"
DECLARE_bool(fast_eager_deletion_mode);
DECLARE_bool(new_executor_use_cuda_graph);
namespace paddle {
namespace framework {
bool IsInterpretercoreFastGCEnabled() {
return memory::allocation::AllocatorFacade::Instance()
// When using CUDA graph, fast GC must be used, because the
// `EventQuery` method in the event-based GC cannot be used
// during CUDA graph capture.
PADDLE_ENFORCE_EQ(memory::allocation::AllocatorFacade::Instance()
.IsStreamSafeCUDAAllocatorUsed() == false &&
FLAGS_new_executor_use_cuda_graph,
false,
platform::errors::InvalidArgument(
"When FLAGS_new_executor_use_cuda_graph is true, "
"IsStreamSafeCUDAAllocatorUsed must be true, but "
"got false."));
return (memory::allocation::AllocatorFacade::Instance()
.IsStreamSafeCUDAAllocatorUsed() &&
FLAGS_fast_eager_deletion_mode;
FLAGS_fast_eager_deletion_mode) ||
FLAGS_new_executor_use_cuda_graph;
}
InterpreterCoreGarbageCollector::InterpreterCoreGarbageCollector() {
......
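Restated as a plain standalone predicate (the parameter names below are descriptive stand-ins for the flags used above), the fast-GC decision after this change is:

```cpp
// Descriptive stand-ins for the flags referenced above:
//   stream_safe_alloc  ~ IsStreamSafeCUDAAllocatorUsed()
//   fast_eager_delete  ~ FLAGS_fast_eager_deletion_mode
//   use_cuda_graph     ~ FLAGS_new_executor_use_cuda_graph
#include <iostream>

bool IsFastGCEnabled(bool stream_safe_alloc, bool fast_eager_delete,
                     bool use_cuda_graph) {
  // The enforce above already guarantees that use_cuda_graph implies
  // stream_safe_alloc.
  return (stream_safe_alloc && fast_eager_delete) || use_cuda_graph;
}

int main() {
  std::cout << std::boolalpha
            << IsFastGCEnabled(true, true, false) << "\n"    // true
            << IsFastGCEnabled(true, false, true) << "\n"    // true (cuda graph)
            << IsFastGCEnabled(true, false, false) << "\n";  // false
  return 0;
}
```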
......@@ -31,6 +31,7 @@
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/phi/backends/device_manager.h"
PADDLE_DEFINE_EXPORTED_bool(
......@@ -50,6 +51,10 @@ PADDLE_DEFINE_EXPORTED_bool(control_flow_use_new_executor,
DECLARE_bool(check_nan_inf);
DECLARE_bool(benchmark);
DECLARE_bool(new_executor_use_cuda_graph);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
DECLARE_bool(sync_nccl_allreduce);
#endif
constexpr const char* kExceptionCaught = "ExceptionCaught";
constexpr const char* kTaskCompletion = "TaskCompletion";
......@@ -142,6 +147,8 @@ InterpreterCore::InterpreterCore(const platform::Place& place,
}
return lhs_prority > rhs_prority;
};
PrepareForCUDAGraphCapture();
}
InterpreterCore::~InterpreterCore() {
......@@ -161,6 +168,7 @@ interpreter::CostInfo InterpreterCore::DryRun(
const std::vector<std::string>& feed_names,
const std::vector<phi::DenseTensor>& feed_tensors) {
SetDeviceId(place_);
CheckCUDAGraphBeforeRun(feed_names);
Prepare(feed_names, feed_tensors, true);
interpreter::CostInfo cost_info;
......@@ -221,6 +229,7 @@ paddle::framework::FetchList InterpreterCore::Run(
const std::vector<std::string>& feed_names,
const std::vector<phi::DenseTensor>& feed_tensors) {
SetDeviceId(place_);
CheckCUDAGraphBeforeRun(feed_names);
#ifdef PADDLE_WITH_MKLDNN
platform::AttachPointerHashToMKLDNNKey(this, place_);
......@@ -240,7 +249,16 @@ paddle::framework::FetchList InterpreterCore::Run(
// return Fetch Tensors
auto* fetch_var = local_scope_->FindVar(interpreter::kFetchVarName);
if (fetch_var) {
return std::move(*fetch_var->GetMutable<framework::FetchList>());
auto fetch_list = std::move(*fetch_var->GetMutable<framework::FetchList>());
#ifdef PADDLE_WITH_CUDA
if (platform::IsCUDAGraphCapturing()) {
PADDLE_ENFORCE_EQ(fetch_list.empty(),
true,
platform::errors::InvalidArgument(
"Cannot fetch data when using CUDA Graph."));
}
#endif
return fetch_list;
} else {
return {};
}
......@@ -249,6 +267,7 @@ paddle::framework::FetchList InterpreterCore::Run(
paddle::framework::FetchList InterpreterCore::Run(
const std::vector<std::string>& feed_names, bool need_fetch) {
SetDeviceId(place_);
CheckCUDAGraphBeforeRun(feed_names);
#ifdef PADDLE_WITH_MKLDNN
platform::AttachPointerHashToMKLDNNKey(this, place_);
......@@ -290,7 +309,16 @@ paddle::framework::FetchList InterpreterCore::Run(
HasLocalScope() ? local_scope_ : var_scope_.GetMutableScope();
auto* fetch_var = inner_scope->FindVar(interpreter::kFetchVarName);
if (fetch_var && need_fetch) {
return std::move(*fetch_var->GetMutable<framework::FetchList>());
auto fetch_list = std::move(*fetch_var->GetMutable<framework::FetchList>());
#ifdef PADDLE_WITH_CUDA
if (platform::IsCUDAGraphCapturing()) {
PADDLE_ENFORCE_EQ(fetch_list.empty(),
true,
platform::errors::InvalidArgument(
"Cannot fetch data when using CUDA Graph."));
}
#endif
return fetch_list;
} else {
return {};
}
......@@ -504,6 +532,67 @@ void InterpreterCore::BuildInplace() {
}
}
void InterpreterCore::PrepareForCUDAGraphCapture() {
if (!FLAGS_new_executor_use_cuda_graph) return;
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_EQ(
platform::IsCUDAGraphCapturing(),
false,
platform::errors::PermissionDenied("CUDA Graph is not allowed to capture "
"when running the first batch."));
PADDLE_ENFORCE_EQ(platform::is_gpu_place(place_),
true,
platform::errors::InvalidArgument(
"CUDA Graph is only supported on NVIDIA GPU device."));
// If set to true, `cudaStreamSynchronize(nccl_stream)` will be called after allreduce,
// which may cause errors during CUDA graph capture. This behavior is consistent with PE.
PADDLE_ENFORCE_EQ(FLAGS_sync_nccl_allreduce,
false,
platform::errors::InvalidArgument(
"FLAGS_sync_nccl_allreduce must be False to support "
"CUDA Graph capturing."));
// No output var of the coalesce_tensor op should be garbage collected.
// If the fused output var of coalesce_tensor is garbage collected, it
// causes accuracy problems; the exact reason still needs to be analyzed.
for (auto& op_desc : block_.AllOps()) {
if (op_desc->Type() == kCoalesceTensor) {
for (auto& out_var_name : op_desc->OutputArgumentNames()) {
execution_config_.skip_gc_vars.insert(out_var_name);
VLOG(4) << "Insert Var(" << out_var_name << ") into skip_gc_vars.";
}
}
}
#else
PADDLE_THROW(platform::errors::Unimplemented(
"CUDA Graph is only supported on NVIDIA GPU device."));
#endif
}
void InterpreterCore::CheckCUDAGraphBeforeRun(
const std::vector<std::string>& feed_names) {
#ifdef PADDLE_WITH_CUDA
if (platform::IsCUDAGraphCapturing()) {
PADDLE_ENFORCE_EQ(
feed_names.empty(),
true,
platform::errors::InvalidArgument(
"Feeding data is not permitted when capturing CUDA Graph."));
PADDLE_ENFORCE_EQ(
FLAGS_new_executor_use_cuda_graph,
true,
platform::errors::InvalidArgument(
"You must turn on FLAGS_new_executor_use_cuda_graph to True "
"to enable CUDA Graph capturing."));
PADDLE_ENFORCE_EQ(
place_,
platform::CUDAGraphCapturingPlace(),
platform::errors::InvalidArgument("The place to capture CUDAGraph is "
"not the same as the place to run."));
}
#endif
}
void InterpreterCore::BuildOperatorDependences() {
// analyze the dependencies between ops, add next_instr_list to each instr,
// and set the dependecy_count_
......
......@@ -97,6 +97,10 @@ class InterpreterCore {
const std::vector<std::vector<size_t>>& input_var2op, size_t var_index);
void SetFeedVarsInplaceSkip(const std::vector<std::string>& feed_names);
// cuda graph
void CheckCUDAGraphBeforeRun(const std::vector<std::string>& feed_names);
void PrepareForCUDAGraphCapture();
// execution
void RunImpl();
void ExecuteInstructionList(const std::vector<Instruction>& vec_instr);
......
......@@ -1561,6 +1561,63 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
this->Info().infer_shape_(&infer_shape_ctx);
}
template <typename T>
bool HasSameTensorType(phi::TensorBase* phi_tensor, Variable* var) {
if (phi_tensor == nullptr && var == nullptr) {
return true;
} else if (phi_tensor != nullptr && var != nullptr) {
if (T::classof(phi_tensor) && var->IsType<T>()) {
return true;
}
}
return false;
}
// TODO(YuanRisheng): We need to collect all `need_prepare_phi_data_` handling
// into this function.
void OperatorWithKernel::CheckWhetherPreparePhiData(
const VariableNameMap& innames,
const VariableNameMap& outnames,
const Scope& scope) const {
if (run_phi_kernel_ && impl_ != nullptr) {
const auto& phi_kernel_context = impl_->getKernelContext();
size_t phi_tensor_index = 0;
// Check each tensor in the KernelContext; if any tensor has a type that
// differs from its variable, the PhiKernelContext needs to be reconstructed.
// We use kernel_signature_'s outputs to retrieve the tensors, because the
// tensors in phi_kernel_context are stored in the order of those outputs.
if (phi_kernel_context->OutputsSize() >= phi_tensor_index ||
kernel_signature_ == nullptr) {
need_prepare_phi_data_ = true;
return;
}
const auto& phi_output_names = kernel_signature_->output_names;
for (auto& phi_output_name : phi_output_names) {
const auto& iter = outnames.find(phi_output_name);
if (iter != outnames.end()) {
for (auto& var_name : iter->second) {
auto var_output = scope.FindVar(var_name);
auto phi_output =
phi_kernel_context->MutableOutputAt<phi::TensorBase>(
phi_tensor_index);
if (phi_output == nullptr) {
continue;
}
if (!(HasSameTensorType<phi::DenseTensor>(phi_output, var_output) ||
HasSameTensorType<phi::SparseCooTensor>(phi_output,
var_output) ||
HasSameTensorType<framework::Strings>(phi_output,
var_output))) {
need_prepare_phi_data_ = true;
}
phi_tensor_index++;
}
}
}
}
}
void OperatorWithKernel::RunImpl(const Scope& scope,
const platform::Place& place) const {
// To reduce the elapsed time of HasAttr, we use bool variable to record the
......@@ -1571,6 +1628,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
HasAttr(kAllKernelsMustComputeRuntimeShape))
all_kernels_must_compute_runtime_shape_ = true;
const Scope* cur_scope = &scope;
CheckWhetherPreparePhiData(Inputs(), Outputs(), scope);
if (!enable_cache_runtime_context_) {
RuntimeContext ctx(Inputs(), Outputs(), scope);
RunImpl(scope, place, &ctx);
......@@ -2993,7 +3051,6 @@ void OperatorWithKernel::BuildPhiKernelContext(
"to the size of kernel attribute_defs (%d).",
attr_names.size(),
attr_defs.size()));
for (size_t i = 0; i < input_names.size(); ++i) {
auto it = ctx.inputs.find(input_names[i]);
......@@ -3037,6 +3094,9 @@ void OperatorWithKernel::BuildPhiKernelContext(
} else if (var->IsType<framework::Vocab>()) {
tensor_in = &(var->Get<framework::Vocab>());
phi_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in);
} else if (var->IsType<framework::FeedList>()) {
tensor_in = &(var->Get<framework::FeedList>());
phi_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported input `%s` type when call pt kernel.",
......@@ -3047,7 +3107,6 @@ void OperatorWithKernel::BuildPhiKernelContext(
phi_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), i);
}
VLOG(4) << "Done inputs";
for (size_t i = 0; i < output_names.size(); ++i) {
auto it = ctx.outputs.find(output_names[i]);
size_t start_idx =
......@@ -3087,6 +3146,9 @@ void OperatorWithKernel::BuildPhiKernelContext(
// Note: If the input LoDTensorArray size is 0, the output
// LoDTensorArray is also 0
phi_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
} else if (var->template IsType<framework::Strings>()) {
tensor_out = var->template GetMutable<framework::Strings>();
phi_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
} else if (var->template IsType<paddle::framework::RawTensor>()) {
tensor_out = var->template GetMutable<paddle::framework::RawTensor>();
phi_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
......@@ -3108,7 +3170,6 @@ void OperatorWithKernel::BuildPhiKernelContext(
i);
}
VLOG(4) << "Done outputs";
for (size_t i = 0; i < attr_names.size(); ++i) {
VLOG(6) << "BuildPhiKernelContext: " << attr_names[i] << ": "
<< attr_defs[i].type_index;
......
......@@ -550,6 +550,13 @@ class ExecutionArgumentMappingContext : public phi::ArgumentMappingContext {
return var->IsType<phi::SparseCooTensor>();
}
bool IsSparseCooTensorOutput(const std::string& name) const override {
auto vars = ctx_.MultiOutputVar(name);
return std::all_of(vars.begin(), vars.end(), [](const Variable* var) {
return var->IsType<phi::SparseCooTensor>();
});
}
bool IsSparseCsrTensorInput(const std::string& name) const override {
const auto* var = ctx_.InputVar(name);
return var->IsType<phi::SparseCsrTensor>();
......@@ -746,6 +753,10 @@ class OperatorWithKernel : public OperatorBase {
RuntimeContext* ctx,
const phi::Place& place) const;
void CheckWhetherPreparePhiData(const VariableNameMap& innames,
const VariableNameMap& outnames,
const Scope& scope) const;
void TransferInplaceVarsBack(const Scope& scope,
const std::vector<std::string>& inplace_vars,
const Scope& exec_scope) const;
......
......@@ -60,6 +60,7 @@ if(WITH_TESTING)
elementwise_add_op
generated_op)
set_tests_properties(build_cinn_pass_test PROPERTIES LABELS "RUN_TYPE=CINN")
target_link_libraries(build_cinn_pass_test ${PYTHON_LIBRARIES})
cc_test_old(transform_desc_test SRCS transform_desc_test.cc DEPS
transform_desc)
......
......@@ -23,8 +23,8 @@ namespace paddle {
namespace framework {
/// \brief Fluid Kernel and PHI Kernel will be unified in the future.
/// So, we need a class in PHI that can represent the RAW type in Fluid.
/// The RawTensor is for PHI Kernel that has RAW type arguments.
/// So, we need a class in PHI that can represent the RawTensor type in Fluid.
/// The RawTensor is for PHI Kernel that has RawTensor type arguments.
class RawTensor : public phi::ExtendedTensor,
public phi::TypeInfoTraits<phi::TensorBase, RawTensor> {
public:
......@@ -37,13 +37,35 @@ class RawTensor : public phi::ExtendedTensor,
RawTensor& operator=(RawTensor&& other) = default;
/// \brief Destroy the RawTensor and release exclusive resources.
virtual ~RawTensor() = default;
virtual ~RawTensor() {
if (!data_.empty()) {
data_deleter_();
}
}
public:
/// \brief Returns the name of the class for type traits.
/// \return The name of the class.
static const char* name() { return "RawTensor"; }
template <typename T>
T& Get() const {
PADDLE_ENFORCE_EQ(data_.empty(),
false,
platform::errors::PreconditionNotMet(
"The data in RawTensor is empty. Please set data "
"before using it."));
try {
return *(paddle::any_cast<T*>(data_));
} catch (paddle::bad_any_cast&) {
PADDLE_THROW(phi::errors::InvalidArgument(
"Invalid data type error, expected %s, actual %s.",
typeid(T).name(),
data_type_.name()));
}
}
template <typename T>
T* GetMutable() {
if (!data_.empty()) {
......@@ -70,7 +92,7 @@ class RawTensor : public phi::ExtendedTensor,
private:
paddle::any data_;
std::function<void(void)> data_deleter_;
std::function<void(void)> data_deleter_ = []() {};
std::type_index data_type_ = std::type_index(typeid(void));
};
......
......@@ -25,6 +25,10 @@ limitations under the License. */
namespace paddle {
namespace framework {
// Note(YuanRisheng): Vocab is mainly used for faster_tokenizer_op, and we don't
// recommend using it widely, because faster_tokenizer_op may be deleted in the
// future and this class will be deleted along with it.
class Vocab : public phi::ExtendedTensor,
public phi::TypeInfoTraits<phi::TensorBase, Vocab> {
public:
......@@ -94,8 +98,73 @@ class Vocab : public phi::ExtendedTensor,
std::unordered_map<std::wstring, std::int32_t> data_;
};
// Note(YuanRisheng): PhiVector is essentially a vector that is only used for
// PHI kernels. It can be used when you define a non-tensor type that needs to
// be stored in a vector as a PHI kernel argument.
template <typename T>
class PhiVector : public phi::ExtendedTensor,
public phi::TypeInfoTraits<phi::TensorBase, PhiVector<T>> {
public:
PhiVector() = default;
explicit PhiVector(const std::vector<T>& init_data) : data_(init_data) {}
PhiVector(PhiVector&& other) = default;
PhiVector(const PhiVector& other) = default;
PhiVector& operator=(const PhiVector& other) = default;
PhiVector& operator=(const std::vector<T>& other) {
data_ = other;
return *this;
}
PhiVector& operator=(PhiVector&& other) = default;
/// \brief Destroy the PhiVector and release exclusive resources.
virtual ~PhiVector() = default;
public:
/// \brief Returns the name of the class for type traits.
/// \return The name of the class.
static const char* name() {
// Use a static string so the returned pointer does not dangle
// (c_str() of a temporary would be invalid after the statement).
static const std::string name_str =
std::string("PhiVector_") + typeid(T).name();
return name_str.c_str();
}
size_t size() const { return data_.size(); }
void resize(size_t size) { data_.resize(size); }
void clear() { data_.clear(); }
void emplace_back(const T& feed_data) { data_.emplace_back(feed_data); }
const T& operator[](size_t index) const { return data_[index]; }
T& operator[](size_t index) { return data_[index]; }
T& at(size_t index) { return data_.at(index); }
const T& at(size_t index) const { return data_.at(index); }
typename std::vector<T>::iterator begin() { return data_.begin(); }
typename std::vector<T>::const_iterator begin() const {
return data_.begin();
}
typename std::vector<T>::iterator end() { return data_.end(); }
typename std::vector<T>::const_iterator end() const { return data_.end(); }
private:
std::vector<T> data_;
};
using String = std::string;
using Strings = std::vector<std::string>;
using Strings = PhiVector<std::string>;
// Convert the std::string type to the std::wstring type.
bool ConvertStrToWstr(const std::string& src, std::wstring* res);
......
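A minimal standalone stand-in for `PhiVector` (dropping the `phi::ExtendedTensor` base and the `TypeInfoTraits` machinery, so it is not the real class) just to show the vector-like surface that `Strings = PhiVector<std::string>` exposes after this change:

```cpp
// Toy PhiVector: same vector-like interface, no PHI tensor base classes.
#include <iostream>
#include <string>
#include <vector>

template <typename T>
class ToyPhiVector {
 public:
  ToyPhiVector() = default;
  explicit ToyPhiVector(const std::vector<T>& init_data) : data_(init_data) {}
  size_t size() const { return data_.size(); }
  void emplace_back(const T& v) { data_.emplace_back(v); }
  T& operator[](size_t i) { return data_[i]; }
  const T& operator[](size_t i) const { return data_[i]; }
  auto begin() { return data_.begin(); }
  auto end() { return data_.end(); }

 private:
  std::vector<T> data_;
};

using Strings = ToyPhiVector<std::string>;

int main() {
  Strings strs;
  strs.emplace_back("hello");
  strs.emplace_back("paddle");
  for (const auto& s : strs) std::cout << s << "\n";
  return 0;
}
```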
......@@ -221,6 +221,7 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl<
Vocab,
std::vector<int>,
std::vector<float>,
std::vector<std::string>,
RawTensor>;
template <typename T>
struct VarTypeTrait {
......
......@@ -1655,7 +1655,8 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
auto custom_place = place_;
auto paddleplace = static_cast<PaddlePlace>(
static_cast<size_t>(PaddlePlace::kCUSTOM) +
phi::GetOrRegisterGlobalDeviceTypeId(place_.GetDeviceType()));
phi::CustomRegisteredDeviceMap::Instance()
.GetOrRegisterGlobalDeviceTypeId(place_.GetDeviceType()));
res->SetPlace(paddleplace, custom_place.GetDeviceId());
} else {
auto gpu_place = place_;
......@@ -1710,7 +1711,8 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
auto custom_place = place_;
auto paddleplace = static_cast<PaddlePlace>(
static_cast<size_t>(PaddlePlace::kCUSTOM) +
phi::GetOrRegisterGlobalDeviceTypeId(place_.GetDeviceType()));
phi::CustomRegisteredDeviceMap::Instance()
.GetOrRegisterGlobalDeviceTypeId(place_.GetDeviceType()));
res->SetPlace(paddleplace, custom_place.GetDeviceId());
} else {
auto gpu_place = place_;
......
......@@ -25,13 +25,16 @@ if(WITH_ONNXRUNTIME)
cc_library(
zero_copy_tensor_dummy
SRCS zero_copy_tensor_dummy.cc
DEPS onnxruntime)
DEPS onnxruntime phi_enforce)
else()
cc_library(
zero_copy_tensor
SRCS zero_copy_tensor.cc
DEPS scope lod_tensor enforce)
cc_library(zero_copy_tensor_dummy SRCS zero_copy_tensor_dummy.cc)
cc_library(
zero_copy_tensor_dummy
SRCS zero_copy_tensor_dummy.cc
DEPS phi_enforce)
endif()
cc_test(
......
......@@ -16,6 +16,7 @@
#include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/string_array.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_tensor.h"
#include "paddle/fluid/memory/memcpy.h"
......@@ -76,7 +77,8 @@ void Tensor::ReshapeStrings(const size_t &shape) {
var,
paddle::platform::errors::PreconditionNotMet(
"No tensor called [%s] in the runtime scope", name_));
paddle_infer::Strings *tensor = var->GetMutable<paddle_infer::Strings>();
paddle::framework::Strings *tensor =
var->GetMutable<paddle::framework::Strings>();
tensor->resize(shape);
}
......@@ -261,7 +263,9 @@ void Tensor::CopyFromCpu(const T *data) {
paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance();
paddle::platform::CustomPlace custom_place(
phi::GetGlobalDeviceType(device_type_id), device_);
phi::CustomRegisteredDeviceMap::Instance().GetGlobalDeviceType(
device_type_id),
device_);
auto *t_data = tensor->mutable_data<T>(custom_place);
auto *dev_ctx = static_cast<const paddle::platform::CustomDeviceContext *>(
pool.Get(custom_place));
......@@ -354,7 +358,7 @@ void Tensor::ShareExternalData(const T *data,
}
void Tensor::CopyStringsFromCpu(const paddle_infer::Strings *data) {
EAGER_GET_TENSOR(paddle_infer::Strings);
EAGER_GET_TENSOR(paddle::framework::Strings);
PADDLE_ENFORCE_GE(tensor->size(),
0,
paddle::platform::errors::PreconditionNotMet(
......
......@@ -112,6 +112,12 @@ bool PluginArgumentMappingContext::IsSparseCooTensorInput(
const std::string& name) const {
return false;
}
bool PluginArgumentMappingContext::IsSparseCooTensorOutput(
const std::string& name) const {
return false;
}
bool PluginArgumentMappingContext::IsSparseCsrTensorInput(
const std::string& name) const {
return false;
......
......@@ -56,6 +56,8 @@ class PluginArgumentMappingContext : public ::phi::ArgumentMappingContext {
bool IsDenseTensorOutput(const std::string& name) const override;
bool IsSparseCooTensorOutput(const std::string& name) const override;
bool IsSelectedRowsOutput(const std::string& name) const override;
bool IsForInferShape() const override { return false; }
......
......@@ -124,6 +124,7 @@ TEST(ArgMappingContexTest, BasicFunction) {
EXPECT_EQ(context.IsDenseTensorOutput("Out"), false);
EXPECT_EQ(context.IsSelectedRowsOutput("Out"), false);
EXPECT_EQ(context.IsSparseCooTensorOutput("Out"), false);
EXPECT_EQ(context.IsForInferShape(), false);
}
......
......@@ -26,6 +26,8 @@
#include "glog/logging.h"
#include "paddle/fluid/platform/enforce.h"
DECLARE_bool(use_shm_cache);
namespace paddle {
namespace memory {
namespace allocation {
......@@ -111,20 +113,33 @@ void AllocateMemoryMap(
std::shared_ptr<RefcountedMemoryMapAllocation>
AllocateRefcountedMemoryMapAllocation(std::string filename,
int flags,
size_t size) {
size_t size,
int buffer_id) {
int fd = -1;
void *base_ptr = nullptr;
if (buffer_id == -1) {
AllocateMemoryMap(filename, flags, size + mmap_alignment, &base_ptr, &fd);
VLOG(4) << "Create and mmap a new shm: " << filename;
} else {
base_ptr = MemoryMapAllocationPool::Instance().GetById(buffer_id).mmap_ptr_;
VLOG(4) << "Get a cached shm " << filename;
}
void *aliged_base_ptr =
static_cast<void *>(static_cast<char *>(base_ptr) + mmap_alignment);
return std::make_shared<RefcountedMemoryMapAllocation>(
aliged_base_ptr, size, filename, flags, fd);
aliged_base_ptr, size, filename, flags, fd, buffer_id);
}
RefcountedMemoryMapAllocation::RefcountedMemoryMapAllocation(
void *ptr, size_t size, std::string ipc_name, int fd, int flags)
void *ptr,
size_t size,
std::string ipc_name,
int fd,
int flags,
int buffer_id)
: MemoryMapAllocation(ptr, size, ipc_name, fd, flags) {
// must reset base ptr first.
buffer_id_ = buffer_id;
resetBaseptr();
initializeRefercount();
}
......@@ -165,25 +180,40 @@ void RefcountedMemoryMapAllocation::initializeRefercount() {
}
void RefcountedMemoryMapAllocation::close() {
VLOG(4) << "Close a RefcountedMemoryMapAllocation: " << ipc_name_;
if (closed_) {
return;
}
closed_ = true;
void *data = map_ptr_;
CountInfo *info = reinterpret_cast<CountInfo *>(data);
if (--info->refcount == 0) {
--info->refcount;
if (FLAGS_use_shm_cache && buffer_id_ != -1) {
return;
} else {
if (FLAGS_use_shm_cache &&
MemoryMapAllocationPool::Instance().BufferSize() <
static_cast<size_t>(
MemoryMapAllocationPool::Instance().MaxPoolSize())) {
MemoryMapAllocationPool::Instance().Insert(MemoryMapInfo(
flags_, map_size_ - mmap_alignment, ipc_name_, map_ptr_));
} else {
if (info->refcount == 0 &&
shm_open(ipc_name_.c_str(), O_RDWR, (mode_t)0600) != -1) {
shm_unlink(ipc_name_.c_str());
VLOG(6) << "shm_unlink file: " << ipc_name_;
}
PADDLE_ENFORCE_NE(
munmap(map_ptr_, map_size_),
PADDLE_ENFORCE_NE(munmap(map_ptr_, map_size_),
-1,
platform::errors::Unavailable("could not unmap the shared memory file: ",
platform::errors::Unavailable(
"could not unmap the shared memory file: ",
strerror(errno),
" (",
errno,
")"));
}
}
}
MemoryMapWriterAllocation::~MemoryMapWriterAllocation() {
......@@ -299,6 +329,67 @@ void MemoryMapFdSet::Clear() {
MemoryMapFdSet::~MemoryMapFdSet() { Clear(); }
MemoryMapAllocationPool *MemoryMapAllocationPool::pool_ = nullptr;
void MemoryMapAllocationPool::Insert(const MemoryMapInfo &memory_map) {
std::lock_guard<std::mutex> guard(mtx_);
memory_map_allocations_.push_back(memory_map);
VLOG(4) << this << "Intsert a new shm: " << memory_map.file_name_;
}
int MemoryMapAllocationPool::FindFromCache(const int &flag,
const size_t &data_size,
const std::string &file_name,
bool check_refcount) {
std::lock_guard<std::mutex> guard(mtx_);
for (size_t idx = 0; idx < memory_map_allocations_.size(); idx++) {
if (memory_map_allocations_.at(idx).flags_ == flag &&
memory_map_allocations_.at(idx).data_size_ == data_size) {
if (file_name == "" ||
memory_map_allocations_.at(idx).file_name_ == file_name) {
if (!check_refcount || reinterpret_cast<CountInfo *>(
memory_map_allocations_.at(idx).mmap_ptr_)
->refcount == 0) {
VLOG(4) << "Match at: " << idx;
return idx;
}
}
}
}
return -1;
}
const MemoryMapInfo &MemoryMapAllocationPool::GetById(int id) {
std::lock_guard<std::mutex> guard(mtx_);
return memory_map_allocations_.at(id);
}
void MemoryMapAllocationPool::SetMaxPoolSize(const int &size) {
max_pool_size_ = size;
VLOG(4) << this << "Set max pool size is: " << max_pool_size_;
}
void MemoryMapAllocationPool::Clear() {
std::lock_guard<std::mutex> guard(mtx_);
for (auto mmap : memory_map_allocations_) {
int rlt = shm_unlink(mmap.file_name_.c_str());
if (rlt == 0) {
VLOG(4) << "MemoryMapAllocationPool: clear " << mmap.file_name_;
}
PADDLE_ENFORCE_NE(munmap(mmap.mmap_ptr_, mmap.data_size_ + mmap_alignment),
-1,
platform::errors::Unavailable(
"could not unmap the shared memory file: ",
strerror(errno),
" (",
errno,
")"));
}
memory_map_allocations_.clear();
}
MemoryMapAllocationPool::~MemoryMapAllocationPool() { Clear(); }
} // namespace allocation
} // namespace memory
} // namespace paddle
......
......@@ -75,8 +75,12 @@ class MemoryMapAllocation : public Allocation {
class RefcountedMemoryMapAllocation : public MemoryMapAllocation {
public:
RefcountedMemoryMapAllocation(
void *ptr, size_t size, std::string ipc_name, int flags, int fd);
RefcountedMemoryMapAllocation(void *ptr,
size_t size,
std::string ipc_name,
int flags,
int fd,
int buffer_id = -1);
void incref();
int decref();
......@@ -84,6 +88,7 @@ class RefcountedMemoryMapAllocation : public MemoryMapAllocation {
virtual ~RefcountedMemoryMapAllocation() { close(); }
protected:
int buffer_id_ = -1;
void initializeRefercount();
void resetBaseptr();
};
......@@ -94,7 +99,8 @@ void AllocateMemoryMap(
std::shared_ptr<RefcountedMemoryMapAllocation>
AllocateRefcountedMemoryMapAllocation(std::string filename,
int flags,
size_t size);
size_t size,
int buffer_id = -1);
class MemoryMapWriterAllocation : public Allocation {
public:
......@@ -153,6 +159,68 @@ class MemoryMapFdSet {
std::mutex mtx_;
};
class MemoryMapInfo {
public:
explicit MemoryMapInfo(int flags,
size_t data_size,
std::string file_name,
void *mmap_ptr)
: flags_(flags),
data_size_(data_size),
file_name_(file_name),
mmap_ptr_(mmap_ptr) {}
int flags_ = 0;
size_t data_size_ = 0;
std::string file_name_;
void *mmap_ptr_ = nullptr;
};
/* Note(zhangbo):
MemoryMapAllocationPool is used to cache and reuse shm, thus reducing the
number of munmap calls in the dataloader. The munmap(shm_mmap_ptr) call in
RefcountedMemoryMapAllocation::close() may block other threads of the
process. Therefore, a shm cache-and-reuse scheme is designed: the shm
created by the _share_filename process is cached and reused according to
its data_size, which eliminates the problem of munmap blocking other
threads.
*/
class MemoryMapAllocationPool {
public:
static MemoryMapAllocationPool &Instance() {
if (pool_ == nullptr) {
pool_ = new MemoryMapAllocationPool();
}
return *pool_;
}
void Insert(const MemoryMapInfo &memory_map);
int FindFromCache(const int &flag,
const size_t &data_size,
const std::string &file_name = "",
bool check_refcount = true);
const MemoryMapInfo &GetById(int id);
size_t BufferSize() { return memory_map_allocations_.size(); }
void Clear();
void SetMaxPoolSize(const int &size);
int MaxPoolSize() { return max_pool_size_; }
~MemoryMapAllocationPool();
private:
MemoryMapAllocationPool() = default;
static MemoryMapAllocationPool *pool_;
std::vector<MemoryMapInfo> memory_map_allocations_;
int max_pool_size_ = 0;
std::mutex mtx_;
};
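// Editor's illustrative sketch (not part of this change): how a caller might
// combine the pool with AllocateRefcountedMemoryMapAllocation above. `flags`
// and `data_size` are assumed to come from the caller, and `new_shm_name` is
// a hypothetical fresh shm file name; a cached mapping is reused only when
// FindFromCache reports a match, otherwise buffer_id stays -1 and a new
// mapping is created.
//
//   int buffer_id = MemoryMapAllocationPool::Instance().FindFromCache(
//       flags, data_size);  // returns -1 when no reusable shm is cached
//   const std::string filename =
//       buffer_id == -1
//           ? new_shm_name
//           : MemoryMapAllocationPool::Instance().GetById(buffer_id).file_name_;
//   auto allocation = AllocateRefcountedMemoryMapAllocation(
//       filename, flags, data_size, buffer_id);  // mmaps only when id == -1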
} // namespace allocation
} // namespace memory
} // namespace paddle
......
......@@ -44,6 +44,7 @@ if(WITH_TESTING)
cinn_launch_context
cinn_instruction_run_op
cinn)
target_link_libraries(cinn_launch_context_test ${PYTHON_LIBRARIES})
set_tests_properties(cinn_launch_context_test PROPERTIES LABELS
"RUN_TYPE=CINN")
......@@ -73,6 +74,7 @@ if(WITH_TESTING)
cinn_launch_op
cinn_instruction_run_op
elementwise_add_op)
target_link_libraries(cinn_instruction_run_op_test ${PYTHON_LIBRARIES})
set_tests_properties(
cinn_instruction_run_op_test PROPERTIES LABELS "RUN_TYPE=CINN" ENVIRONMENT
"${CINN_RUN_ENVIRONMENT}")
......
......@@ -11,6 +11,8 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/raw_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
namespace paddle {
......@@ -28,91 +30,8 @@ class OpBase;
namespace paddle {
namespace operators {
// FeedVariableVisitor is to feed the variable data
// according to data type (phi::DenseTensor or Strings).
class FeedVariableVisitor {
public:
explicit FeedVariableVisitor(framework::Variable *out_var,
const platform::Place &place)
: out_var_(out_var), place_(place) {}
void operator()(const phi::DenseTensor &in_tensor) const {
phi::DenseTensor *out_tensor = out_var_->GetMutable<phi::DenseTensor>();
if (platform::is_same_place(in_tensor.place(), place_)) {
out_tensor->ShareDataWith(in_tensor);
#ifdef PADDLE_WITH_IPU
} else if (platform::is_ipu_place(place_)) {
// For ipu, both in_tensor and out_tensor are allocated on cpu,
// PopART will copy tensor from host automatically,
// no TensorCopy() is required here.
out_tensor->ShareDataWith(in_tensor);
#endif
} else {
platform::DeviceContext *context =
platform::DeviceContextPool::Instance().Get(place_);
framework::TensorCopy(in_tensor, place_, *context, out_tensor);
}
out_tensor->set_lod(in_tensor.lod());
}
void operator()(const framework::Strings &in_str) const {
framework::Strings *out_str = out_var_->GetMutable<framework::Strings>();
out_str->resize(in_str.size());
*out_str = in_str;
}
void operator()(const phi::SparseCooTensor &in_tensor) const {
phi::SparseCooTensor *out_tensor =
out_var_->GetMutable<phi::SparseCooTensor>();
if (platform::is_same_place(in_tensor.place(), place_)) {
*out_tensor = in_tensor;
} else {
platform::DeviceContext *context =
platform::DeviceContextPool::Instance().Get(place_);
phi::DenseTensor indices, values;
framework::TensorCopy(in_tensor.indices(), place_, *context, &indices);
framework::TensorCopy(in_tensor.values(), place_, *context, &values);
out_tensor->SetMember(indices, values, in_tensor.meta());
}
}
private:
framework::Variable *out_var_;
const platform::Place &place_;
};
class FeedOp : public framework::OperatorBase {
public:
FeedOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
OP_INOUT_CHECK(HasInputs("X"), "Input", "X", "Feed");
OP_INOUT_CHECK(HasOutputs("Out"), "Output", "Out", "Feed");
auto feed_var_name = Input("X");
auto *feed_var = scope.FindVar(feed_var_name);
PADDLE_ENFORCE_NOT_NULL(
feed_var,
platform::errors::NotFound(
"Input varibale(%s) cannot be found in scope for operator 'Feed'.",
feed_var_name));
auto out_name = this->Output("Out");
auto *out_var = scope.FindVar(out_name);
PADDLE_ENFORCE_NOT_NULL(
out_var,
platform::errors::NotFound(
"Output variable(%s) cannot be found in scope for operator 'Feed'",
out_name));
auto col = Attr<int>("col");
const framework::FeedType& CheckAndGetFeedItem(const phi::ExtendedTensor& x,
int col) {
PADDLE_ENFORCE_GE(col,
0,
platform::errors::InvalidArgument(
......@@ -120,25 +39,119 @@ class FeedOp : public framework::OperatorBase {
"operator 'Feed') of current feeding variable to be "
"no less than 0. But received column index = %d.",
col));
VLOG(3) << "Feed variable " << feed_var_name << "'s " << col
<< " column to variable " << out_name;
auto &feed_list = feed_var->Get<framework::FeedList>();
auto feed_list = static_cast<const paddle::framework::FeedList*>(&x);
PADDLE_ENFORCE_LT(
static_cast<size_t>(col),
feed_list.size(),
feed_list->size(),
platform::errors::InvalidArgument(
"The column index of current feeding variable is expected to be "
"less than the length of feeding list. But received column index = "
"%d, the length of feeding list = %d",
col,
feed_list.size()));
feed_list->size()));
auto &feed_item = feed_list.at(static_cast<size_t>(col));
return feed_list->at(static_cast<size_t>(col));
}
FeedVariableVisitor visitor(out_var, place);
paddle::visit(visitor, feed_item);
template <typename Context>
void FeedDenseTensorKernel(const Context& dev_ctx,
const phi::ExtendedTensor& x,
int col,
phi::DenseTensor* out) {
PADDLE_ENFORCE_NOT_NULL(
out,
platform::errors::NotFound(
"Output cannot be found in scope for operator 'Feed'"));
const auto& feed_item = CheckAndGetFeedItem(x, col);
const auto& in_tensor = paddle::get<phi::DenseTensor>(feed_item);
const auto& place = dev_ctx.GetPlace();
if (platform::is_same_place(in_tensor.place(), place)) {
out->ShareDataWith(in_tensor);
} else {
framework::TensorCopy(in_tensor, place, dev_ctx, out);
}
out->set_lod(in_tensor.lod());
}
template <typename Context>
void FeedSparseCooTensorKernel(const Context& dev_ctx,
const phi::ExtendedTensor& x,
int col,
phi::SparseCooTensor* out) {
PADDLE_ENFORCE_NOT_NULL(
out,
platform::errors::NotFound(
"Output cannot be found in scope for operator 'Feed'"));
const auto& feed_item = CheckAndGetFeedItem(x, col);
const auto& in_tensor = paddle::get<phi::SparseCooTensor>(feed_item);
const auto& place = dev_ctx.GetPlace();
if (platform::is_same_place(in_tensor.place(), place)) {
*out = in_tensor;
} else {
phi::DenseTensor indices, values;
framework::TensorCopy(in_tensor.indices(), place, dev_ctx, &indices);
framework::TensorCopy(in_tensor.values(), place, dev_ctx, &values);
out->SetMember(indices, values, in_tensor.meta());
}
}
template <typename Context>
void FeedStringsKernel(const Context& dev_ctx,
const phi::ExtendedTensor& x,
int col,
phi::ExtendedTensor* out) {
PADDLE_ENFORCE_NOT_NULL(
out,
platform::errors::NotFound(
"Output cannot be found in scope for operator 'Feed'"));
const auto& feed_item = CheckAndGetFeedItem(x, col);
auto strs_out = static_cast<framework::Strings*>(out);
const auto& in_str = paddle::get<framework::Strings>(feed_item);
strs_out->resize(in_str.size());
*strs_out = in_str;
}
class FeedOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "feed");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "feed");
if (ctx->IsRuntime()) {
framework::Variable* x_var =
PADDLE_GET(framework::Variable*, ctx->GetInputVarPtrs("X")[0]);
auto& x = x_var->Get<framework::FeedList>();
int col = ctx->Attrs().Get<int>("col");
auto& feed_item = x[col];
if (feed_item.index() == 0) {
const auto& feed_item = CheckAndGetFeedItem(x, col);
auto& feed_tensor = PADDLE_GET_CONST(phi::DenseTensor, feed_item);
ctx->SetOutputDim("Out", feed_tensor.dims());
} else if (feed_item.index() == 1) {
auto& feed_str = PADDLE_GET_CONST(framework::Strings, feed_item);
framework::Variable* out_var =
PADDLE_GET(framework::Variable*, ctx->GetOutputVarPtrs("Out")[0]);
out_var->GetMutable<framework::Strings>()->resize(feed_str.size());
} else {
auto& feed_sparse_tensor =
PADDLE_GET_CONST(phi::SparseCooTensor, feed_item);
framework::Variable* out_var =
PADDLE_GET(framework::Variable*, ctx->GetOutputVarPtrs("Out")[0]);
out_var->GetMutable<phi::SparseCooTensor>()->set_meta(
feed_sparse_tensor.meta());
out_var->GetMutable<phi::SparseCooTensor>()->SetCoalesced(
feed_sparse_tensor.coalesced());
out_var->GetMutable<phi::SparseCooTensor>()->SetIndicesDict(
feed_sparse_tensor.GetIndicesDict());
}
}
}
protected:
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return phi::KernelKey(framework::proto::VarType::FP32, ctx.GetPlace());
}
};
......@@ -164,9 +177,152 @@ It should not be configured by users directly.
} // namespace operators
} // namespace paddle
// TODO(YuanRisheng): Maybe we need to design a new registry macro for
// registering device-independent kernels.
REGISTER_OPERATOR(
feed,
paddle::operators::FeedOp,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
paddle::operators::FeedOpInfoMaker);
PD_REGISTER_GENERAL_KERNEL(
feed_dense_tensor,
CPU,
ALL_LAYOUT,
paddle::operators::FeedDenseTensorKernel<phi::CPUContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_sparse_coo_tensor,
CPU,
ALL_LAYOUT,
paddle::operators::FeedSparseCooTensorKernel<phi::CPUContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_strings,
CPU,
ALL_LAYOUT,
paddle::operators::FeedStringsKernel<phi::CPUContext>,
ALL_DTYPE) {}
#if defined(PADDLE_WITH_MKLDNN)
PD_REGISTER_GENERAL_KERNEL(
feed_dense_tensor,
OneDNN,
ALL_LAYOUT,
paddle::operators::FeedDenseTensorKernel<phi::OneDNNContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_sparse_coo_tensor,
OneDNN,
ALL_LAYOUT,
paddle::operators::FeedSparseCooTensorKernel<phi::OneDNNContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_strings,
OneDNN,
ALL_LAYOUT,
paddle::operators::FeedStringsKernel<phi::OneDNNContext>,
ALL_DTYPE) {}
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_REGISTER_GENERAL_KERNEL(
feed_dense_tensor,
GPU,
ALL_LAYOUT,
paddle::operators::FeedDenseTensorKernel<phi::GPUContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_sparse_coo_tensor,
GPU,
ALL_LAYOUT,
paddle::operators::FeedSparseCooTensorKernel<phi::GPUContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_strings,
GPU,
ALL_LAYOUT,
paddle::operators::FeedStringsKernel<phi::GPUContext>,
ALL_DTYPE) {}
#elif defined(PADDLE_WITH_XPU)
PD_REGISTER_GENERAL_KERNEL(
feed_dense_tensor,
XPU,
ALL_LAYOUT,
paddle::operators::FeedDenseTensorKernel<phi::XPUContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_sparse_coo_tensor,
XPU,
ALL_LAYOUT,
paddle::operators::FeedSparseCooTensorKernel<phi::XPUContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_strings,
XPU,
ALL_LAYOUT,
paddle::operators::FeedStringsKernel<phi::XPUContext>,
ALL_DTYPE) {}
#elif defined(PADDLE_WITH_ASCEND_CL)
PD_REGISTER_GENERAL_KERNEL(
feed_dense_tensor,
npu,
ALL_LAYOUT,
paddle::operators::FeedDenseTensorKernel<phi::CustomContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_sparse_coo_tensor,
npu,
ALL_LAYOUT,
paddle::operators::FeedSparseCooTensorKernel<phi::CustomContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_strings,
npu,
ALL_LAYOUT,
paddle::operators::FeedStringsKernel<phi::CustomContext>,
ALL_DTYPE) {}
#elif defined(PADDLE_WITH_MLU)
PD_REGISTER_GENERAL_KERNEL(
feed_dense_tensor,
CustomMLU,
ALL_LAYOUT,
paddle::operators::FeedDenseTensorKernel<phi::CustomContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_sparse_coo_tensor,
CustomMLU,
ALL_LAYOUT,
paddle::operators::FeedSparseCooTensorKernel<phi::CustomContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_strings,
CustomMLU,
ALL_LAYOUT,
paddle::operators::FeedStringsKernel<phi::CustomContext>,
ALL_DTYPE) {}
#elif defined(PADDLE_WITH_CUSTOM_DEVICE)
PD_REGISTER_GENERAL_KERNEL(
feed_dense_tensor,
custom_cpu,
ALL_LAYOUT,
paddle::operators::FeedDenseTensorKernel<phi::CustomContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_sparse_coo_tensor,
custom_cpu,
ALL_LAYOUT,
paddle::operators::FeedSparseCooTensorKernel<phi::CustomContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_strings,
custom_cpu,
ALL_LAYOUT,
paddle::operators::FeedStringsKernel<phi::CustomContext>,
ALL_DTYPE) {}
#endif
......@@ -33,6 +33,27 @@ class CumOp : public framework::OperatorWithKernel {
}
};
class CumGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "cumsum");
OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")),
"Input",
"Out@GRAD",
"cumsum");
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
}
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto input_data_type =
framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
return phi::KernelKey(input_data_type, ctx.GetPlace());
}
};
class CumsumOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
......@@ -69,12 +90,13 @@ class CumsumGradMaker : public framework::SingleGradOpMaker<T> {
protected:
void Apply(GradOpPtr<T> grad_op) const override {
grad_op->SetType("cumsum");
grad_op->SetInput("X", this->OutputGrad("Out"));
grad_op->SetOutput("Out", this->InputGrad("X"));
grad_op->SetType("cumsum_grad");
grad_op->SetInput("X", this->Input("X"));
grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
grad_op->SetAttrMap(this->Attrs());
grad_op->SetAttr("reverse",
!PADDLE_GET_CONST(bool, this->GetAttr("reverse")));
PADDLE_GET_CONST(bool, this->GetAttr("reverse")));
}
};
......@@ -153,6 +175,7 @@ using CPU = phi::CPUContext;
DECLARE_INFER_SHAPE_FUNCTOR(cumsum,
CumsumInferShapeFunctor,
PD_INFER_META(phi::CumScalarAxisInferMeta));
DECLARE_INFER_SHAPE_FUNCTOR(logcumsumexp,
LogcumsumexpInferShapeFunctor,
PD_INFER_META(phi::CumInferMeta));
......@@ -169,6 +192,7 @@ REGISTER_OPERATOR(logcumsumexp,
ops::LogcumsumexpGradMaker<paddle::imperative::OpBase>,
LogcumsumexpInferShapeFunctor);
REGISTER_OPERATOR(logcumsumexp_grad, ops::LogcumsumexpGradOp);
REGISTER_OPERATOR(cumsum_grad, ops::CumGradOp);
REGISTER_OP_VERSION(cumsum).AddCheckpoint(
R"ROC(
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/backward.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
namespace operators {
class SlogDeterminantOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "determinant");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "determinant");
}
};
class SlogDeterminantOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Input", "(Tensor) The input tensor of SlogDeterminant.");
AddOutput("Out",
"(Tensor) The output tensor containing the sign of the"
"determinant and the natural logarithm"
"of the absolute value of determinant,");
AddComment(R"DOC(
SlogDeterminant Operator.)DOC");
}
};
class SlogDeterminantGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(
ctx->HasInput("Input"), "Input", "Input", "SlogDeterminantGradOp");
OP_INOUT_CHECK(
ctx->HasInput("Out"), "Input", "Out", "SlogDeterminantGradOp");
OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")),
"Input",
framework::GradVarName("Out"),
"SlogDeterminantGradOp");
OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Input")),
"Output",
framework::GradVarName("Input"),
"SlogDeterminantGradOp");
ctx->SetOutputDim(framework::GradVarName("Input"),
ctx->GetInputDim("Input"));
}
protected:
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(
ctx, framework::GradVarName("Out")),
ctx.GetPlace());
}
};
template <typename T>
class SlogDeterminantGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> grad_op) const override {
grad_op->SetType("slogdeterminant_grad");
grad_op->SetInput("Input", this->Input("Input"));
grad_op->SetInput("Out", this->Output("Out"));
grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("Input"),
this->InputGrad("Input"));
grad_op->SetAttrMap(this->Attrs());
}
};
DECLARE_NO_NEED_BUFFER_VARS_INFERER(SlogDeterminantGradNoNeedBufferVarsInferer,
"Input");
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
DECLARE_INFER_SHAPE_FUNCTOR(slogdeterminant,
SlogDeterminantInferShapeFunctor,
PD_INFER_META(phi::UnchangedInferMeta));
REGISTER_OPERATOR(slogdeterminant,
ops::SlogDeterminantOp,
ops::SlogDeterminantOpMaker,
ops::SlogDeterminantGradOpMaker<paddle::framework::OpDesc>,
ops::SlogDeterminantGradOpMaker<paddle::imperative::OpBase>,
SlogDeterminantInferShapeFunctor);
DECLARE_INFER_SHAPE_FUNCTOR(slogdeterminant_grad,
SlogDeterminantGradInferShapeFunctor,
PD_INFER_META(phi::GeneralUnaryGradInferMeta));
REGISTER_OPERATOR(slogdeterminant_grad,
ops::SlogDeterminantGradOp,
SlogDeterminantGradInferShapeFunctor) // reuse det grad op
......@@ -19,6 +19,9 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/prim/api/manual/backward/composite_backward_api.h"
#include "paddle/fluid/prim/utils/static/composite_grad_desc_maker.h"
#include "paddle/fluid/prim/utils/static/desc_tensor.h"
namespace paddle {
namespace operators {
......@@ -63,6 +66,34 @@ class ElementwiseMulOpGradMaker : public framework::SingleGradOpMaker<T> {
}
};
class ElementwiseMulGradCompositeOpMaker
: public prim::GradCompositeOpMakerBase {
using prim::GradCompositeOpMakerBase::GradCompositeOpMakerBase;
public:
void Apply() override {
auto x = this->GetSingleForwardInput("X");
auto y = this->GetSingleForwardInput("Y");
auto out_grad = this->GetSingleOutputGrad("Out");
auto x_grad = this->GetSingleInputGrad("X");
auto x_grad_p = this->GetOutputPtr(&x_grad);
auto x_grad_name = this->GetOutputName(x_grad);
auto y_grad = this->GetSingleInputGrad("Y");
auto y_grad_p = this->GetOutputPtr(&y_grad);
auto y_grad_name = this->GetOutputName(y_grad);
prim::multiply_grad<prim::DescTensor>(
x,
y,
out_grad,
static_cast<int>(this->Attr<int>("axis")),
x_grad_p,
y_grad_p);
VLOG(3) << "Runing mul_grad composite func";
this->RecoverOutputName(x_grad, x_grad_name);
this->RecoverOutputName(y_grad, y_grad_name);
}
};
template <typename T>
class ElementwiseMulDoubleGradMaker : public framework::SingleGradOpMaker<T> {
public:
......@@ -123,7 +154,8 @@ REGISTER_OPERATOR(elementwise_mul,
ops::ElementwiseMulOpMaker,
ops::ElementwiseOpInferVarType,
ops::ElementwiseMulOpGradMaker<paddle::framework::OpDesc>,
ops::ElementwiseMulOpGradMaker<paddle::imperative::OpBase>);
ops::ElementwiseMulOpGradMaker<paddle::imperative::OpBase>,
ops::ElementwiseMulGradCompositeOpMaker);
REGISTER_OPERATOR(
elementwise_mul_grad,
ops::ElementwiseOpGrad,
......
......@@ -20,6 +20,9 @@ limitations under the License. */
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/prim/api/manual/backward/composite_backward_api.h"
#include "paddle/fluid/prim/utils/static/composite_grad_desc_maker.h"
#include "paddle/fluid/prim/utils/static/desc_tensor.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
......@@ -190,6 +193,24 @@ class ExpandV2GradOpMaker : public framework::SingleGradOpMaker<T> {
}
};
class ExpandV2GradCompositeOpMaker : public prim::GradCompositeOpMakerBase {
using prim::GradCompositeOpMakerBase::GradCompositeOpMakerBase;
public:
void Apply() override {
auto x = this->GetSingleForwardInput("X");
auto out_grad = this->GetSingleOutputGrad("Out");
auto x_grad = this->GetSingleInputGrad("X");
auto x_grad_p = this->GetOutputPtr(&x_grad);
auto x_grad_name = this->GetOutputName(x_grad);
auto shape = this->Attr<std::vector<int>>("shape");
prim::expand_grad<prim::DescTensor>(
x, out_grad, paddle::experimental::IntArray(shape), x_grad_p);
VLOG(3) << "Runing expand_v2 composite func";
this->RecoverOutputName(x_grad, x_grad_name);
}
};
template <typename T>
class ExpandV2DoubleGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
......@@ -223,6 +244,7 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR(expand_v2,
ops::ExpandV2Op,
ops::ExpandV2OpMaker,
ops::ExpandV2GradCompositeOpMaker,
ops::ExpandV2GradOpMaker<paddle::framework::OpDesc>,
ops::ExpandV2GradOpMaker<paddle::imperative::OpBase>,
ExpandInferShapeFunctor);
......
......@@ -256,6 +256,16 @@ def replace_compat_name(op_fluid_map_list, forward_op_dict, backward_op_dict):
op_item['no_need_buffer'] = get_param_list_alias(
op_item['no_need_buffer'], args_map
)
if 'data_transform' in op_item and op_item['data_transform']:
data_trans_item = op_item['data_transform']
if 'skip_transform' in data_trans_item:
data_trans_item['skip_transform'] = get_param_list_alias(
data_trans_item['skip_transform'], args_map
)
if 'support_trans_dtype' in data_trans_item:
data_trans_item['support_trans_dtype'] = get_param_list_alias(
data_trans_item['support_trans_dtype'], args_map
)
process_scalar(op_item, scalar_configs)
process_int_array(op_item, int_array_configs)
......
......@@ -427,7 +427,41 @@ def parse_op_entry(op_entry: Dict[str, Any], name_field="op"):
else:
no_buffer_args = None
# TODO(chenfeiyu): data_transform
# add data_transform tag for every input.
# the format is {data_transform : {skip_transform : [x, z], support_trans_dtype : y}}
for input in inputs:
input["data_transform"] = {}
if "data_transform" in op_entry:
skip_trans_args = []
support_trans_args = []
data_trans = op_entry["data_transform"]
if "skip_transform" in data_trans:
skip_trans_args = parse_plain_list(data_trans["skip_transform"])
for name in skip_trans_args:
assert (
name in input_names
), f"{op_name} has an skip_transform input: '{name}' which is not an input."
data_trans["skip_transform"] = skip_trans_args
if "support_trans_dtype" in data_trans:
support_trans_args = parse_plain_list(
data_trans["support_trans_dtype"]
)
for name in support_trans_args:
assert (
name in input_names
), f"{op_name} has an support_trans_dtype input: '{name}' which is not an input."
data_trans["support_trans_dtype"] = support_trans_args
for input in inputs:
if input["name"] in skip_trans_args:
input["data_transform"]["skip_trans_args"] = True
else:
input["data_transform"]["skip_trans_args"] = False
if input["name"] in support_trans_args:
input["data_transform"]["support_trans_dtype"] = True
else:
input["data_transform"]["support_trans_dtype"] = False
else:
data_trans = None
op = {
"name": op_name,
......@@ -435,6 +469,7 @@ def parse_op_entry(op_entry: Dict[str, Any], name_field="op"):
"attrs": attrs,
"outputs": outputs,
"no_need_buffer": no_buffer_args,
"data_transform": data_trans,
}
# invokes another op ?
......
......@@ -117,6 +117,15 @@ static_cast<int>(phi::Place({{"phi::" if not default_value is initializer_list}}
{# --------------------------------------- name mapping ---------------------------------------------- #}
{% macro name_map(op) %}
/*
******************************************************************
NOTE: The following codes are for 'get_compat_kernel_signature.py'
All possible KernelSignatures returned by {{op["name"] | to_pascal_case }}OpArgumentMapping:
{{op | cartesian_prod_mapping}}
******************************************************************
*/
KernelSignature {{op["op_name"] | to_pascal_case }}OpArgumentMapping(const ArgumentMappingContext& ctx) {
{% set kernel_args = op["kernel"]["param"] %}
{{get_input_list(op["inputs"], kernel_args)}};
......@@ -136,15 +145,6 @@ KernelSignature {{op["op_name"] | to_pascal_case }}OpArgumentMapping(const Argum
return sig;
{%endif%}
}
/*
******************************************************************
NOTE: The following codes are for 'get_compat_kernel_signature.py'
All possible KernelSignatures returned by {{op["name"] | to_pascal_case }}OpArgumentMapping:
{{op | cartesian_prod_mapping}}
******************************************************************
*/
{% endmacro %}
{% macro get_kernel_dispatch(inputs, kernel_config) %}{# inline #}
......@@ -172,6 +172,15 @@ ctx.IsSparseCsrTensorInput("{{input["name"]}}"){{" && " if not loop.last}}
{%- endmacro %}
{% macro sparse_op_name_map(op) %}
/*
******************************************************************
NOTE: The following codes are for 'get_compat_kernel_signature.py'
All possible KernelSignatures returned by {{op["name"] | to_pascal_case }}OpArgumentMapping:
{{op | cartesian_prod_mapping}}
******************************************************************
*/
KernelSignature {{op["op_name"] | to_pascal_case }}OpArgumentMapping(const ArgumentMappingContext& ctx) {
{% set kernel_args = op["kernel"]["param"] %}
{{get_input_list(op["inputs"], kernel_args)}};
......@@ -188,15 +197,6 @@ KernelSignature {{op["op_name"] | to_pascal_case }}OpArgumentMapping(const Argum
KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs));
return sig;
}
/*
******************************************************************
NOTE: The following codes are for 'get_compat_kernel_signature.py'
All possible KernelSignatures returned by {{op["name"] | to_pascal_case }}OpArgumentMapping:
{{op | cartesian_prod_mapping}}
******************************************************************
*/
{% endmacro %}
{% macro register_base_kernel_name(op) %}
......@@ -284,6 +284,32 @@ phi::KernelKey GetExpectedKernelType(
}
{% endmacro %}
{% macro get_kernel_for_var(op) %} {# only for data_transform #}
{% set skip_args = op["data_transform"]["skip_transform"] %}
{% set var_name = "var_name" %}
{% set skip_args_len = skip_args | length %}
phi::KernelKey GetKernelTypeForVar(
const std::string& {{var_name}},
const phi::DenseTensor& tensor,
const phi::KernelKey& expected_kernel_type) const override {
if (
{%- for skip_arg in skip_args -%}
var_name == "{{ skip_arg }}"
{%- if skip_args_len != 1 and loop.index != skip_args_len %} || {% endif -%}
{%- endfor -%}
){
return phi::KernelKey(phi::Backend::ALL_BACKEND,
expected_kernel_type.layout(),
expected_kernel_type.dtype());
}
else{
return phi::KernelKey(
tensor.place(), tensor.layout(), expected_kernel_type.dtype());
}
}
{% endmacro %}
{# --------------------------------------- operator ---------------------------------------------- #}
{% macro operator(op) %}
class {{op["op_name"] | to_pascal_case}}Op : public framework::OperatorWithKernel {
......@@ -296,6 +322,14 @@ class {{op["op_name"] | to_pascal_case}}Op : public framework::OperatorWithKerne
{% filter indent(2, True)%}
{{get_expected_kernel(op)}}
{% endfilter %}
{%- if "data_transform" in op and op["data_transform"] is not none -%}
{%- if "skip_transform" in op["data_transform"] -%}
{% filter indent(2, True) %}
{{get_kernel_for_var(op)}}
{% endfilter %}
{%- endif %}
{%- endif -%}
{# TODO(lizhiyu): add the 'support_trans_dtype' #}
{% endif %}
};
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/ternary.h"
namespace paddle {
namespace operators {
class GraphSendRecvOP : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"),
ctx.device_context().GetPlace());
}
};
class GraphSendRecvGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
auto in_dims = ctx->GetInputDim("X");
ctx->SetOutputDim(framework::GradVarName("X"), in_dims);
}
protected:
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(
ctx, framework::GradVarName("Out")),
ctx.device_context().GetPlace());
}
};
class GraphSendRecvOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"The input tensor with data type float32, float64, int32, int64.");
AddInput("Src_index", "The source index tensor.");
AddInput("Dst_index", "The destination index tensor.");
AddInput("Out_size",
"(Tensor<int>, optional). The 0th dimension of the output."
"It has a higher priority than Attr(out_size).")
.AsDispensable();
AddOutput("Out", "Output tensor of graph_send_recv op.");
AddOutput("Dst_count",
"Count tensor of Dst_index, mainly for MEAN reduce_op.")
.AsIntermediate();
AddAttr<std::string>("reduce_op",
"(string, default 'SUM')"
"Define different pool types to receive the result "
"tensors of Dst_index.")
.SetDefault("SUM")
.InEnum({"SUM", "MEAN", "MIN", "MAX"});
AddAttr<std::vector<int64_t>>(
"out_size",
"(vector<int64_t>, default {0})"
"Define the first dimension of Output tensor."
"If set default {0}, then the shape of Out is the same with X.")
.SetDefault({0});
AddComment(R"DOC(
Graph Learning Send_Recv combine operator.
$Out = Recv(Send(X, Src_index), Dst_index, reduce_op)$
This operator is mainly used in the Graph Learning domain, and its main purpose is to reduce
intermediate memory consumption in the process of message passing.
Taking `x` as the input tensor, we first use `src_index` to gather the corresponding data,
and then use `dst_index` to update the corresponding positions of the output tensor with different
pooling types, like sum, mean, max, or min.
)DOC");
}
};
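// Editor's illustrative sketch (not part of this operator's kernels): a naive
// CPU reference of the "SUM" reduce_op described in the comment above. It
// assumes 1-D float node features, int64 index tensors of equal length, and a
// zero-initialized output that is already sized; the real kernels also cover
// higher-rank features and the MEAN/MIN/MAX reduce types.
//
//   void NaiveSendRecvSum(const std::vector<float>& x,
//                         const std::vector<int64_t>& src_index,
//                         const std::vector<int64_t>& dst_index,
//                         std::vector<float>* out) {
//     for (size_t i = 0; i < src_index.size(); ++i) {
//       // gather from the source node, then scatter-add into the destination
//       (*out)[dst_index[i]] += x[src_index[i]];
//     }
//   }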
template <typename T>
class GraphSendRecvGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("graph_send_recv_grad");
op->SetInput("Src_index", this->Input("Src_index"));
op->SetInput("Dst_index", this->Input("Dst_index"));
op->SetInput("X", this->Input("X"));
if (PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MEAN") {
op->SetInput("Dst_count", this->Output("Dst_count"));
}
if (PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MIN" ||
PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MAX") {
op->SetInput("Out", this->Output("Out"));
}
op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
op->SetAttrMap(this->Attrs());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(graph_send_recv,
GraphSendRecvInferShapeFunctor,
PD_INFER_META(phi::SendURecvInferMeta));
REGISTER_OPERATOR(graph_send_recv,
ops::GraphSendRecvOP,
ops::GraphSendRecvOpMaker,
ops::GraphSendRecvGradOpMaker<paddle::framework::OpDesc>,
ops::GraphSendRecvGradOpMaker<paddle::imperative::OpBase>,
GraphSendRecvInferShapeFunctor);
REGISTER_OPERATOR(graph_send_recv_grad, ops::GraphSendRecvGradOp);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/multiary.h"
namespace paddle {
namespace operators {
class GraphSendUERecvOP : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"),
ctx.device_context().GetPlace());
}
};
class GraphSendUERecvGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
auto in_dims = ctx->GetInputDim("X");
ctx->SetOutputDim(framework::GradVarName("X"), in_dims);
auto y_dims = ctx->GetInputDim("Y");
ctx->SetOutputDim(framework::GradVarName("Y"), y_dims);
}
protected:
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(
ctx, framework::GradVarName("Out")),
ctx.device_context().GetPlace());
}
};
class GraphSendUERecvOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"The input tensor with data type float32, float64, int32, int64.");
AddInput("Y",
"The input edge weight tensor, data type should be same with X");
AddInput("Src_index", "The source index tensor.");
AddInput("Dst_index", "The destination index tensor.");
AddInput("Out_size",
"(Tensor<int>, optional). The 0th dimension of the output."
"It has a higher priority than Attr(out_size).")
.AsDispensable();
AddOutput("Out", "Output tensor of graph_send_ue_recv op.");
AddOutput("Dst_count",
"Count tensor of Dst_index, mainly for MEAN reduce_op.")
.AsIntermediate();
AddAttr<std::string>("message_op",
"(string, default 'ADD')"
"Define differenct computation types between X and E.")
.SetDefault("ADD")
.InEnum({"ADD", "MUL"});
AddAttr<std::string>("reduce_op",
"(string, default 'SUM')"
"Define different pool types to receive the result "
"tensors of Dst_index.")
.SetDefault("SUM")
.InEnum({"SUM", "MEAN", "MIN", "MAX"});
AddAttr<std::vector<int64_t>>(
"out_size",
"(vector<int64_t>, default {0})"
"Define the first dimension of Output tensor."
"If set default {0}, then the shape of Out is the same with X.")
.SetDefault({0});
AddComment(R"DOC(
Graph Learning Send_UE_Recv combine operator.
$Out = Recv(Compute(Send(X, Src_index), Y, message_op), Dst_index, reduce_op)$
This operator is mainly used in the Graph Learning domain, and its main purpose is to reduce
intermediate memory consumption in the process of message passing.
Taking `X` as the input tensor, we first use `src_index` to gather the corresponding data.
The gathered data is then combined with `Y` through different message_ops, like add, sub, mul, and div,
and the result is scattered with `dst_index` into the corresponding positions of the output
tensor using different pooling types, like sum, mean, max, or min.
)DOC");
}
};
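// Editor's illustrative sketch (not part of this operator's kernels): compared
// with graph_send_recv, the gathered node feature is first combined with the
// per-edge weight Y through message_op before the reduce. Assuming 1-D float
// data, int64 indices, message_op == "MUL" and reduce_op == "SUM" with a
// zero-initialized output:
//
//   void NaiveSendUERecvMulSum(const std::vector<float>& x,
//                              const std::vector<float>& y,
//                              const std::vector<int64_t>& src_index,
//                              const std::vector<int64_t>& dst_index,
//                              std::vector<float>* out) {
//     for (size_t i = 0; i < src_index.size(); ++i) {
//       const float message = x[src_index[i]] * y[i];  // message_op: MUL
//       (*out)[dst_index[i]] += message;               // reduce_op: SUM
//     }
//   }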
template <typename T>
class GraphSendUERecvGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("graph_send_ue_recv_grad");
op->SetInput("X", this->Input("X"));
op->SetInput("Y", this->Input("Y"));
op->SetInput("Src_index", this->Input("Src_index"));
op->SetInput("Dst_index", this->Input("Dst_index"));
if (PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MEAN") {
op->SetInput("Dst_count", this->Output("Dst_count"));
}
if (PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MIN" ||
PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MAX") {
op->SetInput("Out", this->Output("Out"));
}
op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
op->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y"));
op->SetAttrMap(this->Attrs());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(graph_send_ue_recv,
GraphSendUERecvInferShapeFunctor,
PD_INFER_META(phi::SendUERecvInferMeta));
REGISTER_OPERATOR(graph_send_ue_recv,
ops::GraphSendUERecvOP,
ops::GraphSendUERecvOpMaker,
ops::GraphSendUERecvGradOpMaker<paddle::framework::OpDesc>,
ops::GraphSendUERecvGradOpMaker<paddle::imperative::OpBase>,
GraphSendUERecvInferShapeFunctor);
REGISTER_OPERATOR(graph_send_ue_recv_grad, ops::GraphSendUERecvGradOp);
......@@ -84,7 +84,7 @@ class ReduceSumCompositeGradOpMaker : public prim::GradCompositeOpMakerBase {
// get output original name
std::string x_grad_name = this->GetOutputName(x_grad_t);
VLOG(3) << "Running sum_grad composite func";
// call composite backward func
prim::sum_grad<prim::DescTensor>(
x, out_grad, axis, keep_dim, reduce_all, x_grad);
......
......@@ -95,7 +95,8 @@ class SelectOutputInferShape : public framework::InferShapeBase {
void operator()(framework::InferShapeContext *context) const override {
OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "SelectOutput");
OP_INOUT_CHECK(context->HasInput("Mask"), "Input", "Mask", "SelectOutput");
OP_INOUT_CHECK(context->HasOutputs("Out"), "Output", "Out", "SelectOutput");
OP_INOUT_CHECK(
context->HasOutputs("Out", true), "Output", "Out", "SelectOutput");
}
};
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
namespace operators {
class SizeOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto dtype = framework::proto::VarType::FP32; // dtype is not important
return phi::KernelKey(dtype, ctx.GetPlace());
}
phi::KernelKey GetKernelTypeForVar(
const std::string& var_name,
const phi::DenseTensor& tensor,
const phi::KernelKey& expected_kernel_type) const override {
return phi::KernelKey(phi::Backend::ALL_BACKEND,
expected_kernel_type.layout(),
expected_kernel_type.dtype());
}
};
class SizeOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Input", "The input tensor.");
AddOutput("Out",
"The returned tensor, the data type "
"is int64_t, will be on the same device with the input Tensor.");
AddComment(R"DOC(
Size Operator.
Return the number of elements in the input.
)DOC");
}
};
DECLARE_NO_NEED_BUFFER_VARS_INFERER(SizeOpNoNeedBufferVarInferer, "Input");
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(size,
SizeInferShapeFunctor,
PD_INFER_META(phi::NumelInferMeta));
REGISTER_OPERATOR(
size,
ops::SizeOp,
ops::SizeOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
SizeInferShapeFunctor,
ops::SizeOpNoNeedBufferVarInferer);
......@@ -407,8 +407,8 @@ int BertTokenizer::Encode(
void BertTokenizer::BatchEncode(
vector<unordered_map<string, vector<int64_t>>>* batch_encode_inputs,
const vector<string>& batch_text,
const vector<string>& batch_text_pair /* = vector<string>() */,
const framework::Strings& batch_text,
const framework::Strings& batch_text_pair /* = vector<string>() */,
bool is_split_into_words /* = false */,
const size_t max_seq_len /* = 0 */,
bool pad_to_max_seq_len /* = false */) const {
......
......@@ -100,8 +100,8 @@ class BertTokenizer {
bool pad_to_max_seq_len = false) const;
void BatchEncode(
vector<unordered_map<string, vector<int64_t>>>* batch_encode_inputs,
const vector<string>& batch_text,
const vector<string>& batch_text_pair = vector<string>(),
const framework::Strings& batch_text,
const framework::Strings& batch_text_pair = framework::Strings(),
bool is_split_into_words = false,
const size_t max_seq_len = 0,
bool pad_to_max_seq_len = false) const;
......@@ -162,7 +162,7 @@ class FasterTokenizerKernel : public framework::OpKernel<T> {
} else {
tokenizer.BatchEncode(&batch_encode_inputs,
*text,
vector<string>(),
framework::Strings(),
is_split_into_words,
max_seq_len,
pad_to_max_seq_len);
......
......@@ -18,6 +18,7 @@
#include "paddle/phi/backends/all_context.h"
DECLARE_bool(use_stream_safe_cuda_allocator);
DECLARE_bool(new_executor_use_cuda_graph);
namespace paddle {
namespace platform {
......@@ -43,7 +44,10 @@ void BeginCUDAGraphCapture(phi::GPUPlace place,
auto stream = dev_ctx->stream();
CUDAGraph::BeginCapture(place, stream, mode);
auto old_value = FLAGS_use_stream_safe_cuda_allocator;
// When using CUDA graph in the new executor, fast GC must be used, so
// FLAGS_use_stream_safe_cuda_allocator must stay true and is not disabled here.
auto old_value = FLAGS_use_stream_safe_cuda_allocator &&
!FLAGS_new_executor_use_cuda_graph;
if (old_value) {
FLAGS_use_stream_safe_cuda_allocator = false;
}
......
......@@ -20,6 +20,7 @@ namespace platform {
void CudaProfilerInit(const std::string& output_file,
const std::string& output_mode,
const std::string& config_file) {
#if CUDA_VERSION < 11000
PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv",
platform::errors::InvalidArgument(
"Unsupported cuda profiler output mode, expect `kvp` or "
......@@ -28,6 +29,7 @@ void CudaProfilerInit(const std::string& output_file,
cudaOutputMode_t mode = output_mode == "csv" ? cudaCSV : cudaKeyValuePair;
PADDLE_ENFORCE_GPU_SUCCESS(
cudaProfilerInitialize(config_file.c_str(), output_file.c_str(), mode));
#endif
}
void CudaProfilerStart() { PADDLE_ENFORCE_GPU_SUCCESS(cudaProfilerStart()); }
......
......@@ -183,6 +183,7 @@ class XPUDeviceContext : public phi::XPUContext {
virtual ~XPUDeviceContext();
Eigen::DefaultDevice* eigen_device() const { return nullptr; }
xpuStream stream() const { return XPUContext::x_context()->xpu_stream; }
void CreateStream() { XPUContext::CreateStream(); }
};
template <>
......
generated/prim_api/eager_prim_api.cc
generated/prim_api/tmp_eager_prim_api.cc
generated/prim_api/*.h
add_subdirectory(auto_code_generated)
add_subdirectory(manual)
add_subdirectory(generated)
if(NOT (NOT WITH_PYTHON AND ON_INFER))
cc_library(
prim_api
......
......@@ -13,6 +13,6 @@
// limitations under the License.
#pragma once
#include "paddle/fluid/prim/api/generated/prim_api/prim_api.h"
#include "paddle/fluid/prim/api/manual/backward/composite_backward_api.h"
#include "paddle/fluid/prim/api/manual/prim_api/prim_api.h"
#include "paddle/fluid/prim/api/manual/utils/utils.h"
set(api_yaml_path
"${PADDLE_SOURCE_DIR}/paddle/fluid/operators/generator/parsed_ops/ops.parsed.yaml"
)
set(legacy_api_yaml_path
"${PADDLE_SOURCE_DIR}/paddle/fluid/operators/generator/parsed_ops/legacy_ops.parsed.yaml"
)
set(tmp_eager_prim_api_cc_path
"${PADDLE_SOURCE_DIR}/paddle/fluid/prim/api/generated/prim_api/tmp_eager_prim_api.cc"
)
set(tmp_prim_api_h_path
"${PADDLE_SOURCE_DIR}/paddle/fluid/prim/api/generated/prim_api/tmp_prim_api.h"
)
set(eager_prim_api_cc_path
"${PADDLE_SOURCE_DIR}/paddle/fluid/prim/api/generated/prim_api/eager_prim_api.cc"
)
set(prim_api_h_path
"${PADDLE_SOURCE_DIR}/paddle/fluid/prim/api/generated/prim_api/prim_api.h")
set(prim_api_gen_file
${PADDLE_SOURCE_DIR}/paddle/fluid/prim/api/auto_code_generated/prim_gen.py)
message("prim api Code gen")
execute_process(
WORKING_DIRECTORY
${CMAKE_SOURCE_DIR}/paddle/fluid/prim/api/auto_code_generated
COMMAND
${PYTHON_EXECUTABLE} ${prim_api_gen_file} --api_yaml_path
${legacy_api_yaml_path} ${api_yaml_path} --prim_api_header_path
${tmp_prim_api_h_path} --eager_prim_api_source_path
${tmp_eager_prim_api_cc_path}
RESULT_VARIABLE _result)
if(${_result})
message(FATAL_ERROR "prim api genrate failed, exiting.")
endif()
execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different
${tmp_prim_api_h_path} ${prim_api_h_path})
execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different
${tmp_eager_prim_api_cc_path} ${eager_prim_api_cc_path})
message("copy tmp_xxx_prim_api to xxx_prim_api")
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# prim api list
white_ops_list = [
"pow",
"scale",
"multiply",
"unsqueeze",
"expand",
"full",
"reshape",
"divide",
"sum",
"exp",
]
inplace_out_type_map = {
"Tensor": "Tensor&",
"std::vector<Tensor>": "std::vector<Tensor>&",
}
inplace_optional_out_type_map = {
"Tensor": "paddle::optional<Tensor>&",
"std::vector<Tensor>": "paddle::optional<std::vector<Tensor>>&",
}
class BaseAPI:
def __init__(self, api_item_yaml):
# self.api = api_item_yaml['op']
self.api = api_item_yaml['name']
self.is_prim_api = False
if api_item_yaml['name'] in white_ops_list:
self.is_prim_api = True
#######################################
# inputs:
# names : [], list of input names
# input_info : {input_name : type}
# attrs:
# names : [], list of attribute names
# attr_info : { attr_name : (type, default_values)}
# outputs:
# names : [], list of output names
# types : [], list of output types
# out_size_expr : [], expression for getting size of vector<Tensor>
########################################
if self.is_prim_api:
(
self.inputs,
self.attrs,
self.outputs,
self.optional_vars,
) = self.parse_args(self.api, api_item_yaml)
self.inplace_map = api_item_yaml['inplace']
def get_api_func_name(self):
return self.api
# def is_inplace(self):
# if self.inplace_map
# return True
# return False
def get_input_tensor_args(self, inplace_flag=False):
input_args = []
inplace_type_map = {
"const Tensor&": "Tensor&",
"const paddle::optional<Tensor>&": "paddle::optional<Tensor>&",
"const std::vector<Tensor>&": "std::vector<Tensor>&",
"const paddle::optional<std::vector<Tensor>>&": "paddle::optional<std::vector<Tensor>>&",
}
for name in self.inputs['names']:
name = name.split('@')[0]
if inplace_flag and name in self.inplace_map.values():
input_args.append(
inplace_type_map[self.inputs['input_info'][name]]
+ ' '
+ name
)
else:
input_args.append(self.inputs['input_info'][name] + ' ' + name)
return input_args
def get_declare_args(self, inplace_flag=False):
declare_args = self.get_input_tensor_args(inplace_flag)
for name in self.attrs['names']:
default_value = ''
if self.attrs['attr_info'][name][1] is not None:
default_value = ' = ' + self.attrs['attr_info'][name][1]
declare_args.append(
self.attrs['attr_info'][name][0] + ' ' + name + default_value
)
return ", ".join(declare_args)
def get_declare_args_nodefault(self, inplace_flag=False):
declare_args = self.get_input_tensor_args(inplace_flag)
for name in self.attrs['names']:
declare_args.append(self.attrs['attr_info'][name][0] + ' ' + name)
return ", ".join(declare_args)
def get_return_type(self, inplace_flag=False):
out_type_list = []
for i, out_type in enumerate(self.outputs['types']):
out_name = self.outputs['names'][i].split('@')[0]
if inplace_flag and out_name in self.inplace_map:
if self.inplace_map[out_name] in self.optional_vars:
out_type_list.append(
inplace_optional_out_type_map[out_type]
)
else:
out_type_list.append(inplace_out_type_map[out_type])
else:
out_type_list.append(out_type)
if len(out_type_list) == 1:
return out_type_list[0]
else:
return "std::tuple<" + ", ".join(out_type_list) + ">"
def parse_args(self, api_name, api_item_yaml):
optional_vars = []
for input_dict in api_item_yaml['inputs']:
if input_dict['optional']:
optional_vars.append(input_dict['name'])
inputs, attrs = self.parse_input_and_attr(
api_item_yaml['inputs'], api_item_yaml['attrs']
)
output_type_list, output_names, out_size_expr = self.parse_output(
api_item_yaml['outputs']
)
return (
inputs,
attrs,
{
'names': output_names,
'types': output_type_list,
'out_size_expr': out_size_expr,
},
optional_vars,
)
def parse_input_and_attr(self, inputs_list, attrs_list):
input_types_map = {
'Tensor': 'const Tensor&',
'Tensor[]': 'const std::vector<Tensor>&',
}
attr_types_map = {
'IntArray': 'const IntArray&',
'Scalar': 'const Scalar&',
'Scalar(int)': 'const Scalar&',
'Scalar(int64_t)': 'const Scalar&',
'Scalar(float)': 'const Scalar&',
'Scalar(dobule)': 'const Scalar&',
'Scalar[]': 'const std::vector<phi::Scalar>&',
'int': 'int',
'int32_t': 'int32_t',
'int64_t': 'int64_t',
'long': 'long',
'size_t': 'size_t',
'float': 'float',
'float[]': 'const std::vector<float>&',
'double': 'double',
'bool': 'bool',
'bool[]': 'const std::vector<bool>&',
'str': 'const std::string&',
'str[]': 'const std::vector<std::string>&',
'Place': 'const Place&',
'DataLayout': 'DataLayout',
'DataType': 'DataType',
'int64_t[]': 'const std::vector<int64_t>&',
'int[]': 'const std::vector<int>&',
}
optional_types_trans = {
'Tensor': 'const paddle::optional<Tensor>&',
'Tensor[]': 'const paddle::optional<std::vector<Tensor>>&',
'int': 'paddle::optional<int>',
'int32_t': 'paddle::optional<int32_t>',
'int64_t': 'paddle::optional<int64_t>',
'float': 'paddle::optional<float>',
'double': 'paddle::optional<double>',
'bool': 'paddle::optional<bool>',
'Place': 'paddle::optional<const Place&>',
'DataLayout': 'paddle::optional<DataLayout>',
'DataType': 'paddle::optional<DataType>',
}
inputs = {'names': [], 'input_info': {}}
for input_dict in inputs_list:
inputs['names'].append(input_dict['name'])
if input_dict['optional']:
inputs['input_info'][input_dict['name']] = optional_types_trans[
input_dict['typename']
]
else:
inputs['input_info'][input_dict['name']] = input_types_map[
input_dict['typename']
]
attrs = {'names': [], 'attr_info': {}}
for attr_dict in attrs_list:
attrs['names'].append(attr_dict['name'])
if 'default_value' in attr_dict.keys():
default_value = attr_dict['default_value']
else:
default_value = None
if 'optional' in attr_dict.keys():
attrs['attr_info'][attr_dict['name']] = (
optional_types_trans[attr_dict['typename']],
default_value,
)
else:
attrs['attr_info'][attr_dict['name']] = (
attr_types_map[attr_dict['typename']],
default_value,
)
return inputs, attrs
def parse_output(self, outputs_list):
out_type_list = []
out_name_list = []
out_size_expr_list = []
for output_dict in outputs_list:
if output_dict['intermediate']:
continue
out_type_list.append(output_dict['typename'])
out_name_list.append(output_dict['name'])
if 'size' in output_dict.keys():
out_size_expr_list.append(output_dict['size'])
else:
out_size_expr_list.append(None)
return out_type_list, out_name_list, out_size_expr_list
class EagerPrimAPI(BaseAPI):
def __init__(self, api_item_yaml):
super().__init__(api_item_yaml)
def get_api__func_name(self):
api_func_name = self.api
# if self.is_inplace:
# if api_func_name[-1] != '_':
# api_func_name += '_'
# print("after api name", api_func_name)
return api_func_name
def gene_prim_api_declaration(self):
api_declaration = ""
api_func_name = self.get_api__func_name()
if api_func_name[-1] != '_':
api_declaration = f"""
template <typename T>
{self.get_return_type()} {api_func_name}({self.get_declare_args()});
"""
else:
api_declaration = (
api_declaration
+ f"""
template <typename T>
{self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_declare_args(inplace_flag=True)});
"""
)
return api_declaration
def get_ad_func_input_args(self, inplace_flag=False):
input_args = []
for name in self.inputs['names']:
name = name.split('@')[0]
if inplace_flag and name in self.inplace_map.values():
input_args.append(name)
else:
input_args.append(name)
return input_args
def get_ad_func_args(self, inplace_flag=False):
ad_func_args = self.get_ad_func_input_args(inplace_flag)
for name in self.attrs['names']:
default_value = ''
if self.attrs['attr_info'][name][1] is not None:
default_value = ' = ' + self.attrs['attr_info'][name][1]
ad_func_args.append(name)
ad_func_args_str = ", ".join(ad_func_args)
return ad_func_args_str
def gene_ad_func_call(self):
api_func_name = self.get_api__func_name()
dygraph_ad_func_name = '::' + api_func_name + '_ad_func'
dygraph_ad_func_parameters = self.get_ad_func_args()
ad_func_call_str = f"""
VLOG(4) << "Eager Prim API {api_func_name}_ad_func call";
return {dygraph_ad_func_name}({dygraph_ad_func_parameters});
"""
# print("ad_func_call_str: ", ad_func_call_str)
return ad_func_call_str
def gene_eager_prim_api_code(self):
api_code = ""
indent = " "
api_func_name = self.get_api__func_name()
template = '<Tensor>'
        # func declaration
if api_func_name[-1] != '_':
api_code = f"""
template <>
{self.get_return_type()} {api_func_name}{template}({self.get_declare_args_nodefault()})
"""
else:
api_code = f"""
template <>
{self.get_return_type(inplace_flag=True)} {api_func_name}{template}({self.get_declare_args_nodefault(inplace_flag=True)})
"""
# func code
api_code = api_code + '{'
api_code += f"""{self.gene_ad_func_call()}"""
api_code += '}' + '\n'
return api_code
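For orientation, below is a minimal sketch of the per-op dict that the parsing helpers above consume once an ops.yaml entry has been loaded. The 'op' key and the concrete values are illustrative assumptions; only the 'inputs'/'attrs'/'outputs'/'inplace' shapes are taken from parse_args, parse_input_and_attr and parse_output.

# Hypothetical example of one loaded YAML entry; only the field layout follows
# the parsing code above, the values are made up for illustration.
sample_api_item = {
    'op': 'scale',  # assumed key carrying the op name
    'inputs': [
        {'name': 'x', 'typename': 'Tensor', 'optional': False},
    ],
    'attrs': [
        {'name': 'scale', 'typename': 'Scalar', 'default_value': '1.0'},
        {'name': 'bias', 'typename': 'float', 'default_value': '0.0'},
        {'name': 'bias_after_scale', 'typename': 'bool', 'default_value': 'true'},
    ],
    'outputs': [
        {'name': 'out', 'typename': 'Tensor', 'intermediate': False},
    ],
    'inplace': None,  # read directly by __init__ above
}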
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import yaml
from prim_base import EagerPrimAPI
def header_include():
return """
#include "paddle/phi/common/int_array.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/common/place.h"
#include "paddle/utils/optional.h"
"""
def eager_source_include(header_file_path):
return """
#include "paddle/fluid/eager/api/all.h"
#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
#include "paddle/fluid/prim/api/generated/prim_api/prim_api.h"
"""
def api_namespace():
return (
"""
namespace paddle {
namespace prim {
""",
"""
using Tensor = paddle::experimental::Tensor;
using Scalar = paddle::experimental::Scalar;
using IntArray = paddle::experimental::IntArray;
using DataType = paddle::experimental::DataType;
""",
"""
} // namespace prim
} // namespace paddle
""",
)
def generate_api(api_yaml_path, header_file_path, eager_prim_source_file_path):
apis = []
for each_api_yaml in api_yaml_path:
with open(each_api_yaml, 'r') as f:
api_list = yaml.load(f, Loader=yaml.FullLoader)
if api_list:
apis.extend(api_list)
header_file = open(header_file_path, 'w')
eager_prim_source_file = open(eager_prim_source_file_path, 'w')
namespace = api_namespace()
header_file.write("#pragma once\n")
header_file.write(header_include())
header_file.write(namespace[0])
header_file.write(namespace[1])
include_header_file = (
"#include paddle/fluid/prim/api/generated/prim_api/prim_api.h"
)
eager_prim_source_file.write(eager_source_include(include_header_file))
eager_prim_source_file.write(namespace[0])
for api in apis:
prim_api = EagerPrimAPI(api)
if prim_api.is_prim_api:
header_file.write(prim_api.gene_prim_api_declaration())
eager_prim_source_file.write(prim_api.gene_eager_prim_api_code())
header_file.write(namespace[2])
eager_prim_source_file.write(namespace[2])
header_file.close()
eager_prim_source_file.close()
def main():
parser = argparse.ArgumentParser(
description='Generate PaddlePaddle C++ API files'
)
parser.add_argument(
'--api_yaml_path',
help='path to api yaml file',
nargs='+',
default=['paddle/phi/api/yaml/ops.yaml'],
)
parser.add_argument(
'--prim_api_header_path',
help='output of generated prim_api header code file',
default='paddle/fluid/prim/api/generated/prim_api/prim_api.h',
)
parser.add_argument(
'--eager_prim_api_source_path',
help='output of generated eager_prim_api source code file',
default='paddle/fluid/prim/api/generated/prim_api/eager_prim_api.cc',
)
options = parser.parse_args()
api_yaml_path = options.api_yaml_path
prim_api_header_file_path = options.prim_api_header_path
eager_prim_api_source_file_path = options.eager_prim_api_source_path
generate_api(
api_yaml_path,
prim_api_header_file_path,
eager_prim_api_source_file_path,
)
if __name__ == '__main__':
main()
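As a usage note, this script is normally driven by the build system; the following is a hedged sketch of invoking it by hand. The script path is an assumption, while the flag names and default paths come from the argparse definitions above.

# Minimal sketch; GEN_SCRIPT is a hypothetical location of the generator above.
import subprocess
import sys

GEN_SCRIPT = "paddle/fluid/prim/api/auto_code_generated/eager_gen.py"

subprocess.run(
    [
        sys.executable,
        GEN_SCRIPT,
        "--api_yaml_path", "paddle/phi/api/yaml/ops.yaml",
        "--prim_api_header_path", "paddle/fluid/prim/api/generated/prim_api/prim_api.h",
        "--eager_prim_api_source_path", "paddle/fluid/prim/api/generated/prim_api/eager_prim_api.cc",
    ],
    check=True,  # fail loudly if the generator exits with a non-zero status
)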
......@@ -25,6 +25,7 @@
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/prim/api/generated/prim_api/prim_api.h"
#include "paddle/fluid/prim/api/manual/prim_api/prim_api.h"
#include "paddle/fluid/prim/api/manual/utils/utils.h"
#include "paddle/fluid/prim/utils/static/composite_grad_desc_maker.h"
......@@ -37,7 +38,7 @@ namespace paddle {
namespace prim {
template <>
Tensor pow<DescTensor>(const Tensor& x, const paddle::experimental::Scalar& y) {
Tensor pow<DescTensor>(const Tensor& x, const Scalar& y) {
Tensor out = empty<DescTensor>({}, phi::DataType::FLOAT32, paddle::Place());
framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock();
framework::OpDesc* op = block->AppendOp();
......@@ -55,7 +56,7 @@ Tensor pow<DescTensor>(const Tensor& x, const paddle::experimental::Scalar& y) {
template <>
Tensor scale<DescTensor>(const Tensor& x,
const paddle::experimental::Scalar& scale,
const Scalar& scale,
float bias,
bool bias_after_scale) {
Tensor out = empty<DescTensor>({}, phi::DataType::FLOAT32, paddle::Place());
......@@ -95,63 +96,63 @@ Tensor multiply<DescTensor>(const Tensor& x, const Tensor& y) {
}
template <>
Tensor expand<DescTensor>(const Tensor& x, const IntArray& shape) {
Tensor unsqueeze<DescTensor>(const Tensor& x, const IntArray& axis) {
Tensor out = empty<DescTensor>({}, phi::DataType::FLOAT32, paddle::Place());
framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock();
framework::OpDesc* op = block->AppendOp();
op->SetType("expand_v2");
op->SetType("unsqueeze2");
op->SetInput("X",
{std::static_pointer_cast<prim::DescTensor>(x.impl())->Name()});
op->SetOutput(
"Out", {std::static_pointer_cast<prim::DescTensor>(out.impl())->Name()});
std::vector<int> new_shape(shape.GetData().begin(), shape.GetData().end());
op->SetAttr("shape", new_shape);
std::vector<int> new_shape(axis.GetData().begin(), axis.GetData().end());
op->SetAttr("axes", new_shape);
op->CheckAttrs();
op->InferVarType(block);
return out;
}
template <>
Tensor divide<DescTensor>(const Tensor& x, const Tensor& y) {
// Grad infershape
Tensor expand<DescTensor>(const Tensor& x, const IntArray& shape) {
Tensor out = empty<DescTensor>({}, phi::DataType::FLOAT32, paddle::Place());
framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock();
framework::OpDesc* op = block->AppendOp();
op->SetType("elementwise_div");
op->SetType("expand_v2");
op->SetInput("X",
{std::static_pointer_cast<prim::DescTensor>(x.impl())->Name()});
op->SetInput("Y",
{std::static_pointer_cast<prim::DescTensor>(y.impl())->Name()});
op->SetOutput(
"Out", {std::static_pointer_cast<prim::DescTensor>(out.impl())->Name()});
std::vector<int> new_shape(shape.GetData().begin(), shape.GetData().end());
op->SetAttr("shape", new_shape);
op->CheckAttrs();
op->InferVarType(block);
op->InferShape(*block);
return out;
}
template <>
Tensor unsqueeze<DescTensor>(const Tensor& x, const IntArray& axis) {
Tensor divide<DescTensor>(const Tensor& x, const Tensor& y) {
// Grad infershape
Tensor out = empty<DescTensor>({}, phi::DataType::FLOAT32, paddle::Place());
framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock();
framework::OpDesc* op = block->AppendOp();
op->SetType("unsqueeze2");
op->SetType("elementwise_div");
op->SetInput("X",
{std::static_pointer_cast<prim::DescTensor>(x.impl())->Name()});
op->SetInput("Y",
{std::static_pointer_cast<prim::DescTensor>(y.impl())->Name()});
op->SetOutput(
"Out", {std::static_pointer_cast<prim::DescTensor>(out.impl())->Name()});
std::vector<int> new_shape(axis.GetData().begin(), axis.GetData().end());
op->SetAttr("axes", new_shape);
op->CheckAttrs();
op->InferVarType(block);
op->InferShape(*block);
return out;
}
template <>
Tensor full<DescTensor>(paddle::experimental::IntArray shape,
paddle::experimental::Scalar value,
paddle::experimental::DataType dtype,
paddle::platform::Place place) {
Tensor full<DescTensor>(const IntArray& shape,
const Scalar& value,
DataType dtype,
const Place& place) {
// Grad infershape
Tensor out = empty<DescTensor>({}, dtype, place);
framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock();
......@@ -159,9 +160,8 @@ Tensor full<DescTensor>(paddle::experimental::IntArray shape,
op->SetType("fill_constant");
op->SetAttr("shape", shape.GetData());
PADDLE_ENFORCE_EQ(
((dtype == paddle::experimental::DataType::FLOAT32) ||
(dtype == paddle::experimental::DataType::FLOAT64) ||
(dtype == paddle::experimental::DataType::FLOAT16)),
((dtype == DataType::FLOAT32) || (dtype == DataType::FLOAT64) ||
(dtype == DataType::FLOAT16)),
true,
phi::errors::InvalidArgument(
"We only support float32/float16 for full, but we got data type: %s",
......@@ -177,9 +177,9 @@ Tensor full<DescTensor>(paddle::experimental::IntArray shape,
}
template <>
Tensor sum<DescTensor>(Tensor x,
paddle::experimental::IntArray axis,
paddle::experimental::DataType dtype,
Tensor sum<DescTensor>(const Tensor& x,
const IntArray& axis,
DataType dtype,
bool keepdim) {
// Grad infershape
Tensor out = empty<DescTensor>({}, dtype, paddle::Place());
......@@ -199,12 +199,12 @@ Tensor sum<DescTensor>(Tensor x,
"Out", {std::static_pointer_cast<prim::DescTensor>(out.impl())->Name()});
op->CheckAttrs();
op->InferVarType(block);
// TODO(jiabin): This may have runtime shape skip infershape for now.
// TODO(jiabin, cxxly): This may have runtime shape skip infershape for now.
return out;
}
template <>
Tensor reshape<DescTensor>(Tensor x, paddle::experimental::IntArray shape) {
Tensor reshape<DescTensor>(const Tensor& x, const IntArray& shape) {
// Grad infershape
Tensor out = empty<DescTensor>({}, x.dtype(), paddle::Place());
framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock();
......@@ -222,7 +222,23 @@ Tensor reshape<DescTensor>(Tensor x, paddle::experimental::IntArray shape) {
"Out", {std::static_pointer_cast<prim::DescTensor>(out.impl())->Name()});
op->CheckAttrs();
op->InferVarType(block);
// TODO(jiabin): This may have runtime shape skip infershape for now.
// TODO(jiabin, cxxly): This may have runtime shape skip infershape for now.
return out;
}
template <>
Tensor exp<DescTensor>(const Tensor& x) {
Tensor out = empty<DescTensor>({}, phi::DataType::FLOAT32, paddle::Place());
framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock();
framework::OpDesc* op = block->AppendOp();
op->SetType("exp");
op->SetInput("X",
{std::static_pointer_cast<prim::DescTensor>(x.impl())->Name()});
op->SetOutput(
"Out", {std::static_pointer_cast<prim::DescTensor>(out.impl())->Name()});
op->CheckAttrs();
op->InferVarType(block);
op->InferShape(*block);
return out;
}
} // namespace prim
......
add_subdirectory(prim_api)
add_subdirectory(utils)
......@@ -49,7 +49,7 @@ void subtract_grad(const Tensor& x,
sum<T>(scale_out_grad, phi::vectorize(reduce_dim), y.dtype(), false);
auto dy_tmp = reshape<T>(dy_reduce_res, phi::vectorize(y.dims()));
set_output<T>(dy_tmp, dy);
// dy->set_impl(dy_tmp.impl());
} else {
by_pass<T>(scale_out_grad, dy);
}
......@@ -62,7 +62,6 @@ void subtract_grad(const Tensor& x,
sum<T>(out_grad, phi::vectorize(reduce_dim), x.dtype(), false);
auto dx_tmp = reshape<T>(dx_reduce_res, phi::vectorize(x.dims()));
set_output<T>(dx_tmp, dx);
// dx->set_impl(dx_tmp.impl());
} else {
by_pass<T>(out_grad, dx);
}
......@@ -84,7 +83,6 @@ void add_grad(const Tensor& x,
sum<T>(out_grad, phi::vectorize(reduce_dim), y.dtype(), false);
auto dy_tmp = reshape<T>(dy_reduce_res, phi::vectorize(y.dims()));
set_output<T>(dy_tmp, dy);
// dy->set_impl(dy_tmp.impl());
} else {
by_pass<T>(out_grad, dy);
}
......@@ -97,7 +95,6 @@ void add_grad(const Tensor& x,
sum<T>(out_grad, phi::vectorize(reduce_dim), x.dtype(), false);
auto dx_tmp = reshape<T>(dx_reduce_res, phi::vectorize(x.dims()));
set_output<T>(dx_tmp, dx);
// dx->set_impl(dx_tmp.impl());
} else {
by_pass<T>(out_grad, dx);
}
......@@ -139,7 +136,6 @@ void sum_grad(const Tensor& x,
x_grad_tmp = expand<T>(out_grad, x_dim);
}
set_output<T>(x_grad_tmp, x_grad);
// x_grad->set_impl(x_grad_tmp.impl());
}
template <typename T>
......@@ -163,10 +159,8 @@ void divide_grad(const Tensor& x,
sum<T>(dy_res, phi::vectorize(reduce_dim), y.dtype(), false);
auto dy_tmp = reshape<T>(dy_reduce_res, phi::vectorize(y.dims()));
set_output<T>(dy_tmp, dy);
// dy->set_impl(dy_tmp.impl());
} else {
set_output<T>(dy_res, dy);
// dy->set_impl(dy_res.impl());
}
} // indicate we will compute dy
if (dx) {
......@@ -181,10 +175,8 @@ void divide_grad(const Tensor& x,
sum<T>(dx_res, phi::vectorize(reduce_dim), x.dtype(), false);
auto dx_tmp = reshape<T>(dx_reduce_res, phi::vectorize(x.dims()));
set_output<T>(dx_tmp, dx);
// dx->set_impl(dx_tmp.impl());
} else {
set_output<T>(dx_res, dx);
// dx->set_impl(dx_res.impl());
}
} // indicate we will compute dx
}
......@@ -196,7 +188,6 @@ void sqrt_grad(const Tensor& out, const Tensor& out_grad, Tensor* x_grad) {
auto tmp = divide<T>(div_x, out);
auto x_grad_tmp = multiply<T>(out_grad, tmp);
set_output<T>(x_grad_tmp, x_grad);
// x_grad->set_impl(x_grad_tmp.impl());
}
}
} // namespace prim
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/eager/api/all.h"
#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
#include "paddle/fluid/prim/api/manual/prim_api/prim_api.h"
#include "paddle/phi/capi/include/wrapper_base.h"
namespace paddle {
namespace prim {
template <>
Tensor pow<Tensor>(const Tensor& x, const paddle::experimental::Scalar& y) {
return ::pow_ad_func(x, y);
}
template <>
Tensor scale<Tensor>(const Tensor& x,
const paddle::experimental::Scalar& scale,
float bias,
bool bias_after_scale) {
return ::scale_ad_func(x, scale, bias, bias_after_scale);
}
template <>
Tensor multiply<Tensor>(const Tensor& x, const Tensor& y) {
return ::multiply_ad_func(x, y);
}
template <>
Tensor expand<Tensor>(const Tensor& x, const IntArray& shape) {
return ::expand_ad_func(x, shape);
}
template <>
Tensor unsqueeze<Tensor>(const Tensor& x, const IntArray& axis) {
return ::unsqueeze_ad_func(x, axis);
}
template <>
Tensor divide<Tensor>(const Tensor& x, const Tensor& y) {
return ::divide_ad_func(x, y);
}
template <>
Tensor full<Tensor>(paddle::experimental::IntArray shape,
paddle::experimental::Scalar value,
paddle::experimental::DataType dtype,
paddle::platform::Place place) {
return ::full_ad_func(shape, value, dtype, place);
}
template <>
Tensor sum<Tensor>(Tensor x, IntArray axis, DataType dtype, bool keepdim) {
return ::sum_ad_func(x, axis, dtype, keepdim);
}
template <>
Tensor reshape<Tensor>(Tensor x, IntArray shape) {
return ::reshape_ad_func(x, shape);
}
} // namespace prim
} // namespace paddle
......@@ -12,50 +12,15 @@
// See the License for the specific language governing permissions and
// limitations under the License.
// prim api which can't be generated
#pragma once
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/int_array.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/common/scalar.h"
#include "paddle/utils/optional.h"
namespace paddle {
namespace prim {
using Tensor = paddle::experimental::Tensor;
using IntArray = paddle::experimental::IntArray;
using Scalar = paddle::experimental::Scalar;
template <typename T>
Tensor pow(const Tensor& x, const Scalar& y);
template <typename T>
Tensor scale(const Tensor& X,
const Scalar& scale,
float bias,
bool bias_after_scale);
template <typename T>
Tensor multiply(const Tensor& x, const Tensor& y);
template <typename T>
Tensor expand(const Tensor& x, const IntArray& shape);
template <typename T>
Tensor unsqueeze(const Tensor& x, const IntArray& axis);
template <typename T>
Tensor divide(const Tensor& x, const Tensor& y);
template <typename T>
Tensor full(IntArray shape,
Scalar value,
DataType dtype = DataType::FLOAT32,
Place place = CPUPlace());
template <typename T>
Tensor sum(Tensor x,
IntArray axis = {},
DataType dtype = DataType::UNDEFINED,
bool keepdim = false);
template <typename T>
Tensor reshape(Tensor x, IntArray shape);
} // namespace prim
namespace paddle {
namespace prim {} // namespace prim
} // namespace paddle
......@@ -16,11 +16,12 @@
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/operators/common_infer_shape_functions.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/int_array.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/ddim.h"
using IntArray = paddle::experimental::IntArray;
namespace paddle {
namespace prim {
// We put some api like utils here
......@@ -42,42 +43,40 @@ void set_output(const paddle::experimental::Tensor& x_tmp,
paddle::experimental::Tensor* x);
// These method don't need to be specified
static phi::DDim get_reduce_dims(const phi::DDim& x_dims,
const phi::DDim& y_dims) {
static phi::DDim get_reduce_dims_from_out(const phi::DDim& dout_dims,
const phi::DDim& in_dims) {
std::vector<int64_t> result;
PADDLE_ENFORCE_GE(phi::product(x_dims),
phi::product(y_dims),
phi::errors::InvalidArgument(
"Only x_dims >= y_dims is accepted for "
"get_reduce_dims, but we got x_dims: %s, y_dims: %s",
x_dims,
y_dims));
int bat = x_dims.size() - y_dims.size();
int bat = dout_dims.size() - in_dims.size();
for (int i = 0; i < bat; ++i) {
result.push_back(i);
}
for (int i = 0; i < y_dims.size(); ++i) {
if (y_dims[i] == 1) {
for (int i = 0; i < in_dims.size(); ++i) {
if (in_dims[i] == 1) {
result.push_back(i + bat);
} else {
PADDLE_ENFORCE_EQ(
y_dims[i],
x_dims[i + bat],
in_dims[i],
dout_dims[i + bat],
platform::errors::InvalidArgument(
"ReduceDims dimension mismatch. Operands could "
"not be broadcast together with the shape of x_dims = [%s] and "
"the shape of y_dims = [%s]. Received [%d] in X is not equal to "
"not be broadcast together with the shape of dout = [%s] and "
"the shape of in_dims = [%s]. Received [%d] in X is not equal to "
"[%d] in Y at i:%d.",
x_dims,
y_dims,
x_dims[i + bat],
y_dims[i],
dout_dims,
in_dims,
dout_dims[i + bat],
in_dims[i],
i));
}
}
auto res_dims = phi::make_ddim(result);
VLOG(4) << "Reduce Dims is: " << res_dims;
return res_dims;
return phi::make_ddim(result);
}
static phi::DDim get_reduce_dims(const phi::DDim& x_dims,
const phi::DDim& y_dims) {
auto out_dims = paddle::operators::details::BroadcastTwoDims(x_dims, y_dims);
return get_reduce_dims_from_out(out_dims, x_dims);
}
} // namespace prim
} // namespace paddle
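To make the intent of get_reduce_dims_from_out concrete, here is a small Python re-derivation of the same rule (not the C++ implementation itself): the leading axes that exist only in dout, plus the size-1 axes of the input, are the ones to reduce over.

# Python sketch of the reduce-dims rule used above, for one broadcast example.
dout_dims = [2, 3, 4, 5]   # shape of the incoming gradient
in_dims = [3, 1, 5]        # shape the gradient must be reduced back to

bat = len(dout_dims) - len(in_dims)   # number of leading broadcast axes -> 1
reduce_dims = list(range(bat))        # leading axes are always reduced -> [0]
for i, d in enumerate(in_dims):
    if d == 1:                        # size-1 axes were broadcast, reduce them as well
        reduce_dims.append(i + bat)
    else:
        assert d == dout_dims[i + bat]  # mirrors the PADDLE_ENFORCE_EQ check
print(reduce_dims)                    # [0, 2]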
......@@ -477,6 +477,9 @@ class GradCompositeOpMakerBase {
void RecoverOutputName(const paddle::experimental::Tensor& output,
const std::string& origin_name) {
if (origin_name == framework::kEmptyVarName) return;
VLOG(4) << "Recover: "
<< static_cast<prim::DescTensor*>(output.impl().get())->Name()
<< " To: " << origin_name;
prim::StaticCompositeContext::Instance().GetBlock()->RenameVar(
static_cast<prim::DescTensor*>(output.impl().get())->Name(),
origin_name);
......
......@@ -26,9 +26,9 @@ static PyObject *eager_api_run_program(PyObject *self,
PyObject *kwargs) {
PyThreadState *tstate = nullptr;
try {
auto X = GetTensorListFromArgs("run_program", "X", args, 0, false);
auto X = GetTensorListFromArgs("run_program", "X", args, 0, true);
auto Params = GetTensorListFromArgs("run_program", "Params", args, 1, true);
auto Out = GetTensorPtrListFromArgs("run_program", "Out", args, 2, false);
auto Out = GetTensorPtrListFromArgs("run_program", "Out", args, 2, true);
auto OutScope =
GetScopePtrListFromArgs("run_program", "OutScope", args, 3, false);
auto DOut = GetTensorPtrListFromArgs("run_program", "DOut", args, 4, true);
......
......@@ -1503,7 +1503,7 @@ static PyObject* tensor_method_set_string_list(TensorObject* self,
PyObject* args,
PyObject* kwargs) {
EAGER_TRY
using Strings = std::vector<std::string>;
using Strings = paddle::framework::Strings;
auto strings = CastPyArg2VectorOfString(PyTuple_GET_ITEM(args, 0), 0);
auto var_tensor = std::make_shared<egr::VariableCompatTensor>();
*var_tensor->GetMutable<Strings>() = strings;
......
......@@ -184,7 +184,7 @@ PyObject* tensor_properties_get_shape(TensorObject* self, void* closure) {
value[i] = ddim[i];
}
}
if (!egr::IsVariableCompatTensor(self->tensor)) {
auto desired_layout =
paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout();
auto default_layout =
......@@ -200,17 +200,18 @@ PyObject* tensor_properties_get_shape(TensorObject* self, void* closure) {
std::vector<int64_t> dims = value;
if (change_dim && phi::DataLayoutToString(desired_layout) == "NCHW") {
// NCHW -> NHWC
VLOG(6) << "layout autotune get Shape from NCHW -> NHWC " << value[0] << " "
<< value[1] << " " << value[2] << " " << value[3] << " to "
VLOG(6) << "layout autotune get Shape from NCHW -> NHWC " << value[0]
<< " " << value[1] << " " << value[2] << " " << value[3] << " to "
<< dims[0] << " " << dims[2] << " " << dims[3] << " " << dims[1];
value[0] = dims[0];
value[1] = dims[2];
value[2] = dims[3];
value[3] = dims[1];
} else if (change_dim && phi::DataLayoutToString(desired_layout) == "NHWC") {
} else if (change_dim &&
phi::DataLayoutToString(desired_layout) == "NHWC") {
// NHWC -> NCHW
VLOG(6) << "layout autotune get Shape from NHWC -> NCHW " << value[0] << " "
<< value[1] << " " << value[2] << " " << value[3] << " to "
VLOG(6) << "layout autotune get Shape from NHWC -> NCHW " << value[0]
<< " " << value[1] << " " << value[2] << " " << value[3] << " to "
<< dims[0] << " " << dims[3] << " " << dims[1] << " " << dims[2]
<< " " << dims[1];
value[0] = dims[0];
......@@ -218,6 +219,7 @@ PyObject* tensor_properties_get_shape(TensorObject* self, void* closure) {
value[2] = dims[1];
value[3] = dims[2];
}
}
return ToPyObject(value);
EAGER_CATCH_AND_THROW_RETURN_NULL
......
......@@ -624,6 +624,11 @@ void BindImperative(py::module *m_ptr) {
m.def("_cleanup_mmap_fds",
[]() { memory::allocation::MemoryMapFdSet::Instance().Clear(); });
m.def("_set_max_memory_map_allocation_pool_size", [](int32_t size) {
memory::allocation::MemoryMapAllocationPool::Instance().SetMaxPoolSize(
size);
});
#endif
m.def("start_imperative_gperf_profiler",
......
......@@ -970,7 +970,7 @@ All parameter, weight, gradient are variables in Paddle.
}
})
.def("set_string_list",
[](Variable &self, Strings str_list) {
[](Variable &self, std::vector<std::string> str_list) {
*self.GetMutable<Strings>() = str_list;
})
.def("set_vocab",
......@@ -1926,7 +1926,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("set_feed_variable",
static_cast<void (*)( // NOLINT
Scope *,
const Strings &,
const std::vector<std::string> &,
const std::string &,
size_t)>(&framework::SetFeedVariable));
m.def("get_fetch_variable",
......
......@@ -182,6 +182,7 @@ limitations under the License. */
#include "pybind11/stl.h"
DECLARE_bool(use_mkldnn);
DECLARE_bool(use_shm_cache);
// disable auto conversion to list in Python
PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
......@@ -910,9 +911,16 @@ void BindTensor(pybind11::module &m) { // NOLINT
int flags = memory::allocation::MAPPED_SHAREDMEM |
memory::allocation::MAPPED_EXCLUSIVE;
std::string handle = memory::allocation::GetIPCName();
int find_id = -1;
if (FLAGS_use_shm_cache) {
find_id = memory::allocation::MemoryMapAllocationPool::Instance().FindFromCache(flags, data_size); // NOLINT
}
if (find_id != -1) {
handle = memory::allocation::MemoryMapAllocationPool::Instance().GetById(find_id).file_name_; // NOLINT
}
auto shared_holder =
memory::allocation::AllocateRefcountedMemoryMapAllocation(
handle, flags, data_size);
handle, flags, data_size, find_id);
// copy data & reset holder
if (platform::is_cuda_pinned_place(holder->place())) {
......@@ -961,10 +969,13 @@ void BindTensor(pybind11::module &m) { // NOLINT
size_t size = t[1].cast<size_t>();
int flags = memory::allocation::MAPPED_SHAREDMEM |
memory::allocation::MAPPED_NOCREATE;
int find_id = -1;
if (FLAGS_use_shm_cache) {
find_id = memory::allocation::MemoryMapAllocationPool::Instance().FindFromCache(flags, size, ipc_name, /*check_refcount*/ false); // NOLINT
}
auto shared_holder =
memory::allocation::AllocateRefcountedMemoryMapAllocation(
ipc_name, flags, size);
ipc_name, flags, size, find_id);
// 3. Rebuild Tensor
tensor.ResetHolderWithType(
......
......@@ -22,6 +22,9 @@ limitations under the License. */
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/string_tensor_utils.h"
#include "paddle/phi/core/tensor_utils.h"
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/phi/backends/device_manager.h"
#endif
namespace paddle {
namespace experimental {
......@@ -54,6 +57,11 @@ bool HasAllocation(const phi::TensorBase& t) {
BackendSet GetTensorBackendSet(const phi::TensorBase& t) {
if (HasAllocation(t) && t.place().GetType() != AllocationType::UNDEFINED) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
if (t.place().GetType() == AllocationType::CUSTOM) {
phi::DeviceManager::SetDevice(t.place());
}
#endif
phi::Backend backend_key = phi::TransToPhiBackend(t.place());
BackendSet backend_set(backend_key);
if (backend_key == Backend::GPU && phi::DenseTensor::classof(&t) &&
......
......@@ -431,6 +431,7 @@
kernel :
func : exp_grad
inplace : (out_grad -> x_grad)
composite : exp_grad(out, out_grad, x_grad)
- backward_op : expm1_grad
forward : expm1 (Tensor x) -> Tensor(out)
......@@ -1085,6 +1086,30 @@
func : selu_grad
data_type : out
- backward_op : send_u_recv_grad
forward : send_u_recv (Tensor x, Tensor src_index, Tensor dst_index, str reduce_op = "SUM", IntArray out_size = {0}) -> Tensor(out), Tensor(dst_count)
args : (Tensor x, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str reduce_op = "SUM")
output : Tensor(x_grad)
infer_meta :
func : GeneralUnaryGradInferMeta
param : [x]
kernel :
func : send_u_recv_grad
data_type : out_grad
optional: out, dst_count
- backward_op : send_ue_recv_grad
forward : send_ue_recv (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op="ADD", str reduce_op="SUM", IntArray out_size={0}) -> Tensor(out), Tensor(dst_count)
args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str message_op, str reduce_op)
output : Tensor(x_grad), Tensor(y_grad)
infer_meta :
func : GeneralBinaryGradInferMeta
param : [x, y]
kernel :
func : send_ue_recv_grad
data_type : out_grad
optional: out, dst_count
- backward_op : send_uv_grad
forward : send_uv (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op = "ADD") -> Tensor(out)
args: (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, Tensor out_grad, str message_op = "ADD")
......@@ -1191,6 +1216,17 @@
func : sinh_grad
inplace : (out_grad -> x_grad)
- backward_op : slogdet_grad
forward : slogdet (Tensor x) -> Tensor(out)
args : (Tensor x, Tensor out, Tensor out_grad)
output : Tensor(x_grad)
infer_meta :
func : GeneralUnaryGradInferMeta
param : [x]
kernel :
func : slogdet_grad
data_type : out_grad
- backward_op : softplus_grad
forward : softplus (Tensor x, float beta, float threshold) -> Tensor(out)
args : (Tensor x, Tensor out_grad, float beta, float threshold)
......
......@@ -316,9 +316,14 @@
- backward_op : cumsum_grad
forward : cumsum(Tensor x, Scalar axis, bool flatten, bool exclusive, bool reverse) -> Tensor(out)
args : (Tensor out_grad, Scalar axis, bool flatten, bool exclusive, bool reverse)
args : (Tensor x, Tensor out_grad, Scalar axis, bool flatten, bool exclusive, bool reverse)
output : Tensor(x_grad)
invoke : cumsum(out_grad, axis, flatten, exclusive, !reverse)
infer_meta :
func : UnchangedInferMeta
param: [x]
kernel :
func : cumsum_grad
data_type: x
- backward_op : deformable_conv_grad
forward : deformable_conv(Tensor x, Tensor offset, Tensor filter, Tensor mask, int[] strides, int[] paddings, int[] dilations, int deformable_groups, int groups, int im2col_step) -> Tensor(out)
......@@ -475,6 +480,7 @@
func : expand_grad
no_need_buffer : x
backward : expand_double_grad
composite: expand_grad(x, out_grad, shape, x_grad_p)
- backward_op : exponential__grad
forward : exponential_ (Tensor x, float lam) -> Tensor(out)
......@@ -880,6 +886,7 @@
param : [x, y]
kernel :
func : multiply_grad
composite: multiply_grad(x, y, out_grad, axis, x_grad, y_grad)
backward : multiply_double_grad
- backward_op : multiply_triple_grad
......@@ -1201,30 +1208,6 @@
data_type : x
optional : summed_ids
- backward_op : send_u_recv_grad
forward : send_u_recv (Tensor x, Tensor src_index, Tensor dst_index, str reduce_op = "SUM", IntArray out_size = {0}) -> Tensor(out), Tensor(dst_count)
args : (Tensor x, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str reduce_op = "SUM")
output : Tensor(x_grad)
infer_meta :
func : GeneralUnaryGradInferMeta
param : [x]
kernel :
func : send_u_recv_grad
data_type : out_grad
optional: out, dst_count
- backward_op : send_ue_recv_grad
forward : send_ue_recv (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op, str reduce_op, IntArray out_size) -> Tensor(out), Tensor(dst_count)
args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str message_op, str reduce_op)
output : Tensor(x_grad), Tensor(y_grad)
infer_meta :
func : GeneralBinaryGradInferMeta
param : [x, y]
kernel :
func : send_ue_recv_grad
data_type : out_grad
optional: out, dst_count
- backward_op : sigmoid_cross_entropy_with_logits_grad
forward : sigmoid_cross_entropy_with_logits (Tensor x, Tensor label, bool normalize, int ignore_index) -> Tensor(out)
args : (Tensor x, Tensor label, Tensor out_grad, bool normalize, int ignore_index)
......@@ -1260,16 +1243,6 @@
backward : slice_double_grad
no_need_buffer : input
- backward_op : slogdet_grad
forward : slogdet (Tensor x) -> Tensor(out)
args : (Tensor x, Tensor out, Tensor out_grad)
output : Tensor(x_grad)
infer_meta :
func : UnchangedInferMeta
param : [x]
kernel :
func : slogdet_grad
- backward_op : softmax_grad
forward : softmax (Tensor x, int axis) -> Tensor(out)
args : (Tensor out, Tensor out_grad, int axis)
......
......@@ -1298,16 +1298,6 @@
kernel :
func : not_equal
- op : numel
args : (Tensor x)
output : Tensor(size)
infer_meta :
func : NumelInferMeta
kernel :
func : numel
data_transform:
skip_transform : x
- op : one_hot
args : (Tensor x, Scalar(int) num_classes)
output : Tensor(out)
......@@ -1588,28 +1578,6 @@
data_type : x
backward : segment_pool_grad
- op : send_u_recv
args : (Tensor x, Tensor src_index, Tensor dst_index, str reduce_op = "SUM", IntArray out_size = {0})
output : Tensor(out), Tensor(dst_count)
infer_meta :
func : SendURecvInferMeta
kernel :
func : send_u_recv
data_type : x
intermediate : dst_count
backward : send_u_recv_grad
- op : send_ue_recv
args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op, str reduce_op, IntArray out_size)
output : Tensor(out), Tensor(dst_count)
infer_meta :
func : SendUERecvInferMeta
kernel :
func : send_ue_recv
data_type : x
intermediate : dst_count
backward : send_ue_recv_grad
- op : sgd_
args : (Tensor param, Tensor learning_rate, Tensor grad, Tensor master_param, bool multi_precision)
output : Tensor(param_out), Tensor(master_param_out)
......@@ -1663,15 +1631,6 @@
func : slice
backward : slice_grad
- op : slogdet
args : (Tensor x)
output : Tensor
infer_meta :
func : UnchangedInferMeta
kernel :
func : slogdet
backward : slogdet_grad
- op : softmax
args : (Tensor x, int axis)
output : Tensor(out)
......
......@@ -993,6 +993,12 @@
outputs :
{out : Out, total_weight : Total_weight}
- op : numel(size)
inputs :
x : Input
outputs :
size : Out
- op : overlap_add
backward : overlap_add_grad
inputs :
......@@ -1215,6 +1221,28 @@
outputs :
out : Out
- op : send_u_recv(graph_send_recv)
backward : send_u_recv_grad(graph_send_recv_grad)
inputs :
{x : X, src_index : Src_index, dst_index : Dst_index}
outputs :
{out : Out, dst_count : Dst_count}
int_array :
out_size:
data_type : int64_t
tensor_name : Out_size
- op : send_ue_recv(graph_send_ue_recv)
backward : send_ue_recv_grad(graph_send_ue_recv_grad)
inputs :
{x : X, y : Y, src_index : Src_index, dst_index : Dst_index}
outputs :
{out : Out, dst_count : Dst_count}
int_array :
out_size:
data_type : int64_t
tensor_name : Out_size
- op : send_uv (graph_send_uv)
backward : send_uv_grad (graph_send_uv_grad)
......@@ -1286,6 +1314,13 @@
extra :
attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"]
- op : slogdet(slogdeterminant)
backward : slogdet_grad(slogdeterminant_grad)
inputs :
x : Input
outputs :
out : Out
- op : softmax
backward : softmax_grad
inputs :
......
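A quick way to sanity-check the name-compatibility entries added above is to load op_compat.yaml and look one up; the file path is an assumption, while the keys are exactly those visible in this hunk.

# Hedged sketch: inspect the numel(size) compat entry after the change.
import yaml

with open("paddle/phi/api/yaml/op_compat.yaml") as f:  # assumed repo-relative path
    compat = yaml.safe_load(f)

numel = next(e for e in compat if e["op"].startswith("numel"))
print(numel["inputs"])   # {'x': 'Input'}
print(numel["outputs"])  # {'size': 'Out'}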
......@@ -871,6 +871,18 @@
kernel :
func : npu_identity
- op : numel
args : (Tensor x)
output : Tensor(size)
infer_meta :
func : NumelInferMeta
kernel :
func : numel
data_type : x
data_transform:
skip_transform : x
no_need_buffer : x
- op : overlap_add
args: (Tensor x, int hop_length, int axis=-1)
output: Tensor
......@@ -1029,6 +1041,28 @@
func : selu
backward : selu_grad
- op : send_u_recv
args : (Tensor x, Tensor src_index, Tensor dst_index, str reduce_op = "SUM", IntArray out_size = {0})
output : Tensor(out), Tensor(dst_count)
infer_meta :
func : SendURecvInferMeta
kernel :
func : send_u_recv
data_type : x
intermediate : dst_count
backward : send_u_recv_grad
- op : send_ue_recv
args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op="ADD", str reduce_op="SUM", IntArray out_size={0})
output : Tensor(out), Tensor(dst_count)
infer_meta :
func : SendUERecvInferMeta
kernel :
func : send_ue_recv
data_type : x
intermediate : dst_count
backward : send_ue_recv_grad
- op : send_uv
args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op = "ADD")
output : Tensor(out)
......@@ -1083,6 +1117,15 @@
func : sinh
backward : sinh_grad
- op : slogdet
args : (Tensor x)
output : Tensor
infer_meta :
func : UnchangedInferMeta
kernel :
func : slogdet
backward : slogdet_grad
- op : softplus
args : (Tensor x, float beta = 1.0, float threshold = 20.0f)
output : Tensor
......
......@@ -360,6 +360,7 @@ XPUOpMap& get_kl2_ops() {
{"log_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"log_softmax", XPUKernelSet({phi::DataType::FLOAT32})},
{"log_softmax_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"logical_not", XPUKernelSet({phi::DataType::BOOL})},
{"lookup_table_v2_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"lookup_table_v2", XPUKernelSet({phi::DataType::FLOAT32})},
{"masked_select",
......
......@@ -61,11 +61,13 @@ struct XPUContext::Impl {
~Impl() {
if (owned_ && context_ != nullptr) {
backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId());
xpu_wait(context_->xpu_stream);
if (context_->xpu_stream) {
// manually destroy XPUStream here until xpu::api integrates this work
// into Context dtor
xpu_wait(context_->xpu_stream);
xpu_stream_destroy(context_->xpu_stream);
context_->xpu_stream = nullptr;
}
xpu::destroy_context(context_);
context_ = nullptr;
}
......@@ -73,11 +75,7 @@ struct XPUContext::Impl {
const Place& GetPlace() const { return place_; }
XPUStream stream() const {
auto s = context_->xpu_stream;
PD_CHECK(s != nullptr, "the xpu stream is nullptr.");
return s;
}
XPUStream stream() const { return context_->xpu_stream; }
xpu::Context* GetXContext() const {
PD_CHECK(context_ != nullptr, "the xpu context is nullptr.");
......@@ -103,13 +101,20 @@ struct XPUContext::Impl {
context_ = xpu::create_context();
xpu_version_ = backends::xpu::get_xpu_version(place_.device);
SetL3Cache();
PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_create(&context_->xpu_stream));
}
void SetXContext(xpu::Context* context) { context_ = context; }
void SetBkclContext(xpu::BKCLContext_t context) { bkcl_context_ = context; }
void CreateStream() {
if (context_->xpu_stream) {
VLOG(3) << "xpu stream is already created for current context";
return;
}
PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_create(&context_->xpu_stream));
}
bool owned_{false};
Place place_;
backends::xpu::XPUVersion xpu_version_;
......@@ -153,6 +158,8 @@ void XPUContext::SetBkclContext(xpu::BKCLContext_t context) {
impl_->SetBkclContext(context);
}
void XPUContext::CreateStream() { impl_->CreateStream(); }
void XPUContext::Init() { impl_->Init(); }
} // namespace phi
......@@ -46,6 +46,7 @@ class XPUContext : public DeviceContext,
// Return bkcl context.
xpu::BKCLContext_t bkcl_context() const;
void SetBkclContext(xpu::BKCLContext_t context);
void CreateStream();
// Wait for all operations completion in the stream.
void Wait() const override;
......
......@@ -134,7 +134,9 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) {
default: {
size_t device_type_id_ = static_cast<size_t>(backend) -
static_cast<size_t>(Backend::NUM_BACKENDS);
std::string device_type = phi::GetGlobalDeviceType(device_type_id_);
std::string device_type =
phi::CustomRegisteredDeviceMap::Instance().GetGlobalDeviceType(
device_type_id_);
if (!device_type.empty()) {
os << device_type;
} else {
......@@ -178,7 +180,8 @@ inline Backend StringToBackend(const char* backend_cstr) {
return Backend::IPU;
} else {
return static_cast<Backend>(static_cast<size_t>(Backend::NUM_BACKENDS) +
phi::GetOrRegisterGlobalDeviceTypeId(s));
phi::CustomRegisteredDeviceMap::Instance()
.GetOrRegisterGlobalDeviceTypeId(s));
}
}
......@@ -207,7 +210,9 @@ inline std::string BackendToString(const Backend& backend) {
default:
size_t device_type_id_ = static_cast<size_t>(backend) -
static_cast<size_t>(Backend::NUM_BACKENDS);
std::string device_type = phi::GetGlobalDeviceType(device_type_id_);
std::string device_type =
phi::CustomRegisteredDeviceMap::Instance().GetGlobalDeviceType(
device_type_id_);
if (!device_type.empty()) {
return device_type;
} else {
......
......@@ -16,7 +16,6 @@ limitations under the License. */
#include <sstream>
#include <string>
#include <unordered_map>
#include "glog/logging.h"
#include "paddle/phi/api/ext/exception.h"
......@@ -54,7 +53,8 @@ std::string Place::DebugString() const {
std::ostringstream os;
os << "Place(";
if (alloc_type_ == AllocationType::CUSTOM) {
os << GetGlobalDeviceType(device_type_id_);
os << phi::CustomRegisteredDeviceMap::Instance().GetGlobalDeviceType(
device_type_id_);
} else {
os << AllocationTypeStr(alloc_type_);
}
......@@ -85,25 +85,29 @@ Place GetPinnedPlace(const Place &place) {
}
}
static std::unordered_map<std::string, size_t> global_registered_device_type_id;
static std::unordered_map<size_t, std::string> global_registered_device_type;
CustomRegisteredDeviceMap &CustomRegisteredDeviceMap::Instance() {
static CustomRegisteredDeviceMap g_custom_registered_device_map;
return g_custom_registered_device_map;
}
size_t GetOrRegisterGlobalDeviceTypeId(const std::string &device_type) {
size_t CustomRegisteredDeviceMap::GetOrRegisterGlobalDeviceTypeId(
const std::string &device_type) {
if (device_type.empty()) return 0;
if (global_registered_device_type_id.find(device_type) ==
global_registered_device_type_id.end()) {
size_t device_type_id = global_registered_device_type_id.size() + 1;
global_registered_device_type_id[device_type] = device_type_id;
global_registered_device_type[device_type_id] = device_type;
if (registered_device_type_id_.find(device_type) ==
registered_device_type_id_.end()) {
size_t device_type_id = registered_device_type_id_.size() + 1;
registered_device_type_id_[device_type] = device_type_id;
registered_device_type_[device_type_id] = device_type;
}
return global_registered_device_type_id[device_type];
return registered_device_type_id_[device_type];
}
std::string GetGlobalDeviceType(size_t device_type_id) {
if (global_registered_device_type.find(device_type_id) ==
global_registered_device_type.end())
std::string CustomRegisteredDeviceMap::GetGlobalDeviceType(
size_t device_type_id) {
if (registered_device_type_.find(device_type_id) ==
registered_device_type_.end())
return "";
return global_registered_device_type[device_type_id];
return registered_device_type_[device_type_id];
}
constexpr static int kAllocationTypeBitLength = 8;
......@@ -143,7 +147,9 @@ static int8_t GetCorrectDeviceIdByPlaceType(
Place::Place(paddle::PlaceType type)
: device(detail::GetCorrectDeviceIdByPlaceType(type)),
alloc_type_(static_cast<AllocationType>(type)),
device_type_id_(GetOrRegisterGlobalDeviceTypeId("")) {
device_type_id_(
CustomRegisteredDeviceMap::Instance().GetOrRegisterGlobalDeviceTypeId(
"")) {
LOG_FIRST_N(WARNING, 1)
<< "The `paddle::PlaceType::kCPU/kGPU` is deprecated since version "
"2.3, and will be removed in version 2.4! Please use "
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <string>
#include <unordered_map>
#include "paddle/phi/api/include/dll_decl.h"
......@@ -37,11 +38,21 @@ enum class AllocationType : int8_t {
CUSTOM = 9,
};
const char* AllocationTypeStr(AllocationType type);
class CustomRegisteredDeviceMap {
public:
static CustomRegisteredDeviceMap& Instance();
size_t GetOrRegisterGlobalDeviceTypeId(const std::string& device_type);
size_t GetOrRegisterGlobalDeviceTypeId(const std::string& device_type);
std::string GetGlobalDeviceType(size_t device_type_id_);
std::string GetGlobalDeviceType(size_t device_type_id_);
private:
CustomRegisteredDeviceMap() = default;
std::unordered_map<std::string, size_t> registered_device_type_id_;
std::unordered_map<size_t, std::string> registered_device_type_;
};
const char* AllocationTypeStr(AllocationType type);
/// \brief The place is used to specify where the data is stored.
class PADDLE_API Place {
......@@ -53,12 +64,14 @@ class PADDLE_API Place {
const std::string& dev_type = "")
: device(id),
alloc_type_(type),
device_type_id_(GetOrRegisterGlobalDeviceTypeId(dev_type)) {}
device_type_id_(phi::CustomRegisteredDeviceMap::Instance()
.GetOrRegisterGlobalDeviceTypeId(dev_type)) {}
explicit Place(AllocationType type, const std::string& dev_type = "")
: device(0),
alloc_type_(type),
device_type_id_(GetOrRegisterGlobalDeviceTypeId(dev_type)) {}
device_type_id_(phi::CustomRegisteredDeviceMap::Instance()
.GetOrRegisterGlobalDeviceTypeId(dev_type)) {}
// See NOTE [ Why need to temporarily adapt to PlaceType? ]
Place(paddle::PlaceType type); // NOLINT
......@@ -69,7 +82,8 @@ class PADDLE_API Place {
alloc_type_ = type;
device = device_id;
if (!dev_type.empty()) {
device_type_id_ = GetOrRegisterGlobalDeviceTypeId(dev_type);
device_type_id_ = phi::CustomRegisteredDeviceMap::Instance()
.GetOrRegisterGlobalDeviceTypeId(dev_type);
}
}
......@@ -78,7 +92,8 @@ class PADDLE_API Place {
int8_t GetDeviceId() const { return device; }
std::string GetDeviceType() const {
return GetGlobalDeviceType(device_type_id_);
return phi::CustomRegisteredDeviceMap::Instance().GetGlobalDeviceType(
device_type_id_);
}
std::string DebugString() const;
......
......@@ -110,6 +110,7 @@ class ArgumentMappingContext {
virtual bool IsSelectedRowsInput(const std::string& name) const = 0;
virtual bool IsSelectedRowsInputs(const std::string& name) const = 0;
virtual bool IsSparseCooTensorInput(const std::string& name) const = 0;
virtual bool IsSparseCooTensorOutput(const std::string& name) const = 0;
virtual bool IsSparseCsrTensorInput(const std::string& name) const = 0;
// For compatibility with LoDTensorArray
virtual bool IsDenseTensorVectorInput(const std::string& name) const = 0;
......
......@@ -46,7 +46,8 @@ Backend TransToPhiBackend(const phi::Place& place) {
case AllocationType::CUSTOM:
return static_cast<Backend>(
static_cast<size_t>(Backend::NUM_BACKENDS) +
GetOrRegisterGlobalDeviceTypeId(place.GetDeviceType()));
phi::CustomRegisteredDeviceMap::Instance()
.GetOrRegisterGlobalDeviceTypeId(place.GetDeviceType()));
default:
PADDLE_THROW(phi::errors::InvalidArgument(
"Unsupported transform %s to phi Backend.", place));
......@@ -91,7 +92,9 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
size_t device_type_id_ = static_cast<size_t>(backend) -
static_cast<size_t>(Backend::NUM_BACKENDS);
std::string device_type = phi::GetGlobalDeviceType(device_type_id_);
std::string device_type =
phi::CustomRegisteredDeviceMap::Instance().GetGlobalDeviceType(
device_type_id_);
if (!device_type.empty()) {
return phi::CustomPlace(
device_type,
......
......@@ -1010,6 +1010,18 @@ PADDLE_DEFINE_EXPORTED_bool(enable_cinn_auto_tune,
#endif
/*
* CUDA Graph related FLAG
* Name: FLAGS_new_executor_use_cuda_graph
* Since Version: 2.4
* Value Range: bool, default=false
 * Example: FLAGS_new_executor_use_cuda_graph=true would allow the
 * new executor to use CUDA Graph.
*/
PADDLE_DEFINE_EXPORTED_bool(new_executor_use_cuda_graph,
false,
"Use CUDA Graph in new executor");
DEFINE_int32(record_pool_max_size,
2000000,
"SlotRecordDataset slot record pool max size");
......@@ -1181,3 +1193,16 @@ PADDLE_DEFINE_EXPORTED_int32(cudnn_cache_saturation_count, 1, "");
PADDLE_DEFINE_EXPORTED_bool(trt_ibuilder_cache,
false,
"Add a persistent ibuilder.");
/**
* mmap_allocator related FLAG
* Name: use_shm_cache
* Since Version: 2.5.0
* Value Range: bool, default=true
 * Example: FLAGS_use_shm_cache=false would disable the shm file cache.
 * Note: If True, mmap_allocator will cache the shm file to reduce munmap
 * operations.
*/
PADDLE_DEFINE_EXPORTED_bool(use_shm_cache,
true,
"Use shm cache in mmap_allocator.");
......@@ -101,6 +101,12 @@ struct KernelArgsParseFunctor<Return_ (*)(Args_...)> {
default_tensor_layout,
default_key.dtype(),
arg_type);
} else if (arg_type ==
std::type_index(typeid(const phi::ExtendedTensor&))) {
args_def->AppendInput(default_key.backend(),
default_tensor_layout,
default_key.dtype(),
arg_type);
} else if (arg_type == std::type_index(typeid(
const std::vector<const ExtendedTensor*>&))) {
args_def->AppendInput(default_key.backend(),
......
The diffs of the remaining changed files are collapsed and not shown here.