diff --git a/README.md b/README.md
index f058b20e1f0322b2fabd39bca1f61dc6f2a4a552..78396610c07933122675a8f9a9fee99007213135 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,7 @@ English | [简体中文](./README_cn.md)
 [![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://paddlepaddle.org.cn/documentation/docs/zh/guides/index_cn.html)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
+[![Twitter](https://img.shields.io/badge/Twitter-1ca0f1.svg?logo=twitter&logoColor=white)](https://twitter.com/PaddlePaddle_)
 
 Welcome to the PaddlePaddle GitHub.
 
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index c7a4e1d99bff16feff685c6da98ef72cdd9d89b7..aa8ab62d7ae28197e9f18b83440f75c8c68d8fff 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -23,7 +23,7 @@ set(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
 # in case of low internet speed
 #set(WARPCTC_REPOSITORY https://gitee.com/tianjianhe/warp-ctc.git)
 set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git)
-set(WARPCTC_TAG 37ece0e1bbe8a0019a63ac7e6462c36591c66a5b)
+set(WARPCTC_TAG bdc2b4550453e0ef2d3b5190f9c6103a84eff184)
 
 set(WARPCTC_INCLUDE_DIR
     "${WARPCTC_INSTALL_DIR}/include"
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index e626669d2a73af1cb7125663a78c128d62ba28ae..8e08eb84b9f3577dc072f8ffe31d231e03cfea90 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -26,7 +26,7 @@ function(find_register FILENAME PATTERN OUTPUT)
       PARENT_SCOPE)
 endfunction()
 
-function(find_phi_register FILENAME ADD_PATH)
+function(find_phi_register FILENAME ADD_PATH PATTERN)
   # set op_name to OUTPUT
   set(options "")
   set(oneValueArgs "")
@@ -36,11 +36,11 @@ function(find_phi_register FILENAME ADD_PATH)
   string(
     REGEX
     MATCH
-    "PD_REGISTER_KERNEL\\([ \t\r\n]*[a-z0-9_]*,[[ \\\t\r\n\/]*[a-z0-9_]*]?[ \\\t\r\n]*[a-zA-Z]*,[ \\\t\r\n]*[A-Z_]*"
+    "${PATTERN}\\([ \t\r\n]*[a-z0-9_]*,[[ \\\t\r\n\/]*[a-z0-9_]*]?[ \\\t\r\n]*[a-zA-Z]*,[ \\\t\r\n]*[A-Z_]*"
     register
     "${CONTENT}")
   if(NOT register STREQUAL "")
-    string(REPLACE "PD_REGISTER_KERNEL(" "" register "${register}")
+    string(REPLACE "${PATTERN}(" "" register "${register}")
     string(REPLACE "," ";" register "${register}")
     string(REGEX REPLACE "[ \\\t\r\n]+" "" register "${register}")
     string(REGEX REPLACE "//cuda_only" "" register "${register}")
@@ -401,7 +401,8 @@ function(op_library TARGET)
       # pybind USE_OP_ITSELF
       set(op_name "")
       # Add PHI Kernel Registry Message
-      find_phi_register(${cc_src} ${pybind_file})
+      find_phi_register(${cc_src} ${pybind_file} "PD_REGISTER_KERNEL")
+      find_phi_register(${cc_src} ${pybind_file} "PD_REGISTER_GENERAL_KERNEL")
       find_register(${cc_src} "REGISTER_OPERATOR" op_name)
       if(NOT ${op_name} EQUAL "")
         file(APPEND ${pybind_file} "USE_OP_ITSELF(${op_name});\n")
@@ -440,7 +441,8 @@ function(op_library TARGET)
     foreach(cu_src ${cu_srcs})
       set(op_name "")
       # Add PHI Kernel Registry Message
-      find_phi_register(${cu_src} ${pybind_file})
+      find_phi_register(${cu_src} ${pybind_file} "PD_REGISTER_KERNEL")
+      find_phi_register(${cu_src} ${pybind_file} "PD_REGISTER_GENERAL_KERNEL")
      find_register(${cu_src} "REGISTER_OP_CUDA_KERNEL" op_name)
       if(NOT ${op_name} EQUAL "")
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, CUDA);\n")
diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
index 6fac4d2c64080d7af7d200696c4430dcf91f96f0..602f966cf8ebc2c88d87903b956faa1bc40107e5 100755
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -446,7 +446,8 @@ endif()
 
 if(WITH_DISTRIBUTE
    AND NOT WITH_PSLIB
-   AND NOT WITH_PSCORE)
+   AND NOT WITH_PSCORE
+   AND NOT WITH_RPC)
   include(external/snappy)
   list(APPEND third_party_deps extern_snappy)
 
diff --git a/paddle/fluid/distributed/collective/process_group_bkcl.cc b/paddle/fluid/distributed/collective/process_group_bkcl.cc
index 5a8d4bca1cc77b15a9c31f3fc2e1207949f615d5..de4a84bff4808f96ba5fecb3bb7f1fece54c3870 100644
--- a/paddle/fluid/distributed/collective/process_group_bkcl.cc
+++ b/paddle/fluid/distributed/collective/process_group_bkcl.cc
@@ -16,6 +16,7 @@
 
 #include "paddle/fluid/distributed/collective/bkcl_tools.h"
 #include "paddle/fluid/distributed/collective/common.h"
+#include "paddle/fluid/distributed/collective/utils.h"
 #include "paddle/fluid/platform/device/xpu/bkcl_helper.h"
 #include "paddle/fluid/platform/device/xpu/xpu_info.h"
 #include "paddle/phi/core/device_context.h"
@@ -87,6 +88,73 @@ void ProcessGroupBKCL::GroupEnd() {
   PADDLE_ENFORCE_XPU_SUCCESS(bkcl_group_end());
 }
 
+std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Recv(
+    phi::DenseTensor* tensor,
+    int src_rank,
+    int64_t offset,
+    int64_t numel,
+    bool sync_op,
+    bool use_calc_stream) {
+  // numel > 0 indicates that the tensor needs to be sliced
+  phi::DenseTensor partial_tensor;
+  if (numel > 0) {
+    partial_tensor = GetPartialTensor(*tensor, offset, numel);
+    tensor = &partial_tensor;
+  }
+
+  return Collective(
+      tensor,
+      // have to pass a tensor here
+      // TODO(zhangxiaoci) catch up with nccl's api
+      *tensor,
+      [&](phi::DenseTensor* output,
+          const phi::DenseTensor& input,
+          BKCLContext_t comm,
+          const XPUStream& stream) {
+        return bkcl_recv(comm,
+                         output->data(),
+                         output->numel(),
+                         src_rank,
+                         platform::ToBKCLDataType(
+                             framework::TransToProtoVarType(output->type())),
+                         stream);
+      },
+      CommType::RECV,
+      sync_op,
+      use_calc_stream);
+}
+
+std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Send(
+    const phi::DenseTensor& tensor,
+    int dst_rank,
+    int64_t offset,
+    int64_t numel,
+    bool sync_op,
+    bool use_calc_stream) {
+  // numel > 0 indicates that the tensor needs to be sliced
+  const phi::DenseTensor& tensor_maybe_partial =
+      numel > 0 ? GetPartialTensor(tensor, offset, numel) : tensor;
+
+  return Collective(
+      nullptr,
+      tensor_maybe_partial,
+      [&](phi::DenseTensor* output,
+          const phi::DenseTensor& input,
+          BKCLContext_t comm,
+          const XPUStream& stream) {
+        return bkcl_send(comm,
+                         input.data(),
+                         input.numel(),
+                         dst_rank,
+                         platform::ToBKCLDataType(
+                             framework::TransToProtoVarType(input.type())),
+                         stream);
+      },
+      CommType::SEND,
+      sync_op,
+      use_calc_stream);
+}
+
 std::shared_ptr<ProcessGroupBKCL::BKCLTask> ProcessGroupBKCL::CreateTask(
     const Place& place,
     int rank,
@@ -136,6 +204,8 @@ void ProcessGroupBKCL::CreateBKCLEnvCache(const Place& place,
   BKCLContext_t bkcl_comm;
   BKCLCHECK(bkcl_init_rank(&bkcl_comm, GetRank(), GetSize(), &bkcl_id));
   comm_ctx->SetBkclContext(bkcl_comm);
+  // comm context creates a separate XPU stream for communication
+  comm_ctx->CreateStream();
 
   place_to_calc_ctx_[place_key] = calc_ctx;
   place_to_comm_ctx_[place_key] = std::move(comm_ctx);
diff --git a/paddle/fluid/distributed/collective/process_group_bkcl.h b/paddle/fluid/distributed/collective/process_group_bkcl.h
index cf8c983d8e66a884a54d2426b03685102140ebfb..1ecf8c9c0ff96af52038fa6be540263de913297b 100644
--- a/paddle/fluid/distributed/collective/process_group_bkcl.h
+++ b/paddle/fluid/distributed/collective/process_group_bkcl.h
@@ -87,25 +87,25 @@ class ProcessGroupBKCL : public ProcessGroupWithStream {
   phi::DeviceContext* GetDeviceContext(const Place& place,
                                        bool use_calc_stream) const override;
 
-  std::shared_ptr<ProcessGroup::Task> AllReduce(
+  std::shared_ptr<ProcessGroup::Task> AllGather(
       phi::DenseTensor* out_tensor,
       const phi::DenseTensor& in_tensor,
-      const AllreduceOptions& opts,
+      int64_t offset,  // for compatibility, no use now
+      int64_t numel,   // for compatibility, no use now
       bool sync_op,
       bool use_calc_stream) override;
 
-  std::shared_ptr<ProcessGroup::Task> Broadcast(
+  std::shared_ptr<ProcessGroup::Task> AllReduce(
       phi::DenseTensor* out_tensor,
       const phi::DenseTensor& in_tensor,
-      const BroadcastOptions& opts,
+      const AllreduceOptions& opts,
       bool sync_op,
       bool use_calc_stream) override;
 
-  std::shared_ptr<ProcessGroup::Task> AllGather(
+  std::shared_ptr<ProcessGroup::Task> Broadcast(
      phi::DenseTensor* out_tensor,
       const phi::DenseTensor& in_tensor,
-      int64_t offset,  // for compatibility, no use now
-      int64_t numel,   // for compatibility, no use now
+      const BroadcastOptions& opts,
       bool sync_op,
       bool use_calc_stream) override;
 
@@ -115,6 +115,20 @@ class ProcessGroupBKCL : public ProcessGroupWithStream {
       bool sync_op,
       bool use_calc_stream) override;
 
+  std::shared_ptr<ProcessGroup::Task> Recv(phi::DenseTensor* tensor,
+                                           int src_rank,
+                                           int64_t offset,
+                                           int64_t numel,
+                                           bool sync_op,
+                                           bool use_calc_stream) override;
+
+  std::shared_ptr<ProcessGroup::Task> Send(const phi::DenseTensor& tensor,
+                                           int dst_rank,
+                                           int64_t offset,
+                                           int64_t numel,
+                                           bool sync_op,
+                                           bool use_calc_stream) override;
+
   std::shared_ptr<ProcessGroup::Task> Barrier(
       const BarrierOptions& = BarrierOptions()) override;
 
diff --git a/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.cc b/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.cc
index 72c689732b5b7df5f61d28d93a3bef6e305f426d..a166ff0b6dfa2f381da02ff0e90dadc08732de5e 100644
--- a/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.cc
+++ b/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.cc
@@ -33,7 +33,7 @@ void AmplifierInterceptor::RunOps() {
   // run_per_steps_, run_at_offset_
   // 4, 0 --> run at step 0, 4, 8, 12
   // 4, 3 --> run at step 3, 7, 11, 15
-  if ((step_ % run_per_steps_) == run_at_offset_) {
+  if ((cur_scope_id_ % run_per_steps_) == run_at_offset_) {
     ComputeInterceptor::RunOps();
   }
 }
@@ -41,7 +41,7 @@ void AmplifierInterceptor::SendDataReadyToDownStream() {
   // run multi times, send ready one times to downstream, that is
   // input multi times, output one times
-  if (step_ % send_down_per_steps_ == 0) {
+  if (cur_scope_id_ % send_down_per_steps_ == 0) {
     ComputeInterceptor::SendDataReadyToDownStream();
   }
 }
@@ -49,7 +49,7 @@ void AmplifierInterceptor::ReplyCompletedToUpStream() {
   // run multi times, reply one times to upstream, that is
   // input one times, output multi times
-  if (step_ % reply_up_per_steps_ == 0) {
+  if (cur_scope_id_ % reply_up_per_steps_ == 0) {
     ComputeInterceptor::ReplyCompletedToUpStream();
   }
 }
diff --git a/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.h b/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.h
index 776aa8d3e88db10d551d6fd0180a5da9d6a6f3db..93e8ffa1d75aecc063b05fff84545238e7a1fba2 100644
--- a/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.h
+++ b/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.h
@@ -21,7 +21,7 @@
 namespace paddle {
 namespace distributed {
 
-class AmplifierInterceptor : public ComputeInterceptor {
+class AmplifierInterceptor final : public ComputeInterceptor {
  public:
   AmplifierInterceptor(int64_t interceptor_id, TaskNode* node);
 
diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc
index 6fb0d55a4859ef39d04857d39d1e70f6a31bb4a3..3449c87998a9dba21824e854afdb7216cb818164 100644
--- a/paddle/fluid/distributed/fleet_executor/carrier.cc
+++ b/paddle/fluid/distributed/fleet_executor/carrier.cc
@@ -71,6 +71,9 @@ void Carrier::Init(
     microbatch_scopes_[i] = &minibatch_scope_->NewScope();
     CopyParameters(i, program, inference_root_scope_vars);
   }
+  // Add source and sink interceptor id to rank
+  interceptor_id_to_rank_.emplace(SOURCE_ID, rank);
+  interceptor_id_to_rank_.emplace(SINK_ID, rank);
 
   // TODO(fleet_exe dev): thread pool
   thread_num_ = 1;
@@ -159,16 +162,10 @@ void Carrier::Start() {
       true,
       platform::errors::PreconditionNotMet(
          "Using carrier before initialized."));
-  for (int64_t id : source_interceptor_ids_) {
-    VLOG(3) << "Carrier Start is sending start to source interceptor " << id
-            << ".";
-    InterceptorMessage start_msg;
-    // source node data_is_ready is send by carrier, so set src_id=-1
-    start_msg.set_src_id(-1);
-    start_msg.set_dst_id(id);
-    start_msg.set_message_type(DATA_IS_READY);
-    Send(start_msg);
-  }
+  InterceptorMessage start_msg;
+  start_msg.set_dst_id(SOURCE_ID);
+  start_msg.set_message_type(START);
+  Send(start_msg);
   // TODO(wangxi): async step
   Wait();
   dev_ctx_->Wait();
@@ -270,6 +267,38 @@ void Carrier::CreateInterceptors() {
 
   auto gc = GetGC(place_);
 
+  // create source and sink task node
+  auto max_run_times = microbatch_scopes_.size();
+  TaskNode* source = new TaskNode(
+      rank_, SOURCE_ID, max_run_times);  // rank, task_id, max_run_times
+  TaskNode* sink = new TaskNode(rank_, SINK_ID, max_run_times);
+  // find nodes without upstreams or without downstreams
+  std::vector<TaskNode*> origin_sources, origin_sinks;
+  for (const auto& item : interceptor_id_to_node_) {
+    TaskNode* task_node = item.second;
+    if (task_node->upstream().empty()) {
+      origin_sources.emplace_back(task_node);
+    }
+    if (task_node->downstream().empty()) {
+      origin_sinks.emplace_back(task_node);
+    }
+  }
+  // link source node with origin source
+  for (const auto& node : origin_sources) {
+    source->AddDownstreamTask(node->task_id(),
+                              std::numeric_limits<int64_t>::max());
+    node->AddUpstreamTask(SOURCE_ID, std::numeric_limits<int64_t>::max());
+  }
+  // link sink node with origin sink
+  for (const auto& node : origin_sinks) {
+    sink->AddUpstreamTask(node->task_id(), std::numeric_limits<int64_t>::max());
+    node->AddDownstreamTask(SINK_ID, std::numeric_limits<int64_t>::max());
+  }
+  // create source and sink interceptor
+  SetInterceptor(SOURCE_ID,
+                 InterceptorFactory::Create("Source", SOURCE_ID, source));
+  SetInterceptor(SINK_ID, InterceptorFactory::Create("Sink", SINK_ID, sink));
+
   // create each Interceptor
   // no auto init since there is no config
   for (const auto& item : interceptor_id_to_node_) {
@@ -303,9 +332,15 @@ void Carrier::CreateInterceptors() {
     VLOG(3) << "Create Interceptor with interceptor id: " << interceptor_id
             << " with type: " << task_node->type() << ".";
 
-    if (task_node->upstream().empty()) {
-      source_interceptor_ids_.emplace_back(interceptor_id);
-    }
+    PADDLE_ENFORCE_EQ(
+        task_node->upstream().empty(),
+        false,
+        platform::errors::PreconditionNotMet(
+            "There should not be normal nodes as source nodes"));
+    PADDLE_ENFORCE_EQ(task_node->downstream().empty(),
+                      false,
+                      platform::errors::PreconditionNotMet(
+                          "There should not be normal nodes as sink nodes"));
   }
 }
 
diff --git a/paddle/fluid/distributed/fleet_executor/carrier.h b/paddle/fluid/distributed/fleet_executor/carrier.h
index fe3d492676655838f6f077718ef65681bcdb53cb..2523942e06223f6210461a625a1a3bce2dcedb92 100644
--- a/paddle/fluid/distributed/fleet_executor/carrier.h
+++ b/paddle/fluid/distributed/fleet_executor/carrier.h
@@ -100,8 +100,6 @@ class Carrier final {
   std::unordered_map<int64_t, std::unique_ptr<Interceptor>>
       interceptor_idx_to_interceptor_;
 
-  std::vector<int64_t> source_interceptor_ids_;
-
   bool is_init_{false};
 
   std::mutex running_mutex_;
diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc
index 5b96ee76e7144692bad974c14a2bce1f6ae2f3b4..5017f81523c8aea31fb8732e001e4af311313d32 100644
--- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc
+++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc
@@ -34,29 +34,10 @@ void ComputeInterceptor::PrepareDeps() {
 
   for (auto up : upstream) {
     in_readys_.emplace(up.first, std::make_pair(up.second, 0));
-    in_stops_.emplace(up.first, false);
   }
   for (auto down : downstream) {
     out_buffs_.emplace(down.first, std::make_pair(down.second, 0));
   }
-
-  // source compute node, should we add a new SourceInterceptor?
-  if (upstream.empty()) {
-    is_source_ = true;
-    PADDLE_ENFORCE_GT(node_->max_run_times(),
-                      0,
-                      platform::errors::InvalidArgument(
-                          "Source ComputeInterceptor must run at least one "
-                          "times, but now max_run_times=%ld",
-                          node_->max_run_times()));
-    in_readys_.emplace(-1,
-                       std::make_pair(std::numeric_limits<int64_t>::max(), 0));
-  }
-
-  // If there is no downstream or every downstream is in different rank,
-  // then this interceptor is the last one for current rank.
-  // This can be get during init, can be cached for later use.
-  is_last_ = downstream.empty();
 }
 
 void ComputeInterceptor::IncreaseReady(int64_t up_id) {
@@ -66,12 +47,6 @@ void ComputeInterceptor::IncreaseReady(int64_t up_id) {
                     platform::errors::NotFound(
                         "Cannot find upstream=%lld in in_readys.", up_id));
 
-  // source node has no upstream, data_is_ready is send by carrier or others
-  if (is_source_ && up_id == -1) {
-    it->second.second += GetTaskNode()->max_run_times();
-    return;
-  }
-
   auto max_ready_size = it->second.first;
   auto ready_size = it->second.second;
   ready_size += 1;
@@ -152,7 +127,7 @@ void ComputeInterceptor::SendDataReadyToDownStream() {
     ready_msg.set_message_type(DATA_IS_READY);
     VLOG(3) << "ComputeInterceptor " << interceptor_id_
             << " Send data_is_ready msg to " << down_id
-            << " for step: " << step_;
+            << " in scope: " << cur_scope_id_;
     Send(down_id, ready_msg);
   }
 }
@@ -173,8 +148,7 @@ void ComputeInterceptor::ReplyCompletedToUpStream() {
 
     VLOG(3) << "ComputeInterceptor " << interceptor_id_
             << " Reply data_is_useless msg to " << up_id
-            << " for step: " << step_;
-    if (is_source_ && up_id == -1) return;
+            << " in scope: " << cur_scope_id_;
 
     InterceptorMessage reply_msg;
     reply_msg.set_message_type(DATA_IS_USELESS);
@@ -183,16 +157,20 @@ void ComputeInterceptor::ReplyCompletedToUpStream() {
 }
 
 void ComputeInterceptor::RunOps() {
-  VLOG(3) << "ComputeInterceptor " << interceptor_id_ << " running ops for the "
-          << step_ + 1 << " time.";
   for (auto op : node_->ops()) {
-    op->Run(*microbatch_scopes_[step_ % node_->max_run_times()], place_);
+    PADDLE_ENFORCE_LT(cur_scope_id_,
+                      microbatch_scopes_.size(),
+                      platform::errors::InvalidArgument(
+                          "Step out of range. There are %ld "
+                          "microbatch_scopes, but received scope index %ld",
+                          microbatch_scopes_.size(),
+                          cur_scope_id_));
+    op->Run(*microbatch_scopes_[cur_scope_id_], place_);
     if (gc_) {
-      framework::DeleteUnusedTensors(
-          *microbatch_scopes_[step_ % node_->max_run_times()],
-          op,
-          node_->unused_vars(),
-          gc_.get());
+      framework::DeleteUnusedTensors(*microbatch_scopes_[cur_scope_id_],
+                                     op,
+                                     node_->unused_vars(),
+                                     gc_.get());
     }
   }
 }
@@ -201,77 +179,28 @@ void ComputeInterceptor::Run() {
   while (IsInputReady() && CanWriteOutput()) {
     VLOG(3) << "id=" << GetInterceptorId() << " ComputeInterceptor running";
 
+    // get the ready scope id from queue
+    cur_scope_id_ = ready_queue_.front();
+    ready_queue_.pop();
+
     RunOps();
-    ++step_;
 
     // send to downstream and increase buff used
     SendDataReadyToDownStream();
     // reply to upstream and decrease ready data
     ReplyCompletedToUpStream();
-    // Try to stop Carrier
-    if (is_last_ && (step_ % node_->max_run_times() == 0)) {
-      VLOG(3) << "Interceptor " << GetInterceptorId()
-              << " is stopping carrier.";
-      // FIXME(wangxi): with multi sink interceptor
-      StopCarrier();
-    }
-  }
-}
-
-void ComputeInterceptor::ReceivedStop(int64_t up_id) {
-  received_stop_ = true;
-
-  // source node has no upstream, stop is send by carrier or others
-  if (is_source_ && up_id == -1) return;
-
-  auto it = in_stops_.find(up_id);
-  PADDLE_ENFORCE_NE(it,
-                    in_stops_.end(),
-                    platform::errors::NotFound(
-                        "Cannot find upstream=%lld in in_stops.", up_id));
-  PADDLE_ENFORCE_EQ(
-      it->second,
-      false,
-      platform::errors::AlreadyExists("Already received stop from %lld, stop "
-                                      "cannot be send more than once."));
-  it->second = true;
-}
-
-void ComputeInterceptor::TryStop() {
-  if (!received_stop_) return;
-
-  // can stop only when all upstream is stop and
-  // downstream complete
-  for (auto& in_stop : in_stops_) {
-    if (!in_stop.second) return;
-  }
-  for (auto& out_buff : out_buffs_) {
-    auto used_size = out_buff.second.second;
-    if (used_size != 0) return;
   }
-
-  // send stop to downstream
-  for (auto& out : out_buffs_) {
-    auto down_id = out.first;
-    InterceptorMessage stop;
-    stop.set_message_type(STOP);
-    Send(down_id, stop);
-  }
-  stop_ = true;
 }
 
 void ComputeInterceptor::Compute(const InterceptorMessage& msg) {
   if (msg.message_type() == DATA_IS_READY) {
     IncreaseReady(msg.src_id());
+    ready_queue_.push(msg.scope_idx());
     Run();
   } else if (msg.message_type() == DATA_IS_USELESS) {
     DecreaseBuff(msg.src_id());
     Run();
-  } else if (msg.message_type() == STOP) {
-    ReceivedStop(msg.src_id());
   }
-
-  TryStop();
 }
 
 REGISTER_INTERCEPTOR(Compute, ComputeInterceptor);
diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.h b/paddle/fluid/distributed/fleet_executor/compute_interceptor.h
index fb82ce76c7bdb851c32b1959121059cfca041b94..9709cd4437f1019fea80cf04ecce5a38f74bb463 100644
--- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.h
+++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <queue>
 #include "paddle/fluid/distributed/fleet_executor/interceptor.h"
 
@@ -30,7 +31,8 @@ class ComputeInterceptor : public Interceptor {
   virtual void SendDataReadyToDownStream();
   virtual void ReplyCompletedToUpStream();
 
-  int64_t step_{0};
+  std::queue<int64_t> ready_queue_;
+  int64_t cur_scope_id_;
 
  private:
   void PrepareDeps();
@@ -43,19 +45,10 @@ class ComputeInterceptor : public Interceptor {
   void Run();
   void Compute(const InterceptorMessage& msg);
 
-  void ReceivedStop(int64_t up_id);
-  void TryStop();
-
-  bool is_source_{false};
-  bool is_last_{false};
-
   // upstream_id-->(max_ready_size, ready_size)
   std::map<int64_t, std::pair<int64_t, int64_t>> in_readys_{};
   // downstream_id-->(max_buffer_size, used_size)
   std::map<int64_t, std::pair<int64_t, int64_t>> out_buffs_{};
-
-  bool received_stop_{false};
-  std::map<int64_t, bool> in_stops_{};
 };
 
 }  // namespace distributed
diff --git a/paddle/fluid/distributed/fleet_executor/interceptor.h b/paddle/fluid/distributed/fleet_executor/interceptor.h
index 6a761072027a924f21b38f7a694bba65b77e425d..2c20e1ad6113ecda58404429697fa4077fece492 100644
--- a/paddle/fluid/distributed/fleet_executor/interceptor.h
+++ b/paddle/fluid/distributed/fleet_executor/interceptor.h
@@ -93,7 +93,6 @@ class Interceptor {
   TaskNode* node_;
 
   // for stop
-  bool stop_{false};
   void StopCarrier();
 
   // for runtime
@@ -114,9 +113,6 @@ class Interceptor {
 
   std::mutex mutex_;
   std::deque<InterceptorMessage> messages_;
-
-  int64_t already_run_times_{0};
-  int64_t used_slot_nums_{0};
 };
 
 class InterceptorFactory {
diff --git a/paddle/fluid/distributed/fleet_executor/sink_interceptor.h b/paddle/fluid/distributed/fleet_executor/sink_interceptor.h
index cb1d698a78526fdde61586304e588e8009340584..1abb7a641e23a5237570b9f469009f4fa3fb72a7 100644
--- a/paddle/fluid/distributed/fleet_executor/sink_interceptor.h
+++ b/paddle/fluid/distributed/fleet_executor/sink_interceptor.h
@@ -25,7 +25,7 @@ namespace distributed {
  * 1. record the num of micro-step
  * 2. check whether to notify carrier the current step is finished
  */
-class SinkInterceptor : public Interceptor {
+class SinkInterceptor final : public Interceptor {
  public:
   SinkInterceptor(int64_t interceptor_id, TaskNode* node);
 
diff --git a/paddle/fluid/distributed/fleet_executor/source_interceptor.h b/paddle/fluid/distributed/fleet_executor/source_interceptor.h
index f8b18fb1848645c44c75db90a7d123ba48aeae21..95e8c1b3b03781a653152219a73e6b590cced631 100644
--- a/paddle/fluid/distributed/fleet_executor/source_interceptor.h
+++ b/paddle/fluid/distributed/fleet_executor/source_interceptor.h
@@ -25,7 +25,7 @@ namespace distributed {
  * 1. receive `start` message from carrier
  * 2. send num_of_steps `data_is_ready` message to downstream
  */
-class SourceInterceptor : public Interceptor {
+class SourceInterceptor final : public Interceptor {
  public:
   SourceInterceptor(int64_t interceptor_id, TaskNode* node);
 
diff --git a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc
index 4992a8b34c9da163af6bb64cad0094da9142afb2..e484031161489f4e6cd54403fbd15da0128433e8 100644
--- a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc
+++ b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc
@@ -25,57 +25,42 @@ limitations under the License. */
 namespace paddle {
 namespace distributed {
 
-class StartInterceptor : public Interceptor {
- public:
-  StartInterceptor(int64_t interceptor_id, TaskNode* node)
-      : Interceptor(interceptor_id, node) {
-    RegisterMsgHandle([this](const InterceptorMessage& msg) { NOP(msg); });
-  }
-
-  void NOP(const InterceptorMessage& msg) {
-    if (msg.message_type() == STOP) {
-      stop_ = true;
-      InterceptorMessage stop;
-      stop.set_message_type(STOP);
-      Send(1, stop);  // stop 1, compute
-      return;
-    }
-    std::cout << GetInterceptorId() << " recv msg from " << msg.src_id()
-              << std::endl;
-  }
-};
-
 TEST(ComputeInterceptor, Compute) {
   std::string carrier_id = "0";
   Carrier* carrier =
       GlobalMap<std::string, Carrier>::Create(carrier_id, carrier_id);
-  carrier->Init(0, {{0, 0}, {1, 0}, {2, 0}});
+  carrier->Init(0, {{SOURCE_ID, 0}, {0, 0}, {1, 0}, {SINK_ID, 0}});
 
   MessageBus* msg_bus = GlobalVal<MessageBus>::Create();
   msg_bus->Init(0, {{0, "127.0.0.0:0"}}, "");
 
   // NOTE: don't delete, otherwise interceptor will use undefined node
-  TaskNode* node_a = new TaskNode(0, 0, 0, 3, 0);  // role, rank, task_id
+  TaskNode* source =
+      new TaskNode(0, SOURCE_ID, 3);  // rank, task_id, max_run_times
+  TaskNode* node_a = new TaskNode(0, 0, 0, 3, 0);
   TaskNode* node_b = new TaskNode(0, 0, 1, 3, 0);
-  TaskNode* node_c = new TaskNode(0, 0, 2, 3, 0);
+  TaskNode* sink = new TaskNode(0, SINK_ID, 3);
 
-  // a->b->c
+  // source->a->b->sink
+  source->AddDownstreamTask(0);
+  node_a->AddUpstreamTask(SOURCE_ID);
   node_a->AddDownstreamTask(1, 3);
   node_b->AddUpstreamTask(0, 3);
-  node_b->AddDownstreamTask(2);
-  node_c->AddUpstreamTask(1);
+  node_b->AddDownstreamTask(SINK_ID);
+  sink->AddUpstreamTask(1);
 
-  Interceptor* a =
-      carrier->SetInterceptor(0, std::make_unique<StartInterceptor>(0, node_a));
+  carrier->SetInterceptor(
+      SOURCE_ID, InterceptorFactory::Create("Source", SOURCE_ID, source));
+  carrier->SetInterceptor(0, InterceptorFactory::Create("Compute", 0, node_a));
   carrier->SetInterceptor(1, InterceptorFactory::Create("Compute", 1, node_b));
-  carrier->SetInterceptor(2, InterceptorFactory::Create("Compute", 2, node_c));
+  carrier->SetInterceptor(SINK_ID,
+                          InterceptorFactory::Create("Sink", SINK_ID, sink));
 
+  // start
   InterceptorMessage msg;
-  msg.set_message_type(DATA_IS_READY);
-  // test run three times
-  a->Send(1, msg);
-  a->Send(1, msg);
-  a->Send(1, msg);
+  msg.set_message_type(START);
+  msg.set_dst_id(SOURCE_ID);
+  carrier->EnqueueInterceptorMessage(msg);
 
   carrier->Wait();
   carrier->Release();
diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_test.cc
index 54adf06fb67ddf6e5d9ac803b3aa097289c33c38..f43f3860199fb772bc5d4537a41490a70c8270e5 100644
--- a/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_test.cc
+++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_test.cc
@@ -33,7 +33,6 @@ class PingPongInterceptor : public Interceptor {
   void PingPong(const InterceptorMessage& msg) {
     if (msg.message_type() == STOP) {
-      stop_ = true;
       return;
     }
     std::cout << GetInterceptorId() << " recv msg, count=" << count_
diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_with_brpc_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_with_brpc_test.cc
index 3828c4478cbe6eecad18a88ce5501eae84eb0589..62c23068d7d4a9eb6574aacc53d0c258ae2ddc51 100644
--- a/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_with_brpc_test.cc
+++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_with_brpc_test.cc
@@ -36,7 +36,6 @@ class PingPongInterceptor : public Interceptor {
   void PingPong(const InterceptorMessage& msg) {
     if (msg.message_type() == STOP) {
-      stop_ = true;
       StopCarrier();
       return;
     }
diff --git a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt
index 99f5e789081ba69794f189aeff0cfa8e72f7d34a..d272055d5deda8b01cb58b6a4279c56bc4ce224a 100644
--- a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt
+++ b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt
@@ -22,6 +22,10 @@ if(WITH_ROCM)
   target_link_libraries(eager_generator ${ROCM_HIPRTC_LIB})
 endif()
 
+if(WITH_CINN)
+  target_link_libraries(eager_generator ${PYTHON_LIBRARIES})
+endif()
+
 # Prepare file structure
 message(
   "Generate dygraph file structure at path: ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/generated"
diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc
index aa3695e7fbf4fe3e313458179eee83bbdd40fcdc..f21ca0c858acc0245333985e4e4d8c52421aa57e 100644
--- a/paddle/fluid/framework/feed_fetch_method.cc
+++ b/paddle/fluid/framework/feed_fetch_method.cc
@@ -47,7 +47,7 @@ void SetFeedVariable(Scope* scope,
 }
 
 void SetFeedVariable(Scope* scope,
-                     const Strings& input,
+                     const std::vector<std::string>& input,
                      const std::string& var_name,
                      size_t index) {
   // If var_name Variable is not found in GlobalScope, a new variable will
@@ -59,7 +59,7 @@ void SetFeedVariable(Scope* scope,
     feed_inputs.resize(index + 1);
   }
   // shared data with input tensor
-  feed_inputs[index] = input;
+  feed_inputs[index] = Strings(input);
 }
 
 FetchType& GetFetchVariable(const Scope& scope,
diff --git a/paddle/fluid/framework/feed_fetch_method.h b/paddle/fluid/framework/feed_fetch_method.h
index 356e6d3ddf8cfd6802bc1fe192b6c134b8a94f73..b3d5c91994db5caf9ec1773fdf242f4bd7be6d8b 100644
--- a/paddle/fluid/framework/feed_fetch_method.h
+++ b/paddle/fluid/framework/feed_fetch_method.h
@@ -35,7 +35,7 @@ void SetFeedVariable(Scope* scope,
                      size_t index);
 
 void SetFeedVariable(Scope* scope,
-                     const Strings& input,
+                     const std::vector<std::string>& input,
                      const std::string& var_name,
                      size_t index);
 
diff --git a/paddle/fluid/framework/feed_fetch_type.h b/paddle/fluid/framework/feed_fetch_type.h
index 981a303cd191590546eb75618cc36b4abee63e73..571667bff47eb454c0366398faa6d04d84448219 100644
--- a/paddle/fluid/framework/feed_fetch_type.h
+++ b/paddle/fluid/framework/feed_fetch_type.h
@@ -19,12 +19,14 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/string_array.h"
+#include "paddle/phi/core/extended_tensor.h"
 
 namespace paddle {
 namespace framework {
 using FeedType = paddle::variant
-using FeedList = std::vector<FeedType>;
+
+using FeedList = paddle::framework::PhiVector<FeedType>;
 
 using FetchType = paddle::variant
diff --git a/paddle/fluid/framework/infershape_utils_test.cc b/paddle/fluid/framework/infershape_utils_test.cc
+template <typename T, typename Context>
+void InferShapeUtilsTestOutputKernel(const Context& dev_ctx,
+                                     const phi::DenseTensor& x,
+                                     phi::SparseCooTensor* out) {
+  VLOG(6) << "Come into InferShapeUtilsTestOutputKernel";
+}
+
 }  // namespace framework
 }  // namespace paddle
 
@@ -143,6 +182,21 @@ PD_REGISTER_KERNEL(infer_shape_utils_test,
                    paddle::framework::InferShapeUtilsTestKernel,
                    int) {}
 
+DECLARE_INFER_SHAPE_FUNCTOR(
+    infer_shape_utils_test_output,
+    InferShapeUtilsTestOutputInferShapeFunctor,
+    PD_INFER_META(paddle::framework::TestOutputInferMeta));
+REGISTER_OPERATOR(infer_shape_utils_test_output,
+                  paddle::framework::InferShapeUtilsTestOutputOp,
+                  paddle::framework::InferShapeUtilsTestOutputOpMaker,
+                  InferShapeUtilsTestOutputInferShapeFunctor);
+
+PD_REGISTER_KERNEL(test_sparse_coo_tensor_output,
+                   CPU,
+                   ALL_LAYOUT,
+                   paddle::framework::InferShapeUtilsTestOutputKernel,
+                   int) {}
+
 TEST(InferShapeUtilsTest, ALL) {
   paddle::framework::ProgramDesc prog;
   paddle::framework::proto::BlockDesc proto_block;
   paddle::framework::BlockDesc block_desc(&prog, &proto_block);
@@ -200,3 +254,27 @@
 
   op->InferShape(block_desc);
 }
+
+TEST(InferShapeUtilsTestOutput, ALL) {
+  paddle::framework::ProgramDesc prog;
+  paddle::framework::proto::BlockDesc proto_block;
+  paddle::framework::BlockDesc block_desc(&prog, &proto_block);
+
+  auto* op = block_desc.AppendOp();
+  op->SetType("infer_shape_utils_test_output");
+
+  auto* x = block_desc.Var("x");
+  x->SetType(paddle::framework::proto::VarType::LOD_TENSOR);
+  x->SetDataType(paddle::framework::proto::VarType::FP32);
+  op->SetInput("X", {"x"});
+
+  auto* out = block_desc.Var("out");
+  out->SetType(paddle::framework::proto::VarType::SPARSE_COO);
+  op->SetOutput("Out", {"out"});
+
+  phi::OpUtilsMap::Instance().InsertArgumentMappingFn(
+      "infer_shape_utils_test_output",
+      paddle::framework::TestSparseOutputOpArgumentMapping);
+
+  op->InferShape(block_desc);
+}
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index 6eda1f4b23f8e57cbadfa30cd99d0a2514a87be9..b387dc1d6cc26b5cd0cf6ea45014f8986c4035ab 100755
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -460,14 +460,6 @@ if(WITH_MKLDNN)
     test_cpu_quantize_squash_pass
     SRCS mkldnn/cpu_quantize_squash_pass_tester.cc
     DEPS cpu_quantize_squash_pass naive_executor)
-  cc_test(
-    test_reshape_transpose_matmul_mkldnn_fuse_pass
-    SRCS mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc
-    DEPS reshape_transpose_matmul_mkldnn_fuse_pass)
-  cc_test(
-    test_matmul_transpose_reshape_fuse_pass
-    SRCS mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass_tester.cc
-    DEPS matmul_transpose_reshape_mkldnn_fuse_pass)
   cc_test(
     test_shuffle_channel_mkldnn_detect_pass
     SRCS mkldnn/shuffle_channel_mkldnn_detect_pass_tester.cc
diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass_tester.cc
deleted file mode 100644
index 4149bb2347317a00cd3094bd535ddb9ae704463d..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass_tester.cc
+++ /dev/null
@@ -1,109 +0,0 @@
-// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <gtest/gtest.h>
-
-#include "paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void SetOp(ProgramDesc *prog,
-           const std::string &type,
-           const std::vector<std::string> &inputs,
-           const std::vector<std::string> &outputs) {
-  auto *op = prog->MutableBlock(0)->AppendOp();
-  op->SetType(type);
-  op->SetInput("X", {inputs[0]});
-  op->SetOutput("Out", {outputs[0]});
-  if (type == "transpose2") {
-    op->SetAttr("axis", std::vector<int>({0, 2, 1, 3}));
-    op->SetOutput("XShape", {outputs[1]});
-  }
-  if (type == "reshape2") {
-    op->SetAttr("shape", std::vector<int>({4, 5, 6}));
-    op->SetOutput("XShape", {outputs[1]});
-  }
-
-  if (type == "matmul") {
-    op->SetInput("Y", {inputs[1]});
-    op->SetAttr("use_mkldnn", true);
-    op->SetAttr("alpha", 1.0f);
-    op->SetAttr("transpose_X", true);
-    op->SetAttr("transpose_Y", true);
-  }
-  if (type == "matmul_v2") {
-    op->SetInput("Y", {inputs[1]});
-    op->SetAttr("use_mkldnn", true);
-    op->SetAttr("trans_x", true);
-    op->SetAttr("trans_y", true);
-  }
-}
-
-ProgramDesc BuildProgramDesc(const std::string &op_name) {
-  ProgramDesc prog;
-  for (auto &v : std::initializer_list<std::string>(
-           {"a1", "a2", "b", "c", "cx", "d", "dx", "e"})) {
-    auto *var = prog.MutableBlock(0)->Var(v);
-    var->SetType(proto::VarType::SELECTED_ROWS);
-  }
-
-  SetOp(&prog, op_name, {"a1", "a2"}, {"b"});
-  SetOp(&prog, "transpose2", {"b"}, {"c", "cx"});
-  SetOp(&prog, "reshape2", {"c"}, {"d", "dx"});
-  SetOp(&prog, "fc", {"d"}, {"e"});
-
-  return prog;
-}
-
-void MainTest(const ProgramDesc &prog, const std::string &op_name) {
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
-
-  int original_nodes_num = graph->Nodes().size();
-
-  auto pass =
-      PassRegistry::Instance().Get("matmul_transpose_reshape_mkldnn_fuse_pass");
-  graph.reset(pass->Apply(graph.release()));
-
-  int current_nodes_num = graph->Nodes().size();
-  EXPECT_EQ(original_nodes_num - 6, current_nodes_num);
-
-  for (auto *node : graph->Nodes()) {
-    if (node->IsOp()) {
-      auto *op = node->Op();
-      if (op->Type() == op_name) {
-        EXPECT_EQ(op->GetAttrIfExists<std::vector<int>>("fused_reshape_Out"),
-                  std::vector<int>({4, 5, 6}));
-        EXPECT_EQ(op->GetAttrIfExists<std::vector<int>>("fused_transpose_Out"),
-                  std::vector<int>({0, 2, 1, 3}));
-      }
-    }
-  }
-}
-
-TEST(MatmulTransposeReshapeFusePass, matmul_fuse_pass) {
-  auto prog = BuildProgramDesc("matmul");
-  MainTest(prog, "matmul");
-}
-
-TEST(MatmulTransposeReshapeFusePass, matmul_v2_fuse_pass) {
-  auto prog = BuildProgramDesc("matmul_v2");
-  MainTest(prog, "matmul_v2");
-}
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-USE_PASS(matmul_transpose_reshape_mkldnn_fuse_pass);
diff --git a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc
deleted file mode 100644
index 2dd13573d98a054167db0a7686d106fb151af605..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc
+++ /dev/null
@@ -1,138 +0,0 @@
-// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <gtest/gtest.h>
-
-#include "paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h"
-#include "paddle/fluid/framework/ir/pass_tester_helper.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void AddVarToScope(Scope* param_scope,
-                   const std::string& name,
-                   const DDim& dims) {
-  auto* tensor = param_scope->Var(name)->GetMutable<LoDTensor>();
-  tensor->Resize(dims);
-  tensor->mutable_data<float>(phi::CPUPlace());
-}
-
-Scope* CreateParamScope() {
-  auto param_scope = new Scope();
-  AddVarToScope(param_scope, "w1", {768, 768});
-  AddVarToScope(param_scope, "bias1", {768});
-  AddVarToScope(param_scope, "w2", {768, 768});
-  AddVarToScope(param_scope, "bias2", {768});
-  return param_scope;
-}
-
-void TestMain(const std::string& op_name, bool with_xshapes) {
-  // inputs        operator     output
-  // -----------------------------------------------
-  // a1,w1,bias1   fc        -> b1
-  // b1            reshape   -> c1
-  // c1            transpose -> d1
-  // a2,w2,bias2   fc        -> b2
-  // b2            reshape   -> c2
-  // c2            transpose -> d2
-  // (d1, d2)      matmul(_v2) -> (...)
-  Layers layers;
-  auto* a1 = layers.data("a1", {-1, 128, 768});
-  auto* w1 = layers.data("w1", {768, 768}, true);
-  auto* bias1 = layers.data("bias1", {768}, true);
-  auto* b1 = layers.fc(a1, w1, bias1, 2);
-  b1->SetShape({-1, 128, 768});
-  auto* c1 = layers.reshape2(b1, {0, 0, 12, 64}, with_xshapes);
-  c1->SetShape({-1, 128, 12, 64});
-  auto* d1 = layers.transpose2(c1, {0, 2, 1, 3}, with_xshapes);
-  d1->SetShape({-1, 12, 128, 64});
-  auto* a2 = layers.data("a2", {-1, 128, 768});
-  auto* w2 = layers.data("w2", {768, 768}, true);
-  auto* bias2 = layers.data("bias2", {768}, true);
-  auto* b2 = layers.fc(a2, w2, bias2, 2);
-  b2->SetShape({-1, 128, 768});
-  auto* c2 = layers.reshape2(b2, {0, 0, 12, 64});
-  c2->SetShape({-1, 128, 12, 64});
-  auto* d2 = layers.transpose2(c2, {0, 2, 1, 3});
-  d2->SetShape({-1, 12, 128, 64});
-  if (op_name == "matmul_v2") {
-    layers.matmul_v2(d1, d2);
-  } else {
-    layers.matmul(d1, d2);
-  }
-
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(layers.main_program()));
-  graph->Set("__param_scope__", CreateParamScope());
-
-  int num_reshape_nodes_before = GetNumOpNodes(graph, "reshape2");
-  int num_transpose_nodes_before = GetNumOpNodes(graph, "transpose2");
-  int total_nodes_before = graph->Nodes().size();
-  VLOG(3) << DebugString(graph);
-
-  auto pass =
-      PassRegistry::Instance().Get("reshape_transpose_matmul_mkldnn_fuse_pass");
-  graph.reset(pass->Apply(graph.release()));
-
-  int num_reshape_nodes_after = GetNumOpNodes(graph, "reshape2");
-  int num_transpose_nodes_after = GetNumOpNodes(graph, "transpose2");
-  int total_nodes_after = graph->Nodes().size();
-  VLOG(3) << DebugString(graph);
-
-  EXPECT_EQ(num_reshape_nodes_before, 2);
-  EXPECT_EQ(num_reshape_nodes_after, 0);
-  EXPECT_EQ(num_transpose_nodes_before, 2);
-  EXPECT_EQ(num_transpose_nodes_after, 0);
-  int removed = 8;  // 2* reshape, reshape_out, transpose, transpose_out
-  if (with_xshapes) removed += 2;  // transpose_xshape, reshape_xshape
-  EXPECT_EQ(total_nodes_before - removed, total_nodes_after);
-  auto* matmul_op_desc = GetOpNodes(graph, op_name).at(0)->Op();
-
-  auto check = [&matmul_op_desc](std::string a) {
-    std::string shape_str = "fused_reshape_" + a;
-    auto shape = matmul_op_desc->GetAttrIfExists<std::vector<int>>(shape_str);
-    EXPECT_EQ(shape, (std::vector<int>{0, 0, 12, 64}));
-    std::string axis_str = "fused_transpose_" + a;
-    auto axis = matmul_op_desc->GetAttrIfExists<std::vector<int>>(axis_str);
-    EXPECT_EQ(axis, (std::vector<int>{0, 2, 1, 3}));
-  };
-  check("X");
-  check("Y");
-}
-
-TEST(ReshapeTransposeMatmulMkldnnFusePass,
-     both_matmul_inputs_reshape_transpose) {
-  TestMain("matmul", false);
-}
-
-TEST(ReshapeTransposeMatmulMkldnnFusePass,
-     both_matmul_inputs_reshape_transpose_one_with_xshapes) {
-  TestMain("matmul", true);
-}
-
-TEST(ReshapeTransposeMatmulV2MkldnnFusePass,
-     both_matmulv2_inputs_reshape_transpose) {
-  TestMain("matmul_v2", false);
-}
-
-TEST(ReshapeTransposeMatmulV2MkldnnFusePass,
-     both_matmulv2_inputs_reshape_transpose_one_with_xshapes) {
-  TestMain("matmul_v2", true);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-USE_PASS(reshape_transpose_matmul_mkldnn_fuse_pass);
diff --git a/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc b/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc
index e7e925a47797faf6aa2457ca78e62b8a6ee1bef2..73e6664f66f1e04a810e4ed58d13f5b7c05e528e 100644
--- a/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc
+++ b/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc
@@ -19,14 +19,27 @@
 #include "paddle/fluid/framework/new_executor/garbage_collector/no_event_garbage_collector.h"
 
 DECLARE_bool(fast_eager_deletion_mode);
+DECLARE_bool(new_executor_use_cuda_graph);
 
 namespace paddle {
 namespace framework {
 
 bool IsInterpretercoreFastGCEnabled() {
-  return memory::allocation::AllocatorFacade::Instance()
-             .IsStreamSafeCUDAAllocatorUsed() &&
-         FLAGS_fast_eager_deletion_mode;
+  // When using cuda graph, fast GC must be used. Because
+  // `EventQuery` method in event GC cannot be used in
+  // cuda graph.
+  PADDLE_ENFORCE_EQ(memory::allocation::AllocatorFacade::Instance()
+                            .IsStreamSafeCUDAAllocatorUsed() == false &&
+                        FLAGS_new_executor_use_cuda_graph,
+                    false,
+                    platform::errors::InvalidArgument(
+                        "When FLAGS_new_executor_use_cuda_graph is true, "
+                        "IsStreamSafeCUDAAllocatorUsed must be true, but "
+                        "got false."));
+  return (memory::allocation::AllocatorFacade::Instance()
+              .IsStreamSafeCUDAAllocatorUsed() &&
+          FLAGS_fast_eager_deletion_mode) ||
+         FLAGS_new_executor_use_cuda_graph;
 }
 
 InterpreterCoreGarbageCollector::InterpreterCoreGarbageCollector() {
diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index 4f2a4f48b7f99749963eaa39160360b54a5620d0..63525330ea60debc6db363d96f6049153cd4550a 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -31,6 +31,7 @@
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
+#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
 #include "paddle/phi/backends/device_manager.h"
 
 PADDLE_DEFINE_EXPORTED_bool(
@@ -50,6 +51,10 @@ PADDLE_DEFINE_EXPORTED_bool(control_flow_use_new_executor,
 
 DECLARE_bool(check_nan_inf);
 DECLARE_bool(benchmark);
+DECLARE_bool(new_executor_use_cuda_graph);
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+DECLARE_bool(sync_nccl_allreduce);
+#endif
 
 constexpr const char* kExceptionCaught = "ExceptionCaught";
 constexpr const char* kTaskCompletion = "TaskCompletion";
@@ -142,6 +147,8 @@ InterpreterCore::InterpreterCore(const platform::Place& place,
     }
     return lhs_prority > rhs_prority;
   };
+
+  PrepareForCUDAGraphCapture();
 }
 
 InterpreterCore::~InterpreterCore() {
@@ -161,6 +168,7 @@ interpreter::CostInfo InterpreterCore::DryRun(
    const std::vector<std::string>& feed_names,
     const std::vector<phi::DenseTensor>& feed_tensors) {
   SetDeviceId(place_);
+  CheckCUDAGraphBeforeRun(feed_names);
 
   Prepare(feed_names, feed_tensors, true);
   interpreter::CostInfo cost_info;
@@ -221,6 +229,7 @@ paddle::framework::FetchList InterpreterCore::Run(
     const std::vector<std::string>& feed_names,
     const std::vector<phi::DenseTensor>& feed_tensors) {
   SetDeviceId(place_);
+  CheckCUDAGraphBeforeRun(feed_names);
 
 #ifdef PADDLE_WITH_MKLDNN
   platform::AttachPointerHashToMKLDNNKey(this, place_);
@@ -240,7 +249,16 @@ paddle::framework::FetchList InterpreterCore::Run(
   // return Fetch Tensors
   auto* fetch_var = local_scope_->FindVar(interpreter::kFetchVarName);
   if (fetch_var) {
-    return std::move(*fetch_var->GetMutable<framework::FetchList>());
+    auto fetch_list = std::move(*fetch_var->GetMutable<framework::FetchList>());
+#ifdef PADDLE_WITH_CUDA
+    if (platform::IsCUDAGraphCapturing()) {
+      PADDLE_ENFORCE_EQ(fetch_list.empty(),
+                        true,
+                        platform::errors::InvalidArgument(
+                            "Cannot fetch data when using CUDA Graph."));
+    }
+#endif
+    return fetch_list;
   } else {
     return {};
   }
@@ -249,6 +267,7 @@ paddle::framework::FetchList InterpreterCore::Run(
     const std::vector<std::string>& feed_names, bool need_fetch) {
   SetDeviceId(place_);
+  CheckCUDAGraphBeforeRun(feed_names);
 
 #ifdef PADDLE_WITH_MKLDNN
   platform::AttachPointerHashToMKLDNNKey(this, place_);
@@ -290,7 +309,16 @@ paddle::framework::FetchList InterpreterCore::Run(
       HasLocalScope() ? local_scope_ : var_scope_.GetMutableScope();
   auto* fetch_var = inner_scope->FindVar(interpreter::kFetchVarName);
   if (fetch_var && need_fetch) {
-    return std::move(*fetch_var->GetMutable<framework::FetchList>());
+    auto fetch_list = std::move(*fetch_var->GetMutable<framework::FetchList>());
+#ifdef PADDLE_WITH_CUDA
+    if (platform::IsCUDAGraphCapturing()) {
+      PADDLE_ENFORCE_EQ(fetch_list.empty(),
+                        true,
+                        platform::errors::InvalidArgument(
+                            "Cannot fetch data when using CUDA Graph."));
+    }
+#endif
+    return fetch_list;
   } else {
     return {};
   }
@@ -504,6 +532,67 @@ void InterpreterCore::BuildInplace() {
   }
 }
 
+void InterpreterCore::PrepareForCUDAGraphCapture() {
+  if (!FLAGS_new_executor_use_cuda_graph) return;
+#ifdef PADDLE_WITH_CUDA
+  PADDLE_ENFORCE_EQ(
+      platform::IsCUDAGraphCapturing(),
+      false,
+      platform::errors::PermissionDenied("CUDA Graph is not allowed to capture "
+                                         "when running the first batch."));
+  PADDLE_ENFORCE_EQ(platform::is_gpu_place(place_),
+                    true,
+                    platform::errors::InvalidArgument(
+                        "CUDA Graph is only supported on NVIDIA GPU device."));
+  // If set true, will call `cudaStreamSynchronize(nccl_stream)` after
+  // allreduce, which may cause errors in cuda graph. This behavior is
+  // consistent with PE.
+  PADDLE_ENFORCE_EQ(FLAGS_sync_nccl_allreduce,
+                    false,
+                    platform::errors::InvalidArgument(
+                        "FLAGS_sync_nccl_allreduce must be False to support "
+                        "CUDA Graph capturing."));
+
+  // All output vars of coalesce_tensor op should not be gc'ed.
+  // If a fused output var of coalesce_tensor is gc'ed, it will cause an
+  // accuracy problem. The specific reasons need to be analyzed.
+  for (auto& op_desc : block_.AllOps()) {
+    if (op_desc->Type() == kCoalesceTensor) {
+      for (auto& out_var_name : op_desc->OutputArgumentNames()) {
+        execution_config_.skip_gc_vars.insert(out_var_name);
+        VLOG(4) << "Insert Var(" << out_var_name << ") into skip_gc_vars.";
+      }
+    }
+  }
+#else
+  PADDLE_THROW(platform::errors::Unimplemented(
+      "CUDA Graph is only supported on NVIDIA GPU device."));
+#endif
+}
+
+void InterpreterCore::CheckCUDAGraphBeforeRun(
+    const std::vector<std::string>& feed_names) {
+#ifdef PADDLE_WITH_CUDA
+  if (platform::IsCUDAGraphCapturing()) {
+    PADDLE_ENFORCE_EQ(
+        feed_names.empty(),
+        true,
+        platform::errors::InvalidArgument(
+            "Feeding data is not permitted when capturing CUDA Graph."));
+    PADDLE_ENFORCE_EQ(
+        FLAGS_new_executor_use_cuda_graph,
+        true,
+        platform::errors::InvalidArgument(
+            "You must turn on FLAGS_new_executor_use_cuda_graph to True "
+            "to enable CUDA Graph capturing."));
+    PADDLE_ENFORCE_EQ(
+        place_,
+        platform::CUDAGraphCapturingPlace(),
+        platform::errors::InvalidArgument("The place to capture CUDAGraph is "
+                                          "not the same as the place to run."));
+  }
+#endif
+}
+
 void InterpreterCore::BuildOperatorDependences() {
   // analysis the dependences between ops, add next_instr_list to each instr,
   // and set the dependecy_count_
diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h
index 74ff5c563652ea486d552c4d1ecf2cbb363fa04d..53625c87938305c6a22909d70352d0cb1095b1d0 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.h
+++ b/paddle/fluid/framework/new_executor/interpretercore.h
@@ -97,6 +97,10 @@ class InterpreterCore {
       const std::vector<std::vector<size_t>>& input_var2op, size_t var_index);
   void SetFeedVarsInplaceSkip(const std::vector<std::string>& feed_names);
 
+  // cuda graph
+  void CheckCUDAGraphBeforeRun(const std::vector<std::string>& feed_names);
+  void PrepareForCUDAGraphCapture();
+
   // execution
   void RunImpl();
   void ExecuteInstructionList(const std::vector<Instruction>& vec_instr);
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 5e8d0b1b87ae27a0dd40ac3e29a31cf149e14576..fe863381b570bba755d04e5c74e8bdd1b4630eef 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1561,6 +1561,63 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
   this->Info().infer_shape_(&infer_shape_ctx);
 }
 
+template <typename T>
+bool HasSameTensorType(phi::TensorBase* phi_tensor, Variable* var) {
+  if (phi_tensor == nullptr && var == nullptr) {
+    return true;
+  } else if (phi_tensor != nullptr && var != nullptr) {
+    if (T::classof(phi_tensor) && var->IsType<T>()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// TODO(YuanRisheng): We need to collect all `need_prepare_phi_data_`
+// logic into this function.
+void OperatorWithKernel::CheckWhetherPreparePhiData(
+    const VariableNameMap& innames,
+    const VariableNameMap& outnames,
+    const Scope& scope) const {
+  if (run_phi_kernel_ && impl_ != nullptr) {
+    const auto& phi_kernel_context = impl_->getKernelContext();
+    size_t phi_tensor_index = 0;
+    // Check each tensor in the KernelContext. If there is a tensor that has a
+    // different type from its variable, the PhiKernelContext needs to be
+    // reconstructed. We use kernel_signature_'s output to retrieve tensors,
+    // because the tensors in phi_kernel_context are stored in the order of
+    // kernel_signature_'s outputs.
+    if (phi_kernel_context->OutputsSize() >= phi_tensor_index ||
+        kernel_signature_ == nullptr) {
+      need_prepare_phi_data_ = true;
+      return;
+    }
+
+    const auto& phi_output_names = kernel_signature_->output_names;
+    for (auto& phi_output_name : phi_output_names) {
+      const auto& iter = outnames.find(phi_output_name);
+      if (iter != outnames.end()) {
+        for (auto& var_name : iter->second) {
+          auto var_output = scope.FindVar(var_name);
+          auto phi_output =
+              phi_kernel_context->MutableOutputAt<phi::TensorBase>(
+                  phi_tensor_index);
+          if (phi_output == nullptr) {
+            continue;
+          }
+          if (!(HasSameTensorType<phi::DenseTensor>(phi_output, var_output) ||
+                HasSameTensorType<phi::SparseCooTensor>(phi_output,
+                                                        var_output) ||
+                HasSameTensorType<framework::Strings>(phi_output,
+                                                      var_output))) {
+            need_prepare_phi_data_ = true;
+          }
+          phi_tensor_index++;
+        }
+      }
+    }
+  }
+}
+
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                  const platform::Place& place) const {
   // To reduce the elapsed time of HasAttr, we use bool variable to record the
@@ -1571,6 +1628,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
       HasAttr(kAllKernelsMustComputeRuntimeShape))
     all_kernels_must_compute_runtime_shape_ = true;
   const Scope* cur_scope = &scope;
+  CheckWhetherPreparePhiData(Inputs(), Outputs(), scope);
   if (!enable_cache_runtime_context_) {
     RuntimeContext ctx(Inputs(), Outputs(), scope);
     RunImpl(scope, place, &ctx);
@@ -2993,7 +3051,6 @@ void OperatorWithKernel::BuildPhiKernelContext(
           "to the size of kernel attribute_defs (%d).",
           attr_names.size(),
          attr_defs.size()));
-
   for (size_t i = 0; i < input_names.size(); ++i) {
     auto it = ctx.inputs.find(input_names[i]);
 
@@ -3037,6 +3094,9 @@ void OperatorWithKernel::BuildPhiKernelContext(
       } else if (var->IsType<framework::LoDTensorArray>()) {
        tensor_in = &(var->Get<framework::LoDTensorArray>());
         phi_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in);
+      } else if (var->IsType<framework::FeedList>()) {
+        tensor_in = &(var->Get<framework::FeedList>());
+        phi_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in);
       } else {
         PADDLE_THROW(platform::errors::Unimplemented(
            "Unsupported input `%s` type when call pt kernel.",
@@ -3047,7 +3107,6 @@ void OperatorWithKernel::BuildPhiKernelContext(
     phi_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), i);
   }
   VLOG(4) << "Done inputs";
-
   for (size_t i = 0; i < output_names.size(); ++i) {
     auto it = ctx.outputs.find(output_names[i]);
     size_t start_idx =
@@ -3087,6 +3146,9 @@ void OperatorWithKernel::BuildPhiKernelContext(
         // Note: If the input LoDTensorArray size is 0, the output
         // LoDTensorArray is also 0
         phi_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
+      } else if (var->template IsType<phi::SparseCooTensor>()) {
+        tensor_out = var->template GetMutable<phi::SparseCooTensor>();
+        phi_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
       } else if (var->template IsType<framework::Strings>()) {
         tensor_out = var->template GetMutable<framework::Strings>();
         phi_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
@@ -3108,7 +3170,6 @@ void OperatorWithKernel::BuildPhiKernelContext(
           i);
   }
   VLOG(4) << "Done outputs";
-
   for (size_t i = 0; i < attr_names.size(); ++i) {
     VLOG(6) << "BuildPhiKernelContext: " << attr_names[i] << ": "
             << attr_defs[i].type_index;
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index b4e0c94c20be2d189a03fe73d59bd46447acb1c8..955f30f3406190a5bd3bd2a77161f3b0a5c3336c 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -550,6 +550,13 @@ class ExecutionArgumentMappingContext : public phi::ArgumentMappingContext {
     return var->IsType<phi::SparseCooTensor>();
   }
 
+  bool IsSparseCooTensorOutput(const std::string& name) const override {
+    auto vars = ctx_.MultiOutputVar(name);
+    return std::all_of(vars.begin(), vars.end(), [](const Variable* var) {
+      return var->IsType<phi::SparseCooTensor>();
+    });
+  }
+
   bool IsSparseCsrTensorInput(const std::string& name) const override {
     const auto* var = ctx_.InputVar(name);
     return var->IsType<phi::SparseCsrTensor>();
@@ -746,6 +753,10 @@ class OperatorWithKernel : public OperatorBase {
                              RuntimeContext* ctx,
                              const phi::Place& place) const;
 
+  void CheckWhetherPreparePhiData(const VariableNameMap& innames,
+                                  const VariableNameMap& outnames,
+                                  const Scope& scope) const;
+
   void TransferInplaceVarsBack(const Scope& scope,
                                const std::vector<std::string>& inplace_vars,
                                const Scope& exec_scope) const;
diff --git a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt
index 1d8a9819a815d90548ec73db05d51101ab196a07..6bb21c569b30289e9c9eeed0a69b27b222965037 100644
--- a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt
+++ b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt
@@ -60,6 +60,7 @@ if(WITH_TESTING)
       elementwise_add_op
       generated_op)
     set_tests_properties(build_cinn_pass_test PROPERTIES LABELS "RUN_TYPE=CINN")
+    target_link_libraries(build_cinn_pass_test ${PYTHON_LIBRARIES})
 
     cc_test_old(transform_desc_test SRCS transform_desc_test.cc DEPS
                 transform_desc)
diff --git a/paddle/fluid/framework/raw_tensor.h b/paddle/fluid/framework/raw_tensor.h
index dfee5acd14af0ca7829b501cb7230db8011f2df1..60ccd6a5bae3e468e3272044b35b5ed88174c187 100644
--- a/paddle/fluid/framework/raw_tensor.h
+++ b/paddle/fluid/framework/raw_tensor.h
@@ -23,8 +23,8 @@ namespace paddle {
 namespace framework {
 
 /// \brief Fluid Kernel and PHI Kernel will be unified in the future.
-/// So, we need a class in PHI that can represent the RAW type in Fluid.
-/// The RawTensor is for PHI Kernel that has RAW type arguments.
+/// So, we need a class in PHI that can represent the RawTensor type in Fluid.
+/// The RawTensor is for PHI Kernel that has RawTensor type arguments.
 class RawTensor : public phi::ExtendedTensor,
                   public phi::TypeInfoTraits<phi::TensorBase, RawTensor> {
  public:
@@ -37,13 +37,35 @@ class RawTensor : public phi::ExtendedTensor,
   RawTensor& operator=(RawTensor&& other) = default;
 
   /// \brief Destroy the RawTensor and release exclusive resources.
-  virtual ~RawTensor() = default;
+  virtual ~RawTensor() {
+    if (!data_.empty()) {
+      data_deleter_();
+    }
+  }
 
  public:
   /// \brief Returns the name of the class for type traits.
   /// \return The name of the class.
   static const char* name() { return "RawTensor"; }
 
+  template <typename T>
+  T& Get() const {
+    PADDLE_ENFORCE_EQ(data_.empty(),
+                      false,
+                      platform::errors::PreconditionNotMet(
+                          "The data in RawTensor is empty. Please set data "
+                          "before using it."));
+
+    try {
+      return *(paddle::any_cast<T*>(data_));
+    } catch (paddle::bad_any_cast&) {
+      PADDLE_THROW(phi::errors::InvalidArgument(
+          "Invalid data type error, expected %s, actual %s.",
+          typeid(T).name(),
+          data_type_.name()));
+    }
+  }
+
   template <typename T>
   T* GetMutable() {
     if (!data_.empty()) {
@@ -70,7 +92,7 @@ class RawTensor : public phi::ExtendedTensor,
 
  private:
   paddle::any data_;
-  std::function<void(void)> data_deleter_;
+  std::function<void(void)> data_deleter_ = []() {};
   std::type_index data_type_ = std::type_index(typeid(void));
 };
 
diff --git a/paddle/fluid/framework/string_array.h b/paddle/fluid/framework/string_array.h
index 4ac8d89981bee4d143e4452e255f22b7cc207716..9fd245ff91765893971558795dcd67d6e63f1533 100644
--- a/paddle/fluid/framework/string_array.h
+++ b/paddle/fluid/framework/string_array.h
@@ -25,6 +25,10 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+// Note(YuanRisheng): Vocab is mainly used for faster_tokenizer_op and we don't
+// recommend using it widely, because faster_tokenizer_op may be deleted in the
+// future and this class will be deleted with it.
+
 class Vocab : public phi::ExtendedTensor,
               public phi::TypeInfoTraits<phi::TensorBase, Vocab> {
  public:
@@ -94,8 +98,73 @@ class Vocab : public phi::ExtendedTensor,
   std::unordered_map<std::wstring, std::int32_t> data_;
 };
 
+// Note(YuanRisheng): PhiVector is essentially a vector that is only used for
+// PHI Kernel. It can be used when you define a non-tensor type that needs to
+// be stored in a vector as a PHI kernel argument.
+
+template <typename T>
+class PhiVector : public phi::ExtendedTensor,
+                  public phi::TypeInfoTraits<phi::TensorBase, PhiVector<T>> {
+ public:
+  PhiVector() = default;
+
+  explicit PhiVector(const std::vector<T>& init_data) : data_(init_data) {}
+
+  PhiVector(PhiVector&& other) = default;
+
+  PhiVector(const PhiVector& other) = default;
+
+  PhiVector& operator=(const PhiVector& other) = default;
+
+  PhiVector& operator=(const std::vector<T>& other) {
+    data_ = other;
+    return *this;
+  }
+
+  PhiVector& operator=(PhiVector&& other) = default;
+
+  /// \brief Destroy the PhiVector and release exclusive resources.
+  virtual ~PhiVector() = default;
+
+ public:
+  /// \brief Returns the name of the class for type traits.
+  /// \return The name of the class.
+ static const char* name() { + // Cache the composed name so the returned pointer stays valid after return. + static const std::string kName = + std::string("PhiVector_") + std::string(typeid(T).name()); + return kName.c_str(); + } + + size_t size() const { return data_.size(); } + + void resize(size_t size) { data_.resize(size); } + + void clear() { data_.clear(); } + + void emplace_back(const T& feed_data) { data_.emplace_back(feed_data); } + + const T& operator[](size_t index) const { return data_[index]; } + + T& operator[](size_t index) { return data_[index]; } + + T& at(size_t index) { return data_.at(index); } + + const T& at(size_t index) const { return data_.at(index); } + + typename std::vector<T>::iterator begin() { return data_.begin(); } + + typename std::vector<T>::const_iterator begin() const { + return data_.begin(); + } + + typename std::vector<T>::iterator end() { return data_.end(); } + + typename std::vector<T>::const_iterator end() const { return data_.end(); } + + private: + std::vector<T> data_; +}; + using String = std::string; -using Strings = std::vector<std::string>; +using Strings = PhiVector<std::string>; // Convert the std::string type to the std::wstring type. bool ConvertStrToWstr(const std::string& src, std::wstring* res); diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index 1e6c110e86a30fd84f6d277cfb021213b9dfedcb..fab9d28abbac4eac750901ea7b2a504f9f33e8a3 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -221,6 +221,7 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< Vocab, std::vector, std::vector, + std::vector, RawTensor>; template <typename T> struct VarTypeTrait { diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 3a54c7b4ed2e0de110dcf2a359438fa258df48f4..f4c35b49a0d00760f78f3d22a1dab1ee0ffae010 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1655,7 +1655,8 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor( auto custom_place = place_; auto paddleplace = static_cast<PaddlePlace>( static_cast<size_t>(PaddlePlace::kCUSTOM) + - phi::GetOrRegisterGlobalDeviceTypeId(place_.GetDeviceType())); + phi::CustomRegisteredDeviceMap::Instance() + .GetOrRegisterGlobalDeviceTypeId(place_.GetDeviceType())); res->SetPlace(paddleplace, custom_place.GetDeviceId()); } else { auto gpu_place = place_; @@ -1710,7 +1711,8 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor( auto custom_place = place_; auto paddleplace = static_cast<PaddlePlace>( static_cast<size_t>(PaddlePlace::kCUSTOM) + - phi::GetOrRegisterGlobalDeviceTypeId(place_.GetDeviceType())); + phi::CustomRegisteredDeviceMap::Instance() + .GetOrRegisterGlobalDeviceTypeId(place_.GetDeviceType())); res->SetPlace(paddleplace, custom_place.GetDeviceId()); } else { auto gpu_place = place_; diff --git a/paddle/fluid/inference/api/details/CMakeLists.txt b/paddle/fluid/inference/api/details/CMakeLists.txt index 02d5f91d630ce8a2108e76ecfbceb7695bb18bd2..5d2357d362e990bcf1f8ae54e71148884cc4b19f 100644 --- a/paddle/fluid/inference/api/details/CMakeLists.txt +++ b/paddle/fluid/inference/api/details/CMakeLists.txt @@ -25,13 +25,16 @@ if(WITH_ONNXRUNTIME) cc_library( zero_copy_tensor_dummy SRCS zero_copy_tensor_dummy.cc - DEPS onnxruntime) + DEPS onnxruntime phi_enforce) else() cc_library( zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce) - cc_library(zero_copy_tensor_dummy SRCS zero_copy_tensor_dummy.cc) + cc_library( + zero_copy_tensor_dummy + SRCS zero_copy_tensor_dummy.cc + DEPS phi_enforce) endif() cc_test( diff --git 
a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index e7cda8707c872471c7e54816652c24d765077302..59b44769ddd38b073947a4096b498a20f17e25d1 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -16,6 +16,7 @@ #include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_tensor.h" #include "paddle/fluid/memory/memcpy.h" @@ -76,7 +77,8 @@ void Tensor::ReshapeStrings(const size_t &shape) { var, paddle::platform::errors::PreconditionNotMet( "No tensor called [%s] in the runtime scope", name_)); - paddle_infer::Strings *tensor = var->GetMutable(); + paddle::framework::Strings *tensor = + var->GetMutable(); tensor->resize(shape); } @@ -261,7 +263,9 @@ void Tensor::CopyFromCpu(const T *data) { paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool::Instance(); paddle::platform::CustomPlace custom_place( - phi::GetGlobalDeviceType(device_type_id), device_); + phi::CustomRegisteredDeviceMap::Instance().GetGlobalDeviceType( + device_type_id), + device_); auto *t_data = tensor->mutable_data(custom_place); auto *dev_ctx = static_cast( pool.Get(custom_place)); @@ -354,7 +358,7 @@ void Tensor::ShareExternalData(const T *data, } void Tensor::CopyStringsFromCpu(const paddle_infer::Strings *data) { - EAGER_GET_TENSOR(paddle_infer::Strings); + EAGER_GET_TENSOR(paddle::framework::Strings); PADDLE_ENFORCE_GE(tensor->size(), 0, paddle::platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.cc b/paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.cc index f7667c6df9eda16370bc707b73bc301596dd98d0..805813cbe153c93e760494f763eadb7847c56bb6 100644 --- a/paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.cc +++ b/paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.cc @@ -112,6 +112,12 @@ bool PluginArgumentMappingContext::IsSparseCooTensorInput( const std::string& name) const { return false; } + +bool PluginArgumentMappingContext::IsSparseCooTensorOutput( + const std::string& name) const { + return false; +} + bool PluginArgumentMappingContext::IsSparseCsrTensorInput( const std::string& name) const { return false; diff --git a/paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.h b/paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.h index 088e966a0cca742a9ce583aeb84d2c3ab21244a6..c0c30f3ac57b098da9ad9cabfc5a90ed55974d5e 100644 --- a/paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.h +++ b/paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.h @@ -56,6 +56,8 @@ class PluginArgumentMappingContext : public ::phi::ArgumentMappingContext { bool IsDenseTensorOutput(const std::string& name) const override; + bool IsSparseCooTensorOutput(const std::string& name) const override; + bool IsSelectedRowsOutput(const std::string& name) const override; bool IsForInferShape() const override { return false; } diff --git a/paddle/fluid/inference/tensorrt/test_arg_mapping_context.cc b/paddle/fluid/inference/tensorrt/test_arg_mapping_context.cc index 75716a91f574f7c11cd331ab05fd62865894a7c8..d514fa0bb1af82abbf178586ccf77f47d44c6285 100644 --- a/paddle/fluid/inference/tensorrt/test_arg_mapping_context.cc +++ 
b/paddle/fluid/inference/tensorrt/test_arg_mapping_context.cc @@ -124,6 +124,7 @@ TEST(ArgMappingContexTest, BasicFunction) { EXPECT_EQ(context.IsDenseTensorOutput("Out"), false); EXPECT_EQ(context.IsSelectedRowsOutput("Out"), false); + EXPECT_EQ(context.IsSparseCooTensorOutput("Out"), false); EXPECT_EQ(context.IsForInferShape(), false); } diff --git a/paddle/fluid/memory/allocation/mmap_allocator.cc b/paddle/fluid/memory/allocation/mmap_allocator.cc index 557a9cf333a3b95caf60065c5a337e989493f61e..55529b58aeb5f1cb4efe6aedd9c582b5f174e4c3 100644 --- a/paddle/fluid/memory/allocation/mmap_allocator.cc +++ b/paddle/fluid/memory/allocation/mmap_allocator.cc @@ -26,6 +26,8 @@ #include "glog/logging.h" #include "paddle/fluid/platform/enforce.h" +DECLARE_bool(use_shm_cache); + namespace paddle { namespace memory { namespace allocation { @@ -111,20 +113,33 @@ void AllocateMemoryMap( std::shared_ptr AllocateRefcountedMemoryMapAllocation(std::string filename, int flags, - size_t size) { + size_t size, + int buffer_id) { int fd = -1; void *base_ptr = nullptr; - AllocateMemoryMap(filename, flags, size + mmap_alignment, &base_ptr, &fd); + if (buffer_id == -1) { + AllocateMemoryMap(filename, flags, size + mmap_alignment, &base_ptr, &fd); + VLOG(4) << "Create and mmap a new shm: " << filename; + } else { + base_ptr = MemoryMapAllocationPool::Instance().GetById(buffer_id).mmap_ptr_; + VLOG(4) << "Get a cached shm " << filename; + } void *aliged_base_ptr = static_cast(static_cast(base_ptr) + mmap_alignment); return std::make_shared( - aliged_base_ptr, size, filename, flags, fd); + aliged_base_ptr, size, filename, flags, fd, buffer_id); } RefcountedMemoryMapAllocation::RefcountedMemoryMapAllocation( - void *ptr, size_t size, std::string ipc_name, int fd, int flags) + void *ptr, + size_t size, + std::string ipc_name, + int fd, + int flags, + int buffer_id) : MemoryMapAllocation(ptr, size, ipc_name, fd, flags) { // must reset base ptr first. 
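// Editorial note (hypothetical caller sketch, assuming FLAGS_use_shm_cache is
// enabled): callers probe the pool first and hand the result to the factory:
//   int buffer_id = MemoryMapAllocationPool::Instance().FindFromCache(flags, size);
//   auto alloc = AllocateRefcountedMemoryMapAllocation(filename, flags, size, buffer_id);
// A buffer_id of -1 mmaps a fresh shm segment; any other id reuses the pooled mapping.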
+ buffer_id_ = buffer_id; resetBaseptr(); initializeRefercount(); } @@ -165,25 +180,40 @@ void RefcountedMemoryMapAllocation::initializeRefercount() { } void RefcountedMemoryMapAllocation::close() { + VLOG(4) << "Close a RefcountedMemoryMapAllocation: " << ipc_name_; if (closed_) { return; } closed_ = true; void *data = map_ptr_; CountInfo *info = reinterpret_cast<CountInfo *>(data); - if (--info->refcount == 0) { - shm_unlink(ipc_name_.c_str()); - VLOG(6) << "shm_unlink file: " << ipc_name_; + --info->refcount; + if (FLAGS_use_shm_cache && buffer_id_ != -1) { + return; + } else { + if (FLAGS_use_shm_cache && + MemoryMapAllocationPool::Instance().BufferSize() < + static_cast<size_t>( + MemoryMapAllocationPool::Instance().MaxPoolSize())) { + MemoryMapAllocationPool::Instance().Insert(MemoryMapInfo( + flags_, map_size_ - mmap_alignment, ipc_name_, map_ptr_)); + } else { + if (info->refcount == 0 && + shm_open(ipc_name_.c_str(), O_RDWR, (mode_t)0600) != -1) { + shm_unlink(ipc_name_.c_str()); + VLOG(6) << "shm_unlink file: " << ipc_name_; + } + + PADDLE_ENFORCE_NE(munmap(map_ptr_, map_size_), + -1, + platform::errors::Unavailable( + "could not unmap the shared memory file: ", + strerror(errno), + " (", + errno, + ")")); + } } - - PADDLE_ENFORCE_NE( - munmap(map_ptr_, map_size_), - -1, - platform::errors::Unavailable("could not unmap the shared memory file: ", - strerror(errno), - " (", - errno, - ")")); } MemoryMapWriterAllocation::~MemoryMapWriterAllocation() { @@ -299,6 +329,67 @@ void MemoryMapFdSet::Clear() { MemoryMapFdSet::~MemoryMapFdSet() { Clear(); } +MemoryMapAllocationPool *MemoryMapAllocationPool::pool_ = nullptr; + +void MemoryMapAllocationPool::Insert(const MemoryMapInfo &memory_map) { + std::lock_guard<std::mutex> guard(mtx_); + memory_map_allocations_.push_back(memory_map); + VLOG(4) << this << " Insert a new shm: " << memory_map.file_name_; +} + +int MemoryMapAllocationPool::FindFromCache(const int &flag, + const size_t &data_size, + const std::string &file_name, + bool check_refcount) { + std::lock_guard<std::mutex> guard(mtx_); + for (size_t idx = 0; idx < memory_map_allocations_.size(); idx++) { + if (memory_map_allocations_.at(idx).flags_ == flag && + memory_map_allocations_.at(idx).data_size_ == data_size) { + if (file_name == "" || + memory_map_allocations_.at(idx).file_name_ == file_name) { + if (!check_refcount || reinterpret_cast<CountInfo *>( + memory_map_allocations_.at(idx).mmap_ptr_) + ->refcount == 0) { + VLOG(4) << "Match at: " << idx; + return idx; + } + } + } + } + return -1; +} + +const MemoryMapInfo &MemoryMapAllocationPool::GetById(int id) { + std::lock_guard<std::mutex> guard(mtx_); + return memory_map_allocations_.at(id); +} + +void MemoryMapAllocationPool::SetMaxPoolSize(const int &size) { + max_pool_size_ = size; + VLOG(4) << this << " Set max pool size to: " << max_pool_size_; +} + +void MemoryMapAllocationPool::Clear() { + std::lock_guard<std::mutex> guard(mtx_); + for (auto mmap : memory_map_allocations_) { + int rlt = shm_unlink(mmap.file_name_.c_str()); + if (rlt == 0) { + VLOG(4) << "MemoryMapAllocationPool: clear " << mmap.file_name_; + } + PADDLE_ENFORCE_NE(munmap(mmap.mmap_ptr_, mmap.data_size_ + mmap_alignment), + -1, + platform::errors::Unavailable( + "could not unmap the shared memory file: ", + strerror(errno), + " (", + errno, + ")")); + } + memory_map_allocations_.clear(); +} + +MemoryMapAllocationPool::~MemoryMapAllocationPool() { Clear(); } + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/mmap_allocator.h 
b/paddle/fluid/memory/allocation/mmap_allocator.h index 3fc5d2d1891f245c0096d9a49b76121a2b13ce16..412e3a3545769dc09542b4fd921fc1d44bae2de5 100644 --- a/paddle/fluid/memory/allocation/mmap_allocator.h +++ b/paddle/fluid/memory/allocation/mmap_allocator.h @@ -75,8 +75,12 @@ class MemoryMapAllocation : public Allocation { class RefcountedMemoryMapAllocation : public MemoryMapAllocation { public: - RefcountedMemoryMapAllocation( - void *ptr, size_t size, std::string ipc_name, int flags, int fd); + RefcountedMemoryMapAllocation(void *ptr, + size_t size, + std::string ipc_name, + int flags, + int fd, + int buffer_id = -1); void incref(); int decref(); @@ -84,6 +88,7 @@ class RefcountedMemoryMapAllocation : public MemoryMapAllocation { virtual ~RefcountedMemoryMapAllocation() { close(); } protected: + int buffer_id_ = -1; void initializeRefercount(); void resetBaseptr(); }; @@ -94,7 +99,8 @@ void AllocateMemoryMap( std::shared_ptr<RefcountedMemoryMapAllocation> AllocateRefcountedMemoryMapAllocation(std::string filename, int flags, - size_t size); + size_t size, + int buffer_id = -1); class MemoryMapWriterAllocation : public Allocation { public: @@ -153,6 +159,68 @@ class MemoryMapFdSet { std::mutex mtx_; }; +class MemoryMapInfo { + public: + explicit MemoryMapInfo(int flags, + size_t data_size, + std::string file_name, + void *mmap_ptr) + : flags_(flags), + data_size_(data_size), + file_name_(file_name), + mmap_ptr_(mmap_ptr) {} + + int flags_ = 0; + size_t data_size_ = 0; + std::string file_name_; + void *mmap_ptr_ = nullptr; +}; + +/* Note(zhangbo): MemoryMapAllocationPool is used to cache and reuse shm, thus reducing munmap calls in the dataloader. The munmap(shm_mmap_ptr) call in the RefcountedMemoryMapAllocation::close() function may block other threads of the process. Therefore, shm cache-and-reuse logic is introduced: shm segments created by the _share_filename process are cached and reused keyed on their data_size, which eliminates the problem of munmap blocking other threads. +*/ +class MemoryMapAllocationPool { + public: + static MemoryMapAllocationPool &Instance() { + if (pool_ == nullptr) { + pool_ = new MemoryMapAllocationPool(); + } + return *pool_; + } + + void Insert(const MemoryMapInfo &memory_map); + + int FindFromCache(const int &flag, + const size_t &data_size, + const std::string &file_name = "", + bool check_refcount = true); + + const MemoryMapInfo &GetById(int id); + + size_t BufferSize() { return memory_map_allocations_.size(); } + + void Clear(); + + void SetMaxPoolSize(const int &size); + + int MaxPoolSize() { return max_pool_size_; } + + ~MemoryMapAllocationPool(); + + private: + MemoryMapAllocationPool() = default; + static MemoryMapAllocationPool *pool_; + std::vector<MemoryMapInfo> memory_map_allocations_; + int max_pool_size_ = 0; + std::mutex mtx_; +}; + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/operators/cinn/CMakeLists.txt b/paddle/fluid/operators/cinn/CMakeLists.txt index 78292738788f98c12558bdb3397b1367302adfad..3a97e275689ed032b3fdebd56d6b4be62c7b30af 100644 --- a/paddle/fluid/operators/cinn/CMakeLists.txt +++ b/paddle/fluid/operators/cinn/CMakeLists.txt @@ -44,6 +44,7 @@ if(WITH_TESTING) cinn_launch_context cinn_instruction_run_op cinn) + target_link_libraries(cinn_launch_context_test ${PYTHON_LIBRARIES}) set_tests_properties(cinn_launch_context_test PROPERTIES LABELS "RUN_TYPE=CINN") @@ -73,6 +74,7 @@ if(WITH_TESTING) cinn_launch_op cinn_instruction_run_op elementwise_add_op) + target_link_libraries(cinn_instruction_run_op_test 
${PYTHON_LIBRARIES}) set_tests_properties( cinn_instruction_run_op_test PROPERTIES LABELS "RUN_TYPE=CINN" ENVIRONMENT "${CINN_RUN_ENVIRONMENT}") diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc index e076ead891419c00292958fe170a16053b327164..194dccb0e6ea03ebbf16af5ff02d93bc883589cd 100644 --- a/paddle/fluid/operators/controlflow/feed_op.cc +++ b/paddle/fluid/operators/controlflow/feed_op.cc @@ -11,6 +11,8 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/raw_tensor.h" +#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" namespace paddle { @@ -28,117 +30,128 @@ class OpBase; namespace paddle { namespace operators { -// FeedVariableVisitor is to feed the variable data -// according to data type (phi::DenseTensor or Strings). -class FeedVariableVisitor { - public: - explicit FeedVariableVisitor(framework::Variable *out_var, - const platform::Place &place) - : out_var_(out_var), place_(place) {} - - void operator()(const phi::DenseTensor &in_tensor) const { - phi::DenseTensor *out_tensor = out_var_->GetMutable(); - if (platform::is_same_place(in_tensor.place(), place_)) { - out_tensor->ShareDataWith(in_tensor); -#ifdef PADDLE_WITH_IPU - } else if (platform::is_ipu_place(place_)) { - // For ipu, both in_tensor and out_tensor are allocated on cpu, - // PopART will copy tensor from host automatically, - // no TensorCopy() is required here. - out_tensor->ShareDataWith(in_tensor); -#endif - } else { - platform::DeviceContext *context = - platform::DeviceContextPool::Instance().Get(place_); - framework::TensorCopy(in_tensor, place_, *context, out_tensor); - } - out_tensor->set_lod(in_tensor.lod()); +const framework::FeedType& CheckAndGetFeedItem(const phi::ExtendedTensor& x, + int col) { + PADDLE_ENFORCE_GE(col, + 0, + platform::errors::InvalidArgument( + "Expected the column index (the attribute 'col' of " + "operator 'Feed') of current feeding variable to be " + "no less than 0. But received column index = %d.", + col)); + auto feed_list = static_cast(&x); + PADDLE_ENFORCE_LT( + static_cast(col), + feed_list->size(), + platform::errors::InvalidArgument( + "The column index of current feeding variable is expected to be " + "less than the length of feeding list. 
But received column index = " + "%d, the length of feeding list = %d", + col, + feed_list->size())); + + return feed_list->at(static_cast(col)); +} + +template +void FeedDenseTensorKernel(const Context& dev_ctx, + const phi::ExtendedTensor& x, + int col, + phi::DenseTensor* out) { + PADDLE_ENFORCE_NOT_NULL( + out, + platform::errors::NotFound( + "Output cannot be found in scope for operator 'Feed'")); + const auto& feed_item = CheckAndGetFeedItem(x, col); + const auto& in_tensor = paddle::get(feed_item); + const auto& place = dev_ctx.GetPlace(); + if (platform::is_same_place(in_tensor.place(), place)) { + out->ShareDataWith(in_tensor); + } else { + framework::TensorCopy(in_tensor, place, dev_ctx, out); } - void operator()(const framework::Strings &in_str) const { - framework::Strings *out_str = out_var_->GetMutable(); - out_str->resize(in_str.size()); - *out_str = in_str; + out->set_lod(in_tensor.lod()); +} + +template +void FeedSparseCooTensorKernel(const Context& dev_ctx, + const phi::ExtendedTensor& x, + int col, + phi::SparseCooTensor* out) { + PADDLE_ENFORCE_NOT_NULL( + out, + platform::errors::NotFound( + "Output cannot be found in scope for operator 'Feed'")); + const auto& feed_item = CheckAndGetFeedItem(x, col); + const auto& in_tensor = paddle::get(feed_item); + const auto& place = dev_ctx.GetPlace(); + if (platform::is_same_place(in_tensor.place(), place)) { + *out = in_tensor; + } else { + phi::DenseTensor indices, values; + framework::TensorCopy(in_tensor.indices(), place, dev_ctx, &indices); + framework::TensorCopy(in_tensor.values(), place, dev_ctx, &values); + out->SetMember(indices, values, in_tensor.meta()); } +} + +template +void FeedStringsKernel(const Context& dev_ctx, + const phi::ExtendedTensor& x, + int col, + phi::ExtendedTensor* out) { + PADDLE_ENFORCE_NOT_NULL( + out, + platform::errors::NotFound( + "Output cannot be found in scope for operator 'Feed'")); + const auto& feed_item = CheckAndGetFeedItem(x, col); + auto strs_out = static_cast(out); + const auto& in_str = paddle::get(feed_item); + strs_out->resize(in_str.size()); + *strs_out = in_str; +} + +class FeedOp : public framework::OperatorWithKernel { + using framework::OperatorWithKernel::OperatorWithKernel; - void operator()(const phi::SparseCooTensor &in_tensor) const { - phi::SparseCooTensor *out_tensor = - out_var_->GetMutable(); - if (platform::is_same_place(in_tensor.place(), place_)) { - *out_tensor = in_tensor; - } else { - platform::DeviceContext *context = - platform::DeviceContextPool::Instance().Get(place_); - - phi::DenseTensor indices, values; - framework::TensorCopy(in_tensor.indices(), place_, *context, &indices); - framework::TensorCopy(in_tensor.values(), place_, *context, &values); - out_tensor->SetMember(indices, values, in_tensor.meta()); + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "feed"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "feed"); + if (ctx->IsRuntime()) { + framework::Variable* x_var = + PADDLE_GET(framework::Variable*, ctx->GetInputVarPtrs("X")[0]); + auto& x = x_var->Get(); + int col = ctx->Attrs().Get("col"); + auto& feed_item = x[col]; + if (feed_item.index() == 0) { + const auto& feed_item = CheckAndGetFeedItem(x, col); + auto& feed_tensor = PADDLE_GET_CONST(phi::DenseTensor, feed_item); + ctx->SetOutputDim("Out", feed_tensor.dims()); + } else if (feed_item.index() == 1) { + auto& feed_str = PADDLE_GET_CONST(framework::Strings, feed_item); + framework::Variable* out_var = + 
PADDLE_GET(framework::Variable*, ctx->GetOutputVarPtrs("Out")[0]); + out_var->GetMutable()->resize(feed_str.size()); + } else { + auto& feed_sparse_tensor = + PADDLE_GET_CONST(phi::SparseCooTensor, feed_item); + framework::Variable* out_var = + PADDLE_GET(framework::Variable*, ctx->GetOutputVarPtrs("Out")[0]); + out_var->GetMutable()->set_meta( + feed_sparse_tensor.meta()); + out_var->GetMutable()->SetCoalesced( + feed_sparse_tensor.coalesced()); + out_var->GetMutable()->SetIndicesDict( + feed_sparse_tensor.GetIndicesDict()); + } } } - private: - framework::Variable *out_var_; - const platform::Place &place_; -}; - -class FeedOp : public framework::OperatorBase { - public: - FeedOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - OP_INOUT_CHECK(HasInputs("X"), "Input", "X", "Feed"); - OP_INOUT_CHECK(HasOutputs("Out"), "Output", "Out", "Feed"); - - auto feed_var_name = Input("X"); - auto *feed_var = scope.FindVar(feed_var_name); - PADDLE_ENFORCE_NOT_NULL( - feed_var, - platform::errors::NotFound( - "Input varibale(%s) cannot be found in scope for operator 'Feed'.", - feed_var_name)); - - auto out_name = this->Output("Out"); - auto *out_var = scope.FindVar(out_name); - PADDLE_ENFORCE_NOT_NULL( - out_var, - platform::errors::NotFound( - "Output variable(%s) cannot be found in scope for operator 'Feed'", - out_name)); - - auto col = Attr("col"); - PADDLE_ENFORCE_GE(col, - 0, - platform::errors::InvalidArgument( - "Expected the column index (the attribute 'col' of " - "operator 'Feed') of current feeding variable to be " - "no less than 0. But received column index = %d.", - col)); - - VLOG(3) << "Feed variable " << feed_var_name << "'s " << col - << " column to variable " << out_name; - - auto &feed_list = feed_var->Get(); - PADDLE_ENFORCE_LT( - static_cast(col), - feed_list.size(), - platform::errors::InvalidArgument( - "The column index of current feeding variable is expected to be " - "less than the length of feeding list. But received column index = " - "%d, the length of feeding list = %d", - col, - feed_list.size())); - - auto &feed_item = feed_list.at(static_cast(col)); - - FeedVariableVisitor visitor(out_var, place); - paddle::visit(visitor, feed_item); + protected: + phi::KernelKey GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return phi::KernelKey(framework::proto::VarType::FP32, ctx.GetPlace()); } }; @@ -164,9 +177,152 @@ It should not be configured by users directly. } // namespace operators } // namespace paddle +// TODO(YuanRisheng): Maybe we need design a new registry macro for +// registering device independent kernels. 
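// Editorial note: FeedOp::InferShape above dispatches on the paddle::variant
// index of the feed item, and the kernel registrations below mirror the same
// three cases. Roughly (sketch, not literal patch code):
//   switch (feed_item.index()) {
//     case 0:  /* phi::DenseTensor      -> feed_dense_tensor      */ break;
//     case 1:  /* framework::Strings    -> feed_strings           */ break;
//     default: /* phi::SparseCooTensor  -> feed_sparse_coo_tensor */ break;
//   }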
+ REGISTER_OPERATOR( feed, paddle::operators::FeedOp, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker, paddle::operators::FeedOpInfoMaker); + +PD_REGISTER_GENERAL_KERNEL( + feed_dense_tensor, + CPU, + ALL_LAYOUT, + paddle::operators::FeedDenseTensorKernel, + ALL_DTYPE) {} + +PD_REGISTER_GENERAL_KERNEL( + feed_sparse_coo_tensor, + CPU, + ALL_LAYOUT, + paddle::operators::FeedSparseCooTensorKernel, + ALL_DTYPE) {} + +PD_REGISTER_GENERAL_KERNEL( + feed_strings, + CPU, + ALL_LAYOUT, + paddle::operators::FeedStringsKernel, + ALL_DTYPE) {} + +#if defined(PADDLE_WITH_MKLDNN) +PD_REGISTER_GENERAL_KERNEL( + feed_dense_tensor, + OneDNN, + ALL_LAYOUT, + paddle::operators::FeedDenseTensorKernel, + ALL_DTYPE) {} +PD_REGISTER_GENERAL_KERNEL( + feed_sparse_coo_tensor, + OneDNN, + ALL_LAYOUT, + paddle::operators::FeedSparseCooTensorKernel, + ALL_DTYPE) {} +PD_REGISTER_GENERAL_KERNEL( + feed_strings, + OneDNN, + ALL_LAYOUT, + paddle::operators::FeedStringsKernel, + ALL_DTYPE) {} +#endif + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_GENERAL_KERNEL( + feed_dense_tensor, + GPU, + ALL_LAYOUT, + paddle::operators::FeedDenseTensorKernel, + ALL_DTYPE) {} +PD_REGISTER_GENERAL_KERNEL( + feed_sparse_coo_tensor, + GPU, + ALL_LAYOUT, + paddle::operators::FeedSparseCooTensorKernel, + ALL_DTYPE) {} +PD_REGISTER_GENERAL_KERNEL( + feed_strings, + GPU, + ALL_LAYOUT, + paddle::operators::FeedStringsKernel, + ALL_DTYPE) {} +#elif defined(PADDLE_WITH_XPU) +PD_REGISTER_GENERAL_KERNEL( + feed_dense_tensor, + XPU, + ALL_LAYOUT, + paddle::operators::FeedDenseTensorKernel, + ALL_DTYPE) {} +PD_REGISTER_GENERAL_KERNEL( + feed_sparse_coo_tensor, + XPU, + ALL_LAYOUT, + paddle::operators::FeedSparseCooTensorKernel, + ALL_DTYPE) {} +PD_REGISTER_GENERAL_KERNEL( + feed_strings, + XPU, + ALL_LAYOUT, + paddle::operators::FeedStringsKernel, + ALL_DTYPE) {} +#elif defined(PADDLE_WITH_ASCEND_CL) +PD_REGISTER_GENERAL_KERNEL( + feed_dense_tensor, + npu, + ALL_LAYOUT, + paddle::operators::FeedDenseTensorKernel, + ALL_DTYPE) {} +PD_REGISTER_GENERAL_KERNEL( + feed_sparse_coo_tensor, + npu, + ALL_LAYOUT, + paddle::operators::FeedSparseCooTensorKernel, + ALL_DTYPE) {} +PD_REGISTER_GENERAL_KERNEL( + feed_strings, + npu, + ALL_LAYOUT, + paddle::operators::FeedStringsKernel, + ALL_DTYPE) {} +#elif defined(PADDLE_WITH_MLU) +PD_REGISTER_GENERAL_KERNEL( + feed_dense_tensor, + CustomMLU, + ALL_LAYOUT, + paddle::operators::FeedDenseTensorKernel, + ALL_DTYPE) {} +PD_REGISTER_GENERAL_KERNEL( + feed_sparse_coo_tensor, + CustomMLU, + ALL_LAYOUT, + paddle::operators::FeedSparseCooTensorKernel, + ALL_DTYPE) {} +PD_REGISTER_GENERAL_KERNEL( + feed_strings, + CustomMLU, + ALL_LAYOUT, + paddle::operators::FeedStringsKernel, + ALL_DTYPE) {} + +#elif defined(PADDLE_WITH_CUSTOM_DEVICE) +PD_REGISTER_GENERAL_KERNEL( + feed_dense_tensor, + custom_cpu, + ALL_LAYOUT, + paddle::operators::FeedDenseTensorKernel, + ALL_DTYPE) {} +PD_REGISTER_GENERAL_KERNEL( + feed_sparse_coo_tensor, + custom_cpu, + ALL_LAYOUT, + paddle::operators::FeedSparseCooTensorKernel, + ALL_DTYPE) {} +PD_REGISTER_GENERAL_KERNEL( + feed_strings, + custom_cpu, + ALL_LAYOUT, + paddle::operators::FeedStringsKernel, + ALL_DTYPE) {} +#endif diff --git a/paddle/fluid/operators/cum_op.cc b/paddle/fluid/operators/cum_op.cc index 6d1089ecf72a44850cfdd9e7cfd666fc15b5e7de..4c23020413ee5fbabbec88ce81439ce821df4008 100644 --- a/paddle/fluid/operators/cum_op.cc +++ b/paddle/fluid/operators/cum_op.cc @@ -33,6 +33,27 @@ class CumOp : public framework::OperatorWithKernel { 
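// Editorial note: the CumGradOp added in this hunk gives cumsum a dedicated
// grad op. Previously CumsumGradMaker re-ran cumsum on Out@GRAD with the
// "reverse" attribute flipped; the maker now emits (hypothetical sketch of the
// generated descriptor, not literal code):
//   cumsum_grad(X, Out@GRAD, axis, flatten, exclusive, reverse) -> X@GRAD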
} }; +class CumGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "cumsum"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), + "Input", + "Out@GRAD", + "cumsum"); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + + phi::KernelKey GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto input_data_type = + framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return phi::KernelKey(input_data_type, ctx.GetPlace()); + } +}; + class CumsumOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -69,12 +90,13 @@ class CumsumGradMaker : public framework::SingleGradOpMaker { protected: void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("cumsum"); - grad_op->SetInput("X", this->OutputGrad("Out")); - grad_op->SetOutput("Out", this->InputGrad("X")); + grad_op->SetType("cumsum_grad"); + grad_op->SetInput("X", this->Input("X")); + grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); grad_op->SetAttrMap(this->Attrs()); grad_op->SetAttr("reverse", - !PADDLE_GET_CONST(bool, this->GetAttr("reverse"))); + PADDLE_GET_CONST(bool, this->GetAttr("reverse"))); } }; @@ -153,6 +175,7 @@ using CPU = phi::CPUContext; DECLARE_INFER_SHAPE_FUNCTOR(cumsum, CumsumInferShapeFunctor, PD_INFER_META(phi::CumScalarAxisInferMeta)); + DECLARE_INFER_SHAPE_FUNCTOR(logcumsumexp, LogcumsumexpInferShapeFunctor, PD_INFER_META(phi::CumInferMeta)); @@ -169,6 +192,7 @@ REGISTER_OPERATOR(logcumsumexp, ops::LogcumsumexpGradMaker, LogcumsumexpInferShapeFunctor); REGISTER_OPERATOR(logcumsumexp_grad, ops::LogcumsumexpGradOp); +REGISTER_OPERATOR(cumsum_grad, ops::CumGradOp); REGISTER_OP_VERSION(cumsum).AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/determinant_op.cc b/paddle/fluid/operators/determinant_op.cc deleted file mode 100644 index 62cecbd36ae47b33eb77e3397dc13b5c6eda500e..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/determinant_op.cc +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_proto_maker.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/backward.h" -#include "paddle/phi/infermeta/unary.h" - -namespace paddle { -namespace operators { - -class SlogDeterminantOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "determinant"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "determinant"); - } -}; - -class SlogDeterminantOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Input", "(Tensor) The input tensor of SlogDeterminant."); - AddOutput("Out", - "(Tensor) The output tensor containing the sign of the" - "determinant and the natural logarithm" - "of the absolute value of determinant,"); - - AddComment(R"DOC( -SlogDeterminant Operator.)DOC"); - } -}; - -class SlogDeterminantGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("Input"), "Input", "Input", "SlogDeterminantGradOp"); - OP_INOUT_CHECK( - ctx->HasInput("Out"), "Input", "Out", "SlogDeterminantGradOp"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - framework::GradVarName("Out"), - "SlogDeterminantGradOp"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Input")), - "Output", - framework::GradVarName("Input"), - "SlogDeterminantGradOp"); - - ctx->SetOutputDim(framework::GradVarName("Input"), - ctx->GetInputDim("Input")); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.GetPlace()); - } -}; - -template -class SlogDeterminantGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("slogdeterminant_grad"); - grad_op->SetInput("Input", this->Input("Input")); - grad_op->SetInput("Out", this->Output("Out")); - grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - grad_op->SetOutput(framework::GradVarName("Input"), - this->InputGrad("Input")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -DECLARE_NO_NEED_BUFFER_VARS_INFERER(SlogDeterminantGradNoNeedBufferVarsInferer, - "Input"); - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -DECLARE_INFER_SHAPE_FUNCTOR(slogdeterminant, - SlogDeterminantInferShapeFunctor, - PD_INFER_META(phi::UnchangedInferMeta)); -REGISTER_OPERATOR(slogdeterminant, - ops::SlogDeterminantOp, - ops::SlogDeterminantOpMaker, - ops::SlogDeterminantGradOpMaker, - ops::SlogDeterminantGradOpMaker, - SlogDeterminantInferShapeFunctor); - -DECLARE_INFER_SHAPE_FUNCTOR(slogdeterminant_grad, - SlogDeterminantGradInferShapeFunctor, - PD_INFER_META(phi::GeneralUnaryGradInferMeta)); -REGISTER_OPERATOR(slogdeterminant_grad, - ops::SlogDeterminantGradOp, - SlogDeterminantGradInferShapeFunctor) // reuse det grad op diff --git 
a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc index 5048a40dddea4a0669291108b32e0441898f53d2..4052f3e09e0cc2f29271bd0a9977340d39af4700 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc @@ -19,6 +19,9 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/platform/complex.h" +#include "paddle/fluid/prim/api/manual/backward/composite_backward_api.h" +#include "paddle/fluid/prim/utils/static/composite_grad_desc_maker.h" +#include "paddle/fluid/prim/utils/static/desc_tensor.h" namespace paddle { namespace operators { @@ -63,6 +66,34 @@ class ElementwiseMulOpGradMaker : public framework::SingleGradOpMaker<T> { } }; +class ElementwiseMulGradCompositeOpMaker + : public prim::GradCompositeOpMakerBase { + using prim::GradCompositeOpMakerBase::GradCompositeOpMakerBase; + + public: + void Apply() override { + auto x = this->GetSingleForwardInput("X"); + auto y = this->GetSingleForwardInput("Y"); + auto out_grad = this->GetSingleOutputGrad("Out"); + auto x_grad = this->GetSingleInputGrad("X"); + auto x_grad_p = this->GetOutputPtr(&x_grad); + auto x_grad_name = this->GetOutputName(x_grad); + auto y_grad = this->GetSingleInputGrad("Y"); + auto y_grad_p = this->GetOutputPtr(&y_grad); + auto y_grad_name = this->GetOutputName(y_grad); + prim::multiply_grad<prim::DescTensor>( + x, + y, + out_grad, + static_cast<int>(this->Attr<int>("axis")), + x_grad_p, + y_grad_p); + VLOG(3) << "Running mul_grad composite func"; + this->RecoverOutputName(x_grad, x_grad_name); + this->RecoverOutputName(y_grad, y_grad_name); + } +}; + template <typename T> class ElementwiseMulDoubleGradMaker : public framework::SingleGradOpMaker<T> { public: @@ -123,7 +154,8 @@ REGISTER_OPERATOR(elementwise_mul, ops::ElementwiseMulOpMaker, ops::ElementwiseOpInferVarType, ops::ElementwiseMulOpGradMaker<paddle::framework::OpDesc>, - ops::ElementwiseMulOpGradMaker<paddle::imperative::OpBase>); + ops::ElementwiseMulOpGradMaker<paddle::imperative::OpBase>, + ops::ElementwiseMulGradCompositeOpMaker); REGISTER_OPERATOR( elementwise_mul_grad, ops::ElementwiseOpGrad, diff --git a/paddle/fluid/operators/expand_v2_op.cc b/paddle/fluid/operators/expand_v2_op.cc index cbd322f38767e9e3598a90ae28c2cd172c41552d..9a867c040fcb8ea3a238ce340ab2f5234a7424d1 100644 --- a/paddle/fluid/operators/expand_v2_op.cc +++ b/paddle/fluid/operators/expand_v2_op.cc @@ -20,6 +20,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/prim/api/manual/backward/composite_backward_api.h" +#include "paddle/fluid/prim/utils/static/composite_grad_desc_maker.h" +#include "paddle/fluid/prim/utils/static/desc_tensor.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/unary.h" @@ -190,6 +193,24 @@ class ExpandV2GradOpMaker : public framework::SingleGradOpMaker { } }; +class ExpandV2GradCompositeOpMaker : public prim::GradCompositeOpMakerBase { + using prim::GradCompositeOpMakerBase::GradCompositeOpMakerBase; + + public: + void Apply() override { + auto x = this->GetSingleForwardInput("X"); + auto out_grad = this->GetSingleOutputGrad("Out"); + auto x_grad = this->GetSingleInputGrad("X"); + auto x_grad_p = this->GetOutputPtr(&x_grad); + auto x_grad_name = this->GetOutputName(x_grad); + auto shape = this->Attr>("shape"); + prim::expand_grad( + x, out_grad, paddle::experimental::IntArray(shape), x_grad_p); + VLOG(3) << "Runing expand_v2 composite func"; + this->RecoverOutputName(x_grad, x_grad_name); + } +}; + template class ExpandV2DoubleGradOpMaker : public framework::SingleGradOpMaker { public: @@ -223,6 +244,7 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(expand_v2, ops::ExpandV2Op, ops::ExpandV2OpMaker, + ops::ExpandV2GradCompositeOpMaker, ops::ExpandV2GradOpMaker, ops::ExpandV2GradOpMaker, ExpandInferShapeFunctor); diff --git a/paddle/fluid/operators/generator/generate_op.py b/paddle/fluid/operators/generator/generate_op.py index c16be45f8f1ecda325d962f511c9750d62e61f27..e4bb7041016d21f36764fc7cce0674c56f5f1b3d 100644 --- a/paddle/fluid/operators/generator/generate_op.py +++ b/paddle/fluid/operators/generator/generate_op.py @@ -256,6 +256,16 @@ def replace_compat_name(op_fluid_map_list, forward_op_dict, backward_op_dict): op_item['no_need_buffer'] = get_param_list_alias( op_item['no_need_buffer'], args_map ) + if 'data_transform' in op_item and op_item['data_transform']: + data_trans_item = op_item['data_transform'] + if 'skip_transform' in data_trans_item: + data_trans_item['skip_transform'] = get_param_list_alias( + data_trans_item['skip_transform'], args_map + ) + if 'support_trans_dtype' in data_trans_item: + data_trans_item['support_trans_dtype'] = get_param_list_alias( + data_trans_item['support_trans_dtype'], args_map + ) process_scalar(op_item, scalar_configs) process_int_array(op_item, int_array_configs) diff --git a/paddle/fluid/operators/generator/parse_utils.py b/paddle/fluid/operators/generator/parse_utils.py index 7d153ff868eeb8285df9c2ea5b35930d7d8f3859..d5a58a2a94a0ef67ca643c846c577b971ede4a49 100644 --- a/paddle/fluid/operators/generator/parse_utils.py +++ b/paddle/fluid/operators/generator/parse_utils.py @@ -427,7 +427,41 @@ def parse_op_entry(op_entry: Dict[str, Any], name_field="op"): else: no_buffer_args = None - # TODO(chenfeiyu): data_transform + # add data_transform tag for every input. + # the format is {data_transform : {skip_transform : [x, z], support_trans_dtype : y}} + for input in inputs: + input["data_transform"] = {} + if "data_transform" in op_entry: + skip_trans_args = [] + support_trans_args = [] + data_trans = op_entry["data_transform"] + if "skip_transform" in data_trans: + skip_trans_args = parse_plain_list(data_trans["skip_transform"]) + for name in skip_trans_args: + assert ( + name in input_names + ), f"{op_name} has an skip_transform input: '{name}' which is not an input." 
+ data_trans["skip_transform"] = skip_trans_args + if "support_trans_dtype" in data_trans: + support_trans_args = parse_plain_list( + data_trans["support_trans_dtype"] + ) + for name in support_trans_args: + assert ( + name in input_names + ), f"{op_name} has an support_trans_dtype input: '{name}' which is not an input." + data_trans["support_trans_dtype"] = support_trans_args + for input in inputs: + if input["name"] in skip_trans_args: + input["data_transform"]["skip_trans_args"] = True + else: + input["data_transform"]["skip_trans_args"] = False + if input["name"] in support_trans_args: + input["data_transform"]["support_trans_dtype"] = True + else: + input["data_transform"]["support_trans_dtype"] = False + else: + data_trans = None op = { "name": op_name, @@ -435,6 +469,7 @@ def parse_op_entry(op_entry: Dict[str, Any], name_field="op"): "attrs": attrs, "outputs": outputs, "no_need_buffer": no_buffer_args, + "data_transform": data_trans, } # invokes another op ? diff --git a/paddle/fluid/operators/generator/templates/operator_utils.c.j2 b/paddle/fluid/operators/generator/templates/operator_utils.c.j2 index 37b5075235a83d7ce09f0c6fa0ebcf0463fc3cfb..000e56453d934f248ab6e427c722b997bb1d0032 100644 --- a/paddle/fluid/operators/generator/templates/operator_utils.c.j2 +++ b/paddle/fluid/operators/generator/templates/operator_utils.c.j2 @@ -117,6 +117,15 @@ static_cast(phi::Place({{"phi::" if not default_value is initializer_list}} {# --------------------------------------- name mapping ---------------------------------------------- #} {% macro name_map(op) %} +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by {{op["name"] | to_pascal_case }}OpArgumentMapping: + +{{op | cartesian_prod_mapping}} +****************************************************************** +*/ + KernelSignature {{op["op_name"] | to_pascal_case }}OpArgumentMapping(const ArgumentMappingContext& ctx) { {% set kernel_args = op["kernel"]["param"] %} {{get_input_list(op["inputs"], kernel_args)}}; @@ -136,15 +145,6 @@ KernelSignature {{op["op_name"] | to_pascal_case }}OpArgumentMapping(const Argum return sig; {%endif%} } - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by {{op["name"] | to_pascal_case }}OpArgumentMapping: - -{{op | cartesian_prod_mapping}} -****************************************************************** -*/ {% endmacro %} {% macro get_kernel_dispatch(inputs, kernel_config) %}{# inline #} @@ -172,6 +172,15 @@ ctx.IsSparseCsrTensorInput("{{input["name"]}}"){{" && " if not loop.last}} {%- endmacro %} {% macro sparse_op_name_map(op) %} +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by {{op["name"] | to_pascal_case }}OpArgumentMapping: + +{{op | cartesian_prod_mapping}} +****************************************************************** +*/ + KernelSignature {{op["op_name"] | to_pascal_case }}OpArgumentMapping(const ArgumentMappingContext& ctx) { {% set kernel_args = op["kernel"]["param"] %} {{get_input_list(op["inputs"], kernel_args)}}; @@ -188,15 +197,6 @@ KernelSignature {{op["op_name"] | to_pascal_case }}OpArgumentMapping(const Argum KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), 
std::move(outputs)); return sig; } - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by {{op["name"] | to_pascal_case }}OpArgumentMapping: - -{{op | cartesian_prod_mapping}} -****************************************************************** -*/ {% endmacro %} {% macro register_base_kernel_name(op) %} @@ -284,6 +284,32 @@ phi::KernelKey GetExpectedKernelType( } {% endmacro %} +{% macro get_kernel_for_var(op) %} {# only for data_transform #} +{% set skip_args = op["data_transform"]["skip_transform"] %} +{% set var_name = "var_name" %} +{% set skip_args_len = skip_args | length %} +phi::KernelKey GetKernelTypeForVar( + const std::string& {{var_name}}, + const phi::DenseTensor& tensor, + const phi::KernelKey& expected_kernel_type) const override { + + if ( + {%- for skip_arg in skip_args -%} + var_name == "{{ skip_arg }}" + {%- if skip_args_len != 1 and loop.index != skip_args_len %} || {% endif -%} + {%- endfor -%} + ){ + return phi::KernelKey(phi::Backend::ALL_BACKEND, + expected_kernel_type.layout(), + expected_kernel_type.dtype()); + } + else{ + return phi::KernelKey( + tensor.place(), tensor.layout(), expected_kernel_type.dtype()); + } + } +{% endmacro %} + {# --------------------------------------- operator ---------------------------------------------- #} {% macro operator(op) %} class {{op["op_name"] | to_pascal_case}}Op : public framework::OperatorWithKernel { @@ -293,9 +319,17 @@ class {{op["op_name"] | to_pascal_case}}Op : public framework::OperatorWithKerne {% set kernel = op["kernel"] %} {% if kernel["data_type"] is not none %} protected: - {% filter indent(2, True)%} + {% filter indent(2, True)%} {{get_expected_kernel(op)}} - {% endfilter %} + {% endfilter %} + {%- if "data_transform" in op and op["data_transform"] is not none -%} + {%- if "skip_transform" in op["data_transform"] -%} + {% filter indent(2, True) %} +{{get_kernel_for_var(op)}} + {% endfilter %} + {%- endif %} + {%- endif -%} +{# TODO(lizhiyu): add the 'support_trans_dtype' #} {% endif %} }; diff --git a/paddle/fluid/operators/graph_send_recv_op.cc b/paddle/fluid/operators/graph_send_recv_op.cc deleted file mode 100644 index afdbaf0ca7729e6fff508127e1f0bdd77e383311..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/graph_send_recv_op.cc +++ /dev/null @@ -1,136 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/ternary.h" - -namespace paddle { -namespace operators { - -class GraphSendRecvOP : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.device_context().GetPlace()); - } -}; - -class GraphSendRecvGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - auto in_dims = ctx->GetInputDim("X"); - ctx->SetOutputDim(framework::GradVarName("X"), in_dims); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.device_context().GetPlace()); - } -}; - -class GraphSendRecvOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "The input tensor with data type float32, float64, int32, int64."); - AddInput("Src_index", "The source index tensor."); - AddInput("Dst_index", "The destination index tensor."); - AddInput("Out_size", - "(Tensor, optional). The 0th dimension of the output." - "It has a higher priority than Attr(out_size).") - .AsDispensable(); - AddOutput("Out", "Output tensor of graph_send_recv op."); - AddOutput("Dst_count", - "Count tensor of Dst_index, mainly for MEAN reduce_op.") - .AsIntermediate(); - AddAttr("reduce_op", - "(string, default 'SUM')" - "Define different pool types to receive the result " - "tensors of Dst_index.") - .SetDefault("SUM") - .InEnum({"SUM", "MEAN", "MIN", "MAX"}); - AddAttr>( - "out_size", - "(vector, default {0})" - "Define the first dimension of Output tensor." - "If set default {0}, then the shape of Out is the same with X.") - .SetDefault({0}); - AddComment(R"DOC( -Graph Learning Send_Recv combine operator. - -$Out = Recv(Send(X, Src_index), Dst_index, reduce_op)$ - -This operator is mainly used in Graph Learning domain, and the main purpose is to reduce -intermediate memory consumption in the process of message passing. -Take `x` as the input tensor, we first use `src_index` to gather corresponding data, -and then use `dst_index` to update the corresponding position of output tensor in different -pooling types, like sum, mean, max, or min. 
- -)DOC"); - } -}; - -template -class GraphSendRecvGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("graph_send_recv_grad"); - op->SetInput("Src_index", this->Input("Src_index")); - op->SetInput("Dst_index", this->Input("Dst_index")); - op->SetInput("X", this->Input("X")); - - if (PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MEAN") { - op->SetInput("Dst_count", this->Output("Dst_count")); - } - - if (PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MIN" || - PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MAX") { - op->SetInput("Out", this->Output("Out")); - } - - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -DECLARE_INFER_SHAPE_FUNCTOR(graph_send_recv, - GraphSendRecvInferShapeFunctor, - PD_INFER_META(phi::SendURecvInferMeta)); -REGISTER_OPERATOR(graph_send_recv, - ops::GraphSendRecvOP, - ops::GraphSendRecvOpMaker, - ops::GraphSendRecvGradOpMaker, - ops::GraphSendRecvGradOpMaker, - GraphSendRecvInferShapeFunctor); -REGISTER_OPERATOR(graph_send_recv_grad, ops::GraphSendRecvGradOp); diff --git a/paddle/fluid/operators/graph_send_ue_recv_op.cc b/paddle/fluid/operators/graph_send_ue_recv_op.cc deleted file mode 100644 index 2a252bcf70368cd53bf2b3f597fb281f5e52f148..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/graph_send_ue_recv_op.cc +++ /dev/null @@ -1,149 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/multiary.h" - -namespace paddle { -namespace operators { - -class GraphSendUERecvOP : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.device_context().GetPlace()); - } -}; - -class GraphSendUERecvGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - auto in_dims = ctx->GetInputDim("X"); - ctx->SetOutputDim(framework::GradVarName("X"), in_dims); - auto y_dims = ctx->GetInputDim("Y"); - ctx->SetOutputDim(framework::GradVarName("Y"), y_dims); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.device_context().GetPlace()); - } -}; - -class GraphSendUERecvOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "The input tensor with data type float32, float64, int32, int64."); - AddInput("Y", - "The input edge weight tensor, data type should be same with X"); - AddInput("Src_index", "The source index tensor."); - AddInput("Dst_index", "The destination index tensor."); - AddInput("Out_size", - "(Tensor, optional). The 0th dimension of the output." - "It has a higher priority than Attr(out_size).") - .AsDispensable(); - AddOutput("Out", "Output tensor of graph_send_ue_recv op."); - AddOutput("Dst_count", - "Count tensor of Dst_index, mainly for MEAN reduce_op.") - .AsIntermediate(); - AddAttr("message_op", - "(string, default 'ADD')" - "Define differenct computation types between X and E.") - .SetDefault("ADD") - .InEnum({"ADD", "MUL"}); - AddAttr("reduce_op", - "(string, default 'SUM')" - "Define different pool types to receive the result " - "tensors of Dst_index.") - .SetDefault("SUM") - .InEnum({"SUM", "MEAN", "MIN", "MAX"}); - AddAttr>( - "out_size", - "(vector, default {0})" - "Define the first dimension of Output tensor." - "If set default {0}, then the shape of Out is the same with X.") - .SetDefault({0}); - AddComment(R"DOC( -Graph Learning Send_UE_Recv combine operator. - -$Out = Recv(Compute(Send(X, Src_index), Y, message_op), Dst_index, reduce_op)$ - -This operator is mainly used in Graph Learning domain, and the main purpose is to reduce -intermediate memory consumption in the process of message passing. - -Take `X` as the input tensor, we first use `src_index` to gather corresponding data. -Then the gather data should compute with `Y` in different message_ops, like add, sub, mul, and div, -and get the computation result. Then, use `dst_index` to update the corresponding position of output -tensor in different pooling types, like sum, mean, max, or min. 
- -)DOC"); - } -}; - -template -class GraphSendUERecvGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("graph_send_ue_recv_grad"); - op->SetInput("X", this->Input("X")); - op->SetInput("Y", this->Input("Y")); - op->SetInput("Src_index", this->Input("Src_index")); - op->SetInput("Dst_index", this->Input("Dst_index")); - - if (PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MEAN") { - op->SetInput("Dst_count", this->Output("Dst_count")); - } - - if (PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MIN" || - PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MAX") { - op->SetInput("Out", this->Output("Out")); - } - - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y")); - op->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -DECLARE_INFER_SHAPE_FUNCTOR(graph_send_ue_recv, - GraphSendUERecvInferShapeFunctor, - PD_INFER_META(phi::SendUERecvInferMeta)); -REGISTER_OPERATOR(graph_send_ue_recv, - ops::GraphSendUERecvOP, - ops::GraphSendUERecvOpMaker, - ops::GraphSendUERecvGradOpMaker, - ops::GraphSendUERecvGradOpMaker, - GraphSendUERecvInferShapeFunctor); -REGISTER_OPERATOR(graph_send_ue_recv_grad, ops::GraphSendUERecvGradOp); diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index afef765c6ff71ce9f1a97e915d4f933558c138d2..2b337887faa3f8fe2dde678fa6caec12256adeb3 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -84,7 +84,7 @@ class ReduceSumCompositeGradOpMaker : public prim::GradCompositeOpMakerBase { // get output orginal name std::string x_grad_name = this->GetOutputName(x_grad_t); - + VLOG(3) << "Runing sum_grad composite func"; // call composite backward func prim::sum_grad( x, out_grad, axis, keep_dim, reduce_all, x_grad); diff --git a/paddle/fluid/operators/select_output_op.cc b/paddle/fluid/operators/select_output_op.cc index f57933bab0c0b8b18334bc6e9971e663275c3a3b..e70c59b20a64f02ae2d15787a56873c5d7c7f9ed 100644 --- a/paddle/fluid/operators/select_output_op.cc +++ b/paddle/fluid/operators/select_output_op.cc @@ -95,7 +95,8 @@ class SelectOutputInferShape : public framework::InferShapeBase { void operator()(framework::InferShapeContext *context) const override { OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "SelectOutput"); OP_INOUT_CHECK(context->HasInput("Mask"), "Input", "Mask", "SelectOutput"); - OP_INOUT_CHECK(context->HasOutputs("Out"), "Output", "Out", "SelectOutput"); + OP_INOUT_CHECK( + context->HasOutputs("Out", true), "Output", "Out", "SelectOutput"); } }; diff --git a/paddle/fluid/operators/size_op.cc b/paddle/fluid/operators/size_op.cc deleted file mode 100644 index 695807a4c3c6e529c4b8222ef410253ca69b2d09..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/size_op.cc +++ /dev/null @@ -1,75 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/unary.h" - -namespace paddle { -namespace operators { - -class SizeOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto dtype = framework::proto::VarType::FP32; // dtype is not important - return phi::KernelKey(dtype, ctx.GetPlace()); - } - - phi::KernelKey GetKernelTypeForVar( - const std::string& var_name, - const phi::DenseTensor& tensor, - const phi::KernelKey& expected_kernel_type) const override { - return phi::KernelKey(phi::Backend::ALL_BACKEND, - expected_kernel_type.layout(), - expected_kernel_type.dtype()); - } -}; - -class SizeOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Input", "The input tensor."); - AddOutput("Out", - "The returned tensor, the data type " - "is int64_t, will be on the same device with the input Tensor."); - AddComment(R"DOC( -Size Operator. - -Return the number of elements in the input. -)DOC"); - } -}; - -DECLARE_NO_NEED_BUFFER_VARS_INFERER(SizeOpNoNeedBufferVarInferer, "Input"); - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -DECLARE_INFER_SHAPE_FUNCTOR(size, - SizeInferShapeFunctor, - PD_INFER_META(phi::NumelInferMeta)); -REGISTER_OPERATOR( - size, - ops::SizeOp, - ops::SizeOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker, - SizeInferShapeFunctor, - ops::SizeOpNoNeedBufferVarInferer); diff --git a/paddle/fluid/operators/string/faster_tokenizer_op.cc b/paddle/fluid/operators/string/faster_tokenizer_op.cc index 35128b0085687e8b8e9f5d27d2cd03c364d2928e..97bdd2784e9ca5efe05398c0d244aacd80e25473 100644 --- a/paddle/fluid/operators/string/faster_tokenizer_op.cc +++ b/paddle/fluid/operators/string/faster_tokenizer_op.cc @@ -407,8 +407,8 @@ int BertTokenizer::Encode( void BertTokenizer::BatchEncode( vector>>* batch_encode_inputs, - const vector& batch_text, - const vector& batch_text_pair /* = vector() */, + const framework::Strings& batch_text, + const framework::Strings& batch_text_pair /* = vector() */, bool is_split_into_words /* = false */, const size_t max_seq_len /* = 0 */, bool pad_to_max_seq_len /* = false */) const { diff --git a/paddle/fluid/operators/string/faster_tokenizer_op.h b/paddle/fluid/operators/string/faster_tokenizer_op.h index 0c9819025241949ffcdd5e751ca2d74244ddbcf0..789d54852c27e6acea207a177826754a9383ca0b 100644 --- a/paddle/fluid/operators/string/faster_tokenizer_op.h +++ b/paddle/fluid/operators/string/faster_tokenizer_op.h @@ -100,8 +100,8 @@ class BertTokenizer { bool pad_to_max_seq_len = false) const; void BatchEncode( vector>>* batch_encode_inputs, - const vector& batch_text, - const vector& batch_text_pair = vector(), + const framework::Strings& batch_text, + const framework::Strings& batch_text_pair = framework::Strings(), bool 
is_split_into_words = false, const size_t max_seq_len = 0, bool pad_to_max_seq_len = false) const; @@ -162,7 +162,7 @@ class FasterTokenizerKernel : public framework::OpKernel { } else { tokenizer.BatchEncode(&batch_encode_inputs, *text, - vector(), + framework::Strings(), is_split_into_words, max_seq_len, pad_to_max_seq_len); diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc index 7a5acb762eb83bbc52254cc2427938a1c8f0ba39..2f14a23168533cfdf34072b30a26b186d039d2c1 100644 --- a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc +++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc @@ -18,6 +18,7 @@ #include "paddle/phi/backends/all_context.h" DECLARE_bool(use_stream_safe_cuda_allocator); +DECLARE_bool(new_executor_use_cuda_graph); namespace paddle { namespace platform { @@ -43,7 +44,10 @@ void BeginCUDAGraphCapture(phi::GPUPlace place, auto stream = dev_ctx->stream(); CUDAGraph::BeginCapture(place, stream, mode); - auto old_value = FLAGS_use_stream_safe_cuda_allocator; + // When using cuda graph in new executor, fast GC must be used. + // FLAGS_use_stream_safe_cuda_allocator should be true. + auto old_value = FLAGS_use_stream_safe_cuda_allocator && + !FLAGS_new_executor_use_cuda_graph; if (old_value) { FLAGS_use_stream_safe_cuda_allocator = false; } diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc index cebb36cbc6462ba05c484d84b843b3377a2af149..a49d9013fb6d0c50fdddc09b9da104355f37b4b0 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc @@ -20,6 +20,7 @@ namespace platform { void CudaProfilerInit(const std::string& output_file, const std::string& output_mode, const std::string& config_file) { +#if CUDA_VERSION < 11000 PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv", platform::errors::InvalidArgument( "Unsupported cuda profiler output mode, expect `kvp` or " @@ -28,6 +29,7 @@ void CudaProfilerInit(const std::string& output_file, cudaOutputMode_t mode = output_mode == "csv" ? 
cudaCSV : cudaKeyValuePair; PADDLE_ENFORCE_GPU_SUCCESS( cudaProfilerInitialize(config_file.c_str(), output_file.c_str(), mode)); +#endif } void CudaProfilerStart() { PADDLE_ENFORCE_GPU_SUCCESS(cudaProfilerStart()); } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 2c90acd6100980ec40d42393c8b8af5472b1f510..94236fcff1a4381bde7a30b136d415bee81e43e2 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -183,6 +183,7 @@ class XPUDeviceContext : public phi::XPUContext { virtual ~XPUDeviceContext(); Eigen::DefaultDevice* eigen_device() const { return nullptr; } xpuStream stream() const { return XPUContext::x_context()->xpu_stream; } + void CreateStream() { XPUContext::CreateStream(); } }; template <> diff --git a/paddle/fluid/prim/api/.gitignore b/paddle/fluid/prim/api/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..377e800f00a0e08893961c1910c9b479e3143181 --- /dev/null +++ b/paddle/fluid/prim/api/.gitignore @@ -0,0 +1,3 @@ +generated/prim_api/eager_prim_api.cc +generated/prim_api/tmp_eager_prim_api.cc +generated/prim_api/*.h diff --git a/paddle/fluid/prim/api/CMakeLists.txt b/paddle/fluid/prim/api/CMakeLists.txt index 534ddec6b5c3cbd2e04a15c23d433444a3208b3c..436cecc32582b39cfe08b2a06f9d4dba55387f50 100644 --- a/paddle/fluid/prim/api/CMakeLists.txt +++ b/paddle/fluid/prim/api/CMakeLists.txt @@ -1,4 +1,7 @@ +add_subdirectory(auto_code_generated) add_subdirectory(manual) +add_subdirectory(generated) + if(NOT (NOT WITH_PYTHON AND ON_INFER)) cc_library( prim_api diff --git a/paddle/fluid/prim/api/all.h b/paddle/fluid/prim/api/all.h index 308eb91b4f11797c6e9a826b9d290eab951b7cf0..2996d2aa2657c8b8c09cfabd30daa7c2adf707b6 100644 --- a/paddle/fluid/prim/api/all.h +++ b/paddle/fluid/prim/api/all.h @@ -13,6 +13,6 @@ // limitations under the License. 
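+// NOTE: prim api declarations generated from the parsed ops yaml live under
+// generated/prim_api, while hand-written composite backward rules and utils
+// stay under manual/; this header aggregates both (see prim/api/CMakeLists.txt).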
#pragma once +#include "paddle/fluid/prim/api/generated/prim_api/prim_api.h" #include "paddle/fluid/prim/api/manual/backward/composite_backward_api.h" -#include "paddle/fluid/prim/api/manual/prim_api/prim_api.h" #include "paddle/fluid/prim/api/manual/utils/utils.h" diff --git a/paddle/fluid/prim/api/auto_code_generated/CMakeLists.txt b/paddle/fluid/prim/api/auto_code_generated/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..e36af681bbd89589d58e5a7003beacb83ff08c24 --- /dev/null +++ b/paddle/fluid/prim/api/auto_code_generated/CMakeLists.txt @@ -0,0 +1,38 @@ +set(api_yaml_path + "${PADDLE_SOURCE_DIR}/paddle/fluid/operators/generator/parsed_ops/ops.parsed.yaml" +) +set(legacy_api_yaml_path + "${PADDLE_SOURCE_DIR}/paddle/fluid/operators/generator/parsed_ops/legacy_ops.parsed.yaml" +) +set(tmp_eager_prim_api_cc_path + "${PADDLE_SOURCE_DIR}/paddle/fluid/prim/api/generated/prim_api/tmp_eager_prim_api.cc" +) +set(tmp_prim_api_h_path + "${PADDLE_SOURCE_DIR}/paddle/fluid/prim/api/generated/prim_api/tmp_prim_api.h" +) +set(eager_prim_api_cc_path + "${PADDLE_SOURCE_DIR}/paddle/fluid/prim/api/generated/prim_api/eager_prim_api.cc" +) +set(prim_api_h_path + "${PADDLE_SOURCE_DIR}/paddle/fluid/prim/api/generated/prim_api/prim_api.h") +set(prim_api_gen_file + ${PADDLE_SOURCE_DIR}/paddle/fluid/prim/api/auto_code_generated/prim_gen.py) + +message("prim api Code gen") +execute_process( + WORKING_DIRECTORY + ${CMAKE_SOURCE_DIR}/paddle/fluid/prim/api/auto_code_generated + COMMAND + ${PYTHON_EXECUTABLE} ${prim_api_gen_file} --api_yaml_path + ${legacy_api_yaml_path} ${api_yaml_path} --prim_api_header_path + ${tmp_prim_api_h_path} --eager_prim_api_source_path + ${tmp_eager_prim_api_cc_path} + RESULT_VARIABLE _result) +if(${_result}) + message(FATAL_ERROR "prim api generate failed, exiting.") +endif() +execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different + ${tmp_prim_api_h_path} ${prim_api_h_path}) +execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different + ${tmp_eager_prim_api_cc_path} ${eager_prim_api_cc_path}) +message("copy tmp_xxx_prim_api to xxx_prim_api") diff --git a/paddle/fluid/prim/api/auto_code_generated/prim_base.py b/paddle/fluid/prim/api/auto_code_generated/prim_base.py new file mode 100644 index 0000000000000000000000000000000000000000..d1ad94a7c3e40d66f402e4daa84fa76cbd7e728f --- /dev/null +++ b/paddle/fluid/prim/api/auto_code_generated/prim_base.py @@ -0,0 +1,342 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
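+# Sketch of the generated output (illustrative; based on the "pow" entry in
+# white_ops_list below): gene_prim_api_declaration() renders a templated
+# declaration such as
+#
+#     template <typename T>
+#     Tensor pow(const Tensor& x, const Scalar& y);
+#
+# and gene_eager_prim_api_code() renders the eager specialization that simply
+# forwards to ::pow_ad_func(x, y).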
+ + +# prim api list +white_ops_list = [ + "pow", + "scale", + "multiply", + "unsqueeze", + "expand", + "full", + "reshape", + "divide", + "sum", + "exp", +] + +inplace_out_type_map = { + "Tensor": "Tensor&", + "std::vector<Tensor>": "std::vector<Tensor>&", +} + +inplace_optional_out_type_map = { + "Tensor": "paddle::optional<Tensor>&", + "std::vector<Tensor>": "paddle::optional<std::vector<Tensor>>&", +} + + +class BaseAPI: + def __init__(self, api_item_yaml): + # self.api = api_item_yaml['op'] + self.api = api_item_yaml['name'] + + self.is_prim_api = False + if api_item_yaml['name'] in white_ops_list: + self.is_prim_api = True + + ####################################### + # inputs: + # names : [], list of input names + # input_info : {input_name : type} + # attrs: + # names : [], list of attribute names + # attr_info : { attr_name : (type, default_values)} + # outputs: + # names : [], list of output names + # types : [], list of output types + # out_size_expr : [], expression for getting size of vector + ######################################## + if self.is_prim_api: + ( + self.inputs, + self.attrs, + self.outputs, + self.optional_vars, + ) = self.parse_args(self.api, api_item_yaml) + + self.inplace_map = api_item_yaml['inplace'] + + def get_api_func_name(self): + return self.api + + # def is_inplace(self): + # if self.inplace_map + # return True + # return False + + def get_input_tensor_args(self, inplace_flag=False): + input_args = [] + inplace_type_map = { + "const Tensor&": "Tensor&", + "const paddle::optional<Tensor>&": "paddle::optional<Tensor>&", + "const std::vector<Tensor>&": "std::vector<Tensor>&", + "const paddle::optional<std::vector<Tensor>>&": "paddle::optional<std::vector<Tensor>>&", + } + for name in self.inputs['names']: + name = name.split('@')[0] + if inplace_flag and name in self.inplace_map.values(): + input_args.append( + inplace_type_map[self.inputs['input_info'][name]] + + ' ' + + name + ) + else: + input_args.append(self.inputs['input_info'][name] + ' ' + name) + return input_args + + def get_declare_args(self, inplace_flag=False): + declare_args = self.get_input_tensor_args(inplace_flag) + for name in self.attrs['names']: + default_value = '' + if self.attrs['attr_info'][name][1] is not None: + default_value = ' = ' + self.attrs['attr_info'][name][1] + declare_args.append( + self.attrs['attr_info'][name][0] + ' ' + name + default_value + ) + + return ", ".join(declare_args) + + def get_declare_args_nodefault(self, inplace_flag=False): + declare_args = self.get_input_tensor_args(inplace_flag) + for name in self.attrs['names']: + declare_args.append(self.attrs['attr_info'][name][0] + ' ' + name) + + return ", ".join(declare_args) + + def get_return_type(self, inplace_flag=False): + out_type_list = [] + for i, out_type in enumerate(self.outputs['types']): + out_name = self.outputs['names'][i].split('@')[0] + if inplace_flag and out_name in self.inplace_map: + if self.inplace_map[out_name] in self.optional_vars: + out_type_list.append( + inplace_optional_out_type_map[out_type] + ) + else: + out_type_list.append(inplace_out_type_map[out_type]) + else: + out_type_list.append(out_type) + if len(out_type_list) == 1: + return out_type_list[0] + else: + return "std::tuple<" + ", ".join(out_type_list) + ">" + + def parse_args(self, api_name, api_item_yaml): + optional_vars = [] + for input_dict in api_item_yaml['inputs']: + if input_dict['optional']: + optional_vars.append(input_dict['name']) + + inputs, attrs = self.parse_input_and_attr( + api_item_yaml['inputs'], api_item_yaml['attrs'] + ) + + output_type_list, output_names, out_size_expr = self.parse_output( + api_item_yaml['outputs'] + ) +
return ( + inputs, + attrs, + { + 'names': output_names, + 'types': output_type_list, + 'out_size_expr': out_size_expr, + }, + optional_vars, + ) + + def parse_input_and_attr(self, inputs_list, attrs_list): + input_types_map = { + 'Tensor': 'const Tensor&', + 'Tensor[]': 'const std::vector<Tensor>&', + } + attr_types_map = { + 'IntArray': 'const IntArray&', + 'Scalar': 'const Scalar&', + 'Scalar(int)': 'const Scalar&', + 'Scalar(int64_t)': 'const Scalar&', + 'Scalar(float)': 'const Scalar&', + 'Scalar(double)': 'const Scalar&', + 'Scalar[]': 'const std::vector<phi::Scalar>&', + 'int': 'int', + 'int32_t': 'int32_t', + 'int64_t': 'int64_t', + 'long': 'long', + 'size_t': 'size_t', + 'float': 'float', + 'float[]': 'const std::vector<float>&', + 'double': 'double', + 'bool': 'bool', + 'bool[]': 'const std::vector<bool>&', + 'str': 'const std::string&', + 'str[]': 'const std::vector<std::string>&', + 'Place': 'const Place&', + 'DataLayout': 'DataLayout', + 'DataType': 'DataType', + 'int64_t[]': 'const std::vector<int64_t>&', + 'int[]': 'const std::vector<int>&', + } + optional_types_trans = { + 'Tensor': 'const paddle::optional<Tensor>&', + 'Tensor[]': 'const paddle::optional<std::vector<Tensor>>&', + 'int': 'paddle::optional<int>', + 'int32_t': 'paddle::optional<int32_t>', + 'int64_t': 'paddle::optional<int64_t>', + 'float': 'paddle::optional<float>', + 'double': 'paddle::optional<double>', + 'bool': 'paddle::optional<bool>', + 'Place': 'paddle::optional<Place>', + 'DataLayout': 'paddle::optional<DataLayout>', + 'DataType': 'paddle::optional<DataType>', + } + + inputs = {'names': [], 'input_info': {}} + for input_dict in inputs_list: + inputs['names'].append(input_dict['name']) + if input_dict['optional']: + inputs['input_info'][input_dict['name']] = optional_types_trans[ + input_dict['typename'] + ] + else: + inputs['input_info'][input_dict['name']] = input_types_map[ + input_dict['typename'] + ] + + attrs = {'names': [], 'attr_info': {}} + for attr_dict in attrs_list: + attrs['names'].append(attr_dict['name']) + if 'default_value' in attr_dict.keys(): + default_value = attr_dict['default_value'] + else: + default_value = None + + if 'optional' in attr_dict.keys(): + attrs['attr_info'][attr_dict['name']] = ( + optional_types_trans[attr_dict['typename']], + default_value, + ) + else: + attrs['attr_info'][attr_dict['name']] = ( + attr_types_map[attr_dict['typename']], + default_value, + ) + return inputs, attrs + + def parse_output(self, outputs_list): + + out_type_list = [] + out_name_list = [] + out_size_expr_list = [] + for output_dict in outputs_list: + if output_dict['intermediate']: + continue + out_type_list.append(output_dict['typename']) + out_name_list.append(output_dict['name']) + if 'size' in output_dict.keys(): + out_size_expr_list.append(output_dict['size']) + else: + out_size_expr_list.append(None) + return out_type_list, out_name_list, out_size_expr_list + + +class EagerPrimAPI(BaseAPI): + def __init__(self, api_item_yaml): + super().__init__(api_item_yaml) + + def get_api__func_name(self): + api_func_name = self.api + # if self.is_inplace: + # if api_func_name[-1] != '_': + # api_func_name += '_' + # print("after api name", api_func_name) + return api_func_name + + def gene_prim_api_declaration(self): + api_declaration = "" + api_func_name = self.get_api__func_name() + if api_func_name[-1] != '_': + api_declaration = f""" +template <typename T> +{self.get_return_type()} {api_func_name}({self.get_declare_args()}); +""" + else: + api_declaration = ( + api_declaration + + f""" +template <typename T> +{self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_declare_args(inplace_flag=True)}); +""" + ) + + return api_declaration + + def
get_ad_func_input_args(self, inplace_flag=False): + input_args = [] + for name in self.inputs['names']: + # inplace and non-inplace calls forward the same argument name + input_args.append(name.split('@')[0]) + return input_args + + def get_ad_func_args(self, inplace_flag=False): + ad_func_args = self.get_ad_func_input_args(inplace_flag) + # attribute default values are not needed when forwarding a call + for name in self.attrs['names']: + ad_func_args.append(name) + + ad_func_args_str = ", ".join(ad_func_args) + return ad_func_args_str + + def gene_ad_func_call(self): + api_func_name = self.get_api__func_name() + + dygraph_ad_func_name = '::' + api_func_name + '_ad_func' + dygraph_ad_func_parameters = self.get_ad_func_args() + + ad_func_call_str = f""" +VLOG(4) << "Eager Prim API {api_func_name}_ad_func call"; +return {dygraph_ad_func_name}({dygraph_ad_func_parameters}); +""" + # print("ad_func_call_str: ", ad_func_call_str) + return ad_func_call_str + + def gene_eager_prim_api_code(self): + api_code = "" + api_func_name = self.get_api__func_name() + template = '<Tensor>' + # func declaration + if api_func_name[-1] != '_': + api_code = f""" +template <> +{self.get_return_type()} {api_func_name}{template}({self.get_declare_args_nodefault()}) +""" + else: + api_code = f""" +template <> +{self.get_return_type(inplace_flag=True)} {api_func_name}{template}({self.get_declare_args_nodefault(inplace_flag=True)}) +""" + # func code + + api_code = api_code + '{' + api_code += f"""{self.gene_ad_func_call()}""" + api_code += '}' + '\n' + + return api_code diff --git a/paddle/fluid/prim/api/auto_code_generated/prim_gen.py b/paddle/fluid/prim/api/auto_code_generated/prim_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..7bc59df4f33d2de7bdbf76737461f0b848865c36 --- /dev/null +++ b/paddle/fluid/prim/api/auto_code_generated/prim_gen.py @@ -0,0 +1,132 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
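+# Usage sketch (mirrors the invocation in auto_code_generated/CMakeLists.txt;
+# paths abbreviated for illustration):
+#
+#   python prim_gen.py \
+#     --api_yaml_path legacy_ops.parsed.yaml ops.parsed.yaml \
+#     --prim_api_header_path tmp_prim_api.h \
+#     --eager_prim_api_source_path tmp_eager_prim_api.cc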
+ +import argparse + +import yaml +from prim_base import EagerPrimAPI + + +def header_include(): + return """ +#include "paddle/phi/common/int_array.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/common/place.h" +#include "paddle/utils/optional.h" +""" + + +def eager_source_include(header_file_path): + return """ +#include "paddle/fluid/eager/api/all.h" +#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" +#include "paddle/fluid/prim/api/generated/prim_api/prim_api.h" +""" + + +def api_namespace(): + return ( + """ +namespace paddle { +namespace prim { +""", + """ +using Tensor = paddle::experimental::Tensor; +using Scalar = paddle::experimental::Scalar; +using IntArray = paddle::experimental::IntArray; +using DataType = paddle::experimental::DataType; +""", + """ +} // namespace prim +} // namespace paddle +""", + ) + + +def generate_api(api_yaml_path, header_file_path, eager_prim_source_file_path): + apis = [] + + for each_api_yaml in api_yaml_path: + with open(each_api_yaml, 'r') as f: + api_list = yaml.load(f, Loader=yaml.FullLoader) + if api_list: + apis.extend(api_list) + + header_file = open(header_file_path, 'w') + eager_prim_source_file = open(eager_prim_source_file_path, 'w') + + namespace = api_namespace() + + header_file.write("#pragma once\n") + header_file.write(header_include()) + header_file.write(namespace[0]) + header_file.write(namespace[1]) + # eager_source_include() currently hard-codes its include list and + # ignores this argument; keep the directive well-formed regardless. + include_header_file = ( + '#include "paddle/fluid/prim/api/generated/prim_api/prim_api.h"' + ) + eager_prim_source_file.write(eager_source_include(include_header_file)) + eager_prim_source_file.write(namespace[0]) + + for api in apis: + prim_api = EagerPrimAPI(api) + if prim_api.is_prim_api: + header_file.write(prim_api.gene_prim_api_declaration()) + eager_prim_source_file.write(prim_api.gene_eager_prim_api_code()) + + header_file.write(namespace[2]) + eager_prim_source_file.write(namespace[2]) + + header_file.close() + eager_prim_source_file.close() + + +def main(): + parser = argparse.ArgumentParser( + description='Generate PaddlePaddle C++ API files' + ) + parser.add_argument( + '--api_yaml_path', + help='path to api yaml file', + nargs='+', + default=['paddle/phi/api/yaml/ops.yaml'], + ) + + parser.add_argument( + '--prim_api_header_path', + help='output of generated prim_api header code file', + default='paddle/fluid/prim/api/generated/prim_api/prim_api.h', + ) + + parser.add_argument( + '--eager_prim_api_source_path', + help='output of generated eager_prim_api source code file', + default='paddle/fluid/prim/api/generated/prim_api/eager_prim_api.cc', + ) + + options = parser.parse_args() + + api_yaml_path = options.api_yaml_path + prim_api_header_file_path = options.prim_api_header_path + eager_prim_api_source_file_path = options.eager_prim_api_source_path + + generate_api( + api_yaml_path, + prim_api_header_file_path, + eager_prim_api_source_file_path, + ) + + +if __name__ == '__main__': + main() diff --git a/paddle/fluid/prim/api/generated/CMakeLists.txt b/paddle/fluid/prim/api/generated/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..a1b75527c20b49d688bde9ea120a74046a411123 --- /dev/null +++ b/paddle/fluid/prim/api/generated/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(prim_api) diff --git a/paddle/fluid/prim/api/manual/prim_api/CMakeLists.txt b/paddle/fluid/prim/api/generated/prim_api/CMakeLists.txt similarity index 100% rename from paddle/fluid/prim/api/manual/prim_api/CMakeLists.txt rename
to paddle/fluid/prim/api/generated/prim_api/CMakeLists.txt diff --git a/paddle/fluid/prim/api/manual/prim_api/static_prim_api.cc b/paddle/fluid/prim/api/generated/prim_api/static_prim_api.cc similarity index 85% rename from paddle/fluid/prim/api/manual/prim_api/static_prim_api.cc rename to paddle/fluid/prim/api/generated/prim_api/static_prim_api.cc index 0bf14b5955ba5c028d40eb38d6387a9a233e592e..fd309750ed6014048421d370501bad0a1fe71eff 100644 --- a/paddle/fluid/prim/api/manual/prim_api/static_prim_api.cc +++ b/paddle/fluid/prim/api/generated/prim_api/static_prim_api.cc @@ -25,6 +25,7 @@ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/prim/api/generated/prim_api/prim_api.h" #include "paddle/fluid/prim/api/manual/prim_api/prim_api.h" #include "paddle/fluid/prim/api/manual/utils/utils.h" #include "paddle/fluid/prim/utils/static/composite_grad_desc_maker.h" @@ -37,7 +38,7 @@ namespace paddle { namespace prim { template <> -Tensor pow(const Tensor& x, const paddle::experimental::Scalar& y) { +Tensor pow(const Tensor& x, const Scalar& y) { Tensor out = empty({}, phi::DataType::FLOAT32, paddle::Place()); framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock(); framework::OpDesc* op = block->AppendOp(); @@ -55,7 +56,7 @@ Tensor pow(const Tensor& x, const paddle::experimental::Scalar& y) { template <> Tensor scale(const Tensor& x, - const paddle::experimental::Scalar& scale, + const Scalar& scale, float bias, bool bias_after_scale) { Tensor out = empty({}, phi::DataType::FLOAT32, paddle::Place()); @@ -95,63 +96,63 @@ Tensor multiply(const Tensor& x, const Tensor& y) { } template <> -Tensor expand(const Tensor& x, const IntArray& shape) { +Tensor unsqueeze(const Tensor& x, const IntArray& axis) { Tensor out = empty({}, phi::DataType::FLOAT32, paddle::Place()); framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock(); framework::OpDesc* op = block->AppendOp(); - op->SetType("expand_v2"); + op->SetType("unsqueeze2"); op->SetInput("X", {std::static_pointer_cast(x.impl())->Name()}); op->SetOutput( "Out", {std::static_pointer_cast(out.impl())->Name()}); - std::vector new_shape(shape.GetData().begin(), shape.GetData().end()); - op->SetAttr("shape", new_shape); + std::vector new_shape(axis.GetData().begin(), axis.GetData().end()); + op->SetAttr("axes", new_shape); op->CheckAttrs(); op->InferVarType(block); return out; } template <> -Tensor divide(const Tensor& x, const Tensor& y) { - // Grad infershape +Tensor expand(const Tensor& x, const IntArray& shape) { Tensor out = empty({}, phi::DataType::FLOAT32, paddle::Place()); framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock(); framework::OpDesc* op = block->AppendOp(); - op->SetType("elementwise_div"); + op->SetType("expand_v2"); op->SetInput("X", {std::static_pointer_cast(x.impl())->Name()}); - op->SetInput("Y", - {std::static_pointer_cast(y.impl())->Name()}); op->SetOutput( "Out", {std::static_pointer_cast(out.impl())->Name()}); + std::vector new_shape(shape.GetData().begin(), shape.GetData().end()); + op->SetAttr("shape", new_shape); op->CheckAttrs(); op->InferVarType(block); - op->InferShape(*block); return out; } template <> -Tensor unsqueeze(const Tensor& x, const IntArray& axis) { +Tensor divide(const Tensor& x, const Tensor& y) { + // Grad infershape Tensor out = empty({}, phi::DataType::FLOAT32, paddle::Place()); framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock(); framework::OpDesc* op = 
block->AppendOp(); - op->SetType("unsqueeze2"); + op->SetType("elementwise_div"); op->SetInput("X", {std::static_pointer_cast(x.impl())->Name()}); + op->SetInput("Y", + {std::static_pointer_cast(y.impl())->Name()}); op->SetOutput( "Out", {std::static_pointer_cast(out.impl())->Name()}); - std::vector new_shape(axis.GetData().begin(), axis.GetData().end()); - op->SetAttr("axes", new_shape); op->CheckAttrs(); op->InferVarType(block); + op->InferShape(*block); return out; } template <> -Tensor full(paddle::experimental::IntArray shape, - paddle::experimental::Scalar value, - paddle::experimental::DataType dtype, - paddle::platform::Place place) { +Tensor full(const IntArray& shape, + const Scalar& value, + DataType dtype, + const Place& place) { // Grad infershape Tensor out = empty({}, dtype, place); framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock(); @@ -159,9 +160,8 @@ Tensor full(paddle::experimental::IntArray shape, op->SetType("fill_constant"); op->SetAttr("shape", shape.GetData()); PADDLE_ENFORCE_EQ( - ((dtype == paddle::experimental::DataType::FLOAT32) || - (dtype == paddle::experimental::DataType::FLOAT64) || - (dtype == paddle::experimental::DataType::FLOAT16)), + ((dtype == DataType::FLOAT32) || (dtype == DataType::FLOAT64) || + (dtype == DataType::FLOAT16)), true, phi::errors::InvalidArgument( "We only support float32/float16 for full, but we got data type: %s", @@ -177,9 +177,9 @@ Tensor full(paddle::experimental::IntArray shape, } template <> -Tensor sum(Tensor x, - paddle::experimental::IntArray axis, - paddle::experimental::DataType dtype, +Tensor sum(const Tensor& x, + const IntArray& axis, + DataType dtype, bool keepdim) { // Grad infershape Tensor out = empty({}, dtype, paddle::Place()); @@ -199,12 +199,12 @@ Tensor sum(Tensor x, "Out", {std::static_pointer_cast(out.impl())->Name()}); op->CheckAttrs(); op->InferVarType(block); - // TODO(jiabin): This may have runtime shape skip infershape for now. + // TODO(jiabin, cxxly): This may have runtime shape skip infershape for now. return out; } template <> -Tensor reshape(Tensor x, paddle::experimental::IntArray shape) { +Tensor reshape(const Tensor& x, const IntArray& shape) { // Grad infershape Tensor out = empty({}, x.dtype(), paddle::Place()); framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock(); @@ -222,7 +222,23 @@ Tensor reshape(Tensor x, paddle::experimental::IntArray shape) { "Out", {std::static_pointer_cast(out.impl())->Name()}); op->CheckAttrs(); op->InferVarType(block); - // TODO(jiabin): This may have runtime shape skip infershape for now. + // TODO(jiabin, cxxly): This may have runtime shape skip infershape for now. 
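+ // NOTE: each static prim api in this file follows the same recipe: create
+ // an empty output DescTensor, append the matching OpDesc to the current
+ // block, wire up the input/output variable names, then run CheckAttrs and
+ // InferVarType (InferShape is invoked only where the shape is static).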
+ return out; +} + +template <> +Tensor exp(const Tensor& x) { + Tensor out = empty({}, phi::DataType::FLOAT32, paddle::Place()); + framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock(); + framework::OpDesc* op = block->AppendOp(); + op->SetType("exp"); + op->SetInput("X", + {std::static_pointer_cast(x.impl())->Name()}); + op->SetOutput( + "Out", {std::static_pointer_cast(out.impl())->Name()}); + op->CheckAttrs(); + op->InferVarType(block); + op->InferShape(*block); return out; } } // namespace prim diff --git a/paddle/fluid/prim/api/manual/CMakeLists.txt b/paddle/fluid/prim/api/manual/CMakeLists.txt index 261f6dd486302c71ee7dbfbf2b4dd0d66ca81772..512d2b1553c8c94a06445f3c59c4b77d10d74032 100644 --- a/paddle/fluid/prim/api/manual/CMakeLists.txt +++ b/paddle/fluid/prim/api/manual/CMakeLists.txt @@ -1,2 +1 @@ -add_subdirectory(prim_api) add_subdirectory(utils) diff --git a/paddle/fluid/prim/api/manual/backward/composite_backward_api.h b/paddle/fluid/prim/api/manual/backward/composite_backward_api.h index fc276842b815ec8f1e8f36b3d9e4d8e73737ee73..31e09b34f16df11e82541d99795f07d5073aa538 100644 --- a/paddle/fluid/prim/api/manual/backward/composite_backward_api.h +++ b/paddle/fluid/prim/api/manual/backward/composite_backward_api.h @@ -49,7 +49,7 @@ void subtract_grad(const Tensor& x, sum(scale_out_grad, phi::vectorize(reduce_dim), y.dtype(), false); auto dy_tmp = reshape(dy_reduce_res, phi::vectorize(y.dims())); set_output(dy_tmp, dy); - // dy->set_impl(dy_tmp.impl()); + } else { by_pass(scale_out_grad, dy); } @@ -62,7 +62,6 @@ void subtract_grad(const Tensor& x, sum(out_grad, phi::vectorize(reduce_dim), x.dtype(), false); auto dx_tmp = reshape(dx_reduce_res, phi::vectorize(x.dims())); set_output(dx_tmp, dx); - // dx->set_impl(dx_tmp.impl()); } else { by_pass(out_grad, dx); } @@ -84,7 +83,6 @@ void add_grad(const Tensor& x, sum(out_grad, phi::vectorize(reduce_dim), y.dtype(), false); auto dy_tmp = reshape(dy_reduce_res, phi::vectorize(y.dims())); set_output(dy_tmp, dy); - // dy->set_impl(dy_tmp.impl()); } else { by_pass(out_grad, dy); } @@ -97,7 +95,6 @@ void add_grad(const Tensor& x, sum(out_grad, phi::vectorize(reduce_dim), x.dtype(), false); auto dx_tmp = reshape(dx_reduce_res, phi::vectorize(x.dims())); set_output(dx_tmp, dx); - // dx->set_impl(dx_tmp.impl()); } else { by_pass(out_grad, dx); } @@ -139,7 +136,6 @@ void sum_grad(const Tensor& x, x_grad_tmp = expand(out_grad, x_dim); } set_output(x_grad_tmp, x_grad); - // x_grad->set_impl(x_grad_tmp.impl()); } template @@ -163,10 +159,8 @@ void divide_grad(const Tensor& x, sum(dy_res, phi::vectorize(reduce_dim), y.dtype(), false); auto dy_tmp = reshape(dy_reduce_res, phi::vectorize(y.dims())); set_output(dy_tmp, dy); - // dy->set_impl(dy_tmp.impl()); } else { set_output(dy_res, dy); - // dy->set_impl(dy_res.impl()); } } // indicate we will compute dy if (dx) { @@ -181,10 +175,8 @@ void divide_grad(const Tensor& x, sum(dx_res, phi::vectorize(reduce_dim), x.dtype(), false); auto dx_tmp = reshape(dx_reduce_res, phi::vectorize(x.dims())); set_output(dx_tmp, dx); - // dx->set_impl(dx_tmp.impl()); } else { set_output(dx_res, dx); - // dx->set_impl(dx_res.impl()); } } // indicate we will compute dx } @@ -196,7 +188,6 @@ void sqrt_grad(const Tensor& out, const Tensor& out_grad, Tensor* x_grad) { auto tmp = divide(div_x, out); auto x_grad_tmp = multiply(out_grad, tmp); set_output(x_grad_tmp, x_grad); - // x_grad->set_impl(x_grad_tmp.impl()); } } } // namespace prim diff --git 
a/paddle/fluid/prim/api/manual/prim_api/eager_prim_api.cc b/paddle/fluid/prim/api/manual/prim_api/eager_prim_api.cc deleted file mode 100644 index 7dac02ea5b203e45adf5166602d6b41d3752194f..0000000000000000000000000000000000000000 --- a/paddle/fluid/prim/api/manual/prim_api/eager_prim_api.cc +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/eager/api/all.h" -#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" -#include "paddle/fluid/prim/api/manual/prim_api/prim_api.h" -#include "paddle/phi/capi/include/wrapper_base.h" -namespace paddle { -namespace prim { -template <> -Tensor pow(const Tensor& x, const paddle::experimental::Scalar& y) { - return ::pow_ad_func(x, y); -} - -template <> -Tensor scale(const Tensor& x, - const paddle::experimental::Scalar& scale, - float bias, - bool bias_after_scale) { - return ::scale_ad_func(x, scale, bias, bias_after_scale); -} - -template <> -Tensor multiply(const Tensor& x, const Tensor& y) { - return ::multiply_ad_func(x, y); -} - -template <> -Tensor expand(const Tensor& x, const IntArray& shape) { - return ::expand_ad_func(x, shape); -} - -template <> -Tensor unsqueeze(const Tensor& x, const IntArray& axis) { - return ::unsqueeze_ad_func(x, axis); -} - -template <> -Tensor divide(const Tensor& x, const Tensor& y) { - return ::divide_ad_func(x, y); -} - -template <> -Tensor full(paddle::experimental::IntArray shape, - paddle::experimental::Scalar value, - paddle::experimental::DataType dtype, - paddle::platform::Place place) { - return ::full_ad_func(shape, value, dtype, place); -} -template <> -Tensor sum(Tensor x, IntArray axis, DataType dtype, bool keepdim) { - return ::sum_ad_func(x, axis, dtype, keepdim); -} - -template <> -Tensor reshape(Tensor x, IntArray shape) { - return ::reshape_ad_func(x, shape); -} -} // namespace prim -} // namespace paddle diff --git a/paddle/fluid/prim/api/manual/prim_api/prim_api.h b/paddle/fluid/prim/api/manual/prim_api/prim_api.h index 5465cdb601e9557be56ddd8efd5640ae95abbc19..65d411d86307ded238a4bc07e6336659663ca406 100644 --- a/paddle/fluid/prim/api/manual/prim_api/prim_api.h +++ b/paddle/fluid/prim/api/manual/prim_api/prim_api.h @@ -12,50 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+// prim api which can't be generated #pragma once + +#include "paddle/phi/common/data_type.h" #include "paddle/phi/common/int_array.h" +#include "paddle/phi/common/place.h" #include "paddle/phi/common/scalar.h" #include "paddle/utils/optional.h" -namespace paddle { -namespace prim { -using Tensor = paddle::experimental::Tensor; -using IntArray = paddle::experimental::IntArray; -using Scalar = paddle::experimental::Scalar; - -template -Tensor pow(const Tensor& x, const Scalar& y); - -template -Tensor scale(const Tensor& X, - const Scalar& scale, - float bias, - bool bias_after_scale); - -template -Tensor multiply(const Tensor& x, const Tensor& y); - -template -Tensor expand(const Tensor& x, const IntArray& shape); -template -Tensor unsqueeze(const Tensor& x, const IntArray& axis); - -template -Tensor divide(const Tensor& x, const Tensor& y); - -template -Tensor full(IntArray shape, - Scalar value, - DataType dtype = DataType::FLOAT32, - Place place = CPUPlace()); - -template -Tensor sum(Tensor x, - IntArray axis = {}, - DataType dtype = DataType::UNDEFINED, - bool keepdim = false); - -template -Tensor reshape(Tensor x, IntArray shape); -} // namespace prim +namespace paddle { +namespace prim {} // namespace prim } // namespace paddle diff --git a/paddle/fluid/prim/api/manual/utils/utils.h b/paddle/fluid/prim/api/manual/utils/utils.h index 69d879e37b0b2dd3271967cb91acb322f354af1a..20b02f2df9c79235d645f13a2a3cce8f8ff08d67 100644 --- a/paddle/fluid/prim/api/manual/utils/utils.h +++ b/paddle/fluid/prim/api/manual/utils/utils.h @@ -16,11 +16,12 @@ #include #include #include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/operators/common_infer_shape_functions.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/ddim.h" -using IntArray = paddle::experimental::IntArray; + namespace paddle { namespace prim { // We put some api like utils here @@ -42,42 +43,40 @@ void set_output(const paddle::experimental::Tensor& x_tmp, paddle::experimental::Tensor* x); // These method don't need to be specified -static phi::DDim get_reduce_dims(const phi::DDim& x_dims, - const phi::DDim& y_dims) { +static phi::DDim get_reduce_dims_from_out(const phi::DDim& dout_dims, + const phi::DDim& in_dims) { std::vector result; - PADDLE_ENFORCE_GE(phi::product(x_dims), - phi::product(y_dims), - phi::errors::InvalidArgument( - "Only x_dims >= y_dims is accepted for " - "get_reduce_dims, but we got x_dims: %s, y_dims: %s", - x_dims, - y_dims)); - int bat = x_dims.size() - y_dims.size(); + int bat = dout_dims.size() - in_dims.size(); for (int i = 0; i < bat; ++i) { result.push_back(i); } - for (int i = 0; i < y_dims.size(); ++i) { - if (y_dims[i] == 1) { + for (int i = 0; i < in_dims.size(); ++i) { + if (in_dims[i] == 1) { result.push_back(i + bat); } else { PADDLE_ENFORCE_EQ( - y_dims[i], - x_dims[i + bat], + in_dims[i], + dout_dims[i + bat], platform::errors::InvalidArgument( "ReduceDims dimension mismatch. Operands could " - "not be broadcast together with the shape of x_dims = [%s] and " - "the shape of y_dims = [%s]. Received [%d] in X is not equal to " + "not be broadcast together with the shape of dout = [%s] and " + "the shape of in_dims = [%s]. 
Received [%d] in X is not equal to " "[%d] in Y at i:%d.", - x_dims, - y_dims, - x_dims[i + bat], - y_dims[i], + dout_dims, + in_dims, + dout_dims[i + bat], + in_dims[i], i)); } } - auto res_dims = phi::make_ddim(result); - VLOG(4) << "Reduce Dims is: " << res_dims; - return res_dims; + return phi::make_ddim(result); +} + +static phi::DDim get_reduce_dims(const phi::DDim& x_dims, + const phi::DDim& y_dims) { + auto out_dims = paddle::operators::details::BroadcastTwoDims(x_dims, y_dims); + return get_reduce_dims_from_out(out_dims, x_dims); } + } // namespace prim } // namespace paddle diff --git a/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h b/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h index e053d1465e70d53a40920b73b9e7a959eef1b8dc..c2e7ca4ec57e2b11f6ce76a549408bcabdfbd1be 100644 --- a/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h +++ b/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h @@ -477,6 +477,9 @@ class GradCompositeOpMakerBase { void RecoverOutputName(const paddle::experimental::Tensor& output, const std::string& origin_name) { if (origin_name == framework::kEmptyVarName) return; + VLOG(4) << "Recover: " + << static_cast(output.impl().get())->Name() + << " To: " << origin_name; prim::StaticCompositeContext::Instance().GetBlock()->RenameVar( static_cast(output.impl().get())->Name(), origin_name); diff --git a/paddle/fluid/pybind/eager_legacy_custom_python_api.h b/paddle/fluid/pybind/eager_legacy_custom_python_api.h index c599346bdb7a89d2f0032cf0e4ecfe3da998026b..1deb20fbf9b88bf283c5f1af5ddecd95473fb541 100644 --- a/paddle/fluid/pybind/eager_legacy_custom_python_api.h +++ b/paddle/fluid/pybind/eager_legacy_custom_python_api.h @@ -26,9 +26,9 @@ static PyObject *eager_api_run_program(PyObject *self, PyObject *kwargs) { PyThreadState *tstate = nullptr; try { - auto X = GetTensorListFromArgs("run_program", "X", args, 0, false); + auto X = GetTensorListFromArgs("run_program", "X", args, 0, true); auto Params = GetTensorListFromArgs("run_program", "Params", args, 1, true); - auto Out = GetTensorPtrListFromArgs("run_program", "Out", args, 2, false); + auto Out = GetTensorPtrListFromArgs("run_program", "Out", args, 2, true); auto OutScope = GetScopePtrListFromArgs("run_program", "OutScope", args, 3, false); auto DOut = GetTensorPtrListFromArgs("run_program", "DOut", args, 4, true); diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 4b9f2c295e9528ca4a9d9b5b6134668d7cb7ce2f..34132e199e7326059dddd3970ab571ccc52dd22d 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -1503,7 +1503,7 @@ static PyObject* tensor_method_set_string_list(TensorObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY - using Strings = std::vector; + using Strings = paddle::framework::Strings; auto strings = CastPyArg2VectorOfString(PyTuple_GET_ITEM(args, 0), 0); auto var_tensor = std::make_shared(); *var_tensor->GetMutable() = strings; diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index a86271dfbf532eb5d1a7d8e2ea932586e4cc3608..c328b1258cdeb7b49b486185fbbf5fb1c54e7163 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -184,39 +184,41 @@ PyObject* tensor_properties_get_shape(TensorObject* self, void* closure) { value[i] = ddim[i]; } } - - auto desired_layout = - paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout(); - auto default_layout = - 
paddle::imperative::LayoutAutoTune::Instance().GetDefaultLayout(); - bool change_dim = - (desired_layout != default_layout && - self->tensor.layout() == desired_layout && value.size() == 4); - VLOG(6) << "eager_properties 'Shape' method, layout autotune " - << " desired_layout: " << desired_layout - << " default_layout: " << default_layout - << " tensor layout: " << self->tensor.layout() - << " tensor's shape size is : " << value.size(); - std::vector dims = value; - if (change_dim && phi::DataLayoutToString(desired_layout) == "NCHW") { - // NCHW -> NHWC - VLOG(6) << "layout autotune get Shape from NCHW -> NHWC " << value[0] << " " - << value[1] << " " << value[2] << " " << value[3] << " to " - << dims[0] << " " << dims[2] << " " << dims[3] << " " << dims[1]; - value[0] = dims[0]; - value[1] = dims[2]; - value[2] = dims[3]; - value[3] = dims[1]; - } else if (change_dim && phi::DataLayoutToString(desired_layout) == "NHWC") { - // NHWC -> NCHW - VLOG(6) << "layout autotune get Shape from NHWC -> NCHW " << value[0] << " " - << value[1] << " " << value[2] << " " << value[3] << " to " - << dims[0] << " " << dims[3] << " " << dims[1] << " " << dims[2] - << " " << dims[1]; - value[0] = dims[0]; - value[1] = dims[3]; - value[2] = dims[1]; - value[3] = dims[2]; + if (!egr::IsVariableCompatTensor(self->tensor)) { + auto desired_layout = + paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout(); + auto default_layout = + paddle::imperative::LayoutAutoTune::Instance().GetDefaultLayout(); + bool change_dim = + (desired_layout != default_layout && + self->tensor.layout() == desired_layout && value.size() == 4); + VLOG(6) << "eager_properties 'Shape' method, layout autotune " + << " desired_layout: " << desired_layout + << " default_layout: " << default_layout + << " tensor layout: " << self->tensor.layout() + << " tensor's shape size is : " << value.size(); + std::vector dims = value; + if (change_dim && phi::DataLayoutToString(desired_layout) == "NCHW") { + // NCHW -> NHWC + VLOG(6) << "layout autotune get Shape from NCHW -> NHWC " << value[0] + << " " << value[1] << " " << value[2] << " " << value[3] << " to " + << dims[0] << " " << dims[2] << " " << dims[3] << " " << dims[1]; + value[0] = dims[0]; + value[1] = dims[2]; + value[2] = dims[3]; + value[3] = dims[1]; + } else if (change_dim && + phi::DataLayoutToString(desired_layout) == "NHWC") { + // NHWC -> NCHW + VLOG(6) << "layout autotune get Shape from NHWC -> NCHW " << value[0] + << " " << value[1] << " " << value[2] << " " << value[3] << " to " + << dims[0] << " " << dims[3] << " " << dims[1] << " " << dims[2] + << " " << dims[1]; + value[0] = dims[0]; + value[1] = dims[3]; + value[2] = dims[1]; + value[3] = dims[2]; + } } return ToPyObject(value); diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 9262fec62b989d3643904cc33658a68f795c5f74..e01044720571d088899a8e60b02b6bd8bb5304c1 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -624,6 +624,11 @@ void BindImperative(py::module *m_ptr) { m.def("_cleanup_mmap_fds", []() { memory::allocation::MemoryMapFdSet::Instance().Clear(); }); + + m.def("_set_max_memory_map_allocation_pool_size", [](int32_t size) { + memory::allocation::MemoryMapAllocationPool::Instance().SetMaxPoolSize( + size); + }); #endif m.def("start_imperative_gperf_profiler", diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index c7bc5722e3804d49bc7d89210b445f456f4fc4ba..43ee2d479b0b76b0d6851fe2c1b58e06e977fb76 100644 --- 
a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -970,7 +970,7 @@ All parameter, weight, gradient are variables in Paddle. } }) .def("set_string_list", - [](Variable &self, Strings str_list) { + [](Variable &self, std::vector str_list) { *self.GetMutable() = str_list; }) .def("set_vocab", @@ -1926,7 +1926,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("set_feed_variable", static_cast &, const std::string &, size_t)>(&framework::SetFeedVariable)); m.def("get_fetch_variable", diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc index 8739b32965b0dea4e163091f6cbed321ba9590d4..4bdde24f431bc830171ea4d3c02d1064da67926b 100644 --- a/paddle/fluid/pybind/tensor.cc +++ b/paddle/fluid/pybind/tensor.cc @@ -182,6 +182,7 @@ limitations under the License. */ #include "pybind11/stl.h" DECLARE_bool(use_mkldnn); +DECLARE_bool(use_shm_cache); // disable auto conversion to list in Python PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); @@ -910,9 +911,16 @@ void BindTensor(pybind11::module &m) { // NOLINT int flags = memory::allocation::MAPPED_SHAREDMEM | memory::allocation::MAPPED_EXCLUSIVE; std::string handle = memory::allocation::GetIPCName(); + int find_id = -1; + if (FLAGS_use_shm_cache) { + find_id = memory::allocation::MemoryMapAllocationPool::Instance().FindFromCache(flags, data_size); // NOLINT + } + if (find_id != -1) { + handle = memory::allocation::MemoryMapAllocationPool::Instance().GetById(find_id).file_name_; // NOLINT + } auto shared_holder = memory::allocation::AllocateRefcountedMemoryMapAllocation( - handle, flags, data_size); + handle, flags, data_size, find_id); // copy data & reset holder if (platform::is_cuda_pinned_place(holder->place())) { @@ -961,10 +969,13 @@ void BindTensor(pybind11::module &m) { // NOLINT size_t size = t[1].cast(); int flags = memory::allocation::MAPPED_SHAREDMEM | memory::allocation::MAPPED_NOCREATE; - + int find_id = -1; + if (FLAGS_use_shm_cache) { + find_id = memory::allocation::MemoryMapAllocationPool::Instance().FindFromCache(flags, size, ipc_name, /*check_refcount*/ false); // NOLINT + } auto shared_holder = memory::allocation::AllocateRefcountedMemoryMapAllocation( - ipc_name, flags, size); + ipc_name, flags, size, find_id); // 3. Rebuild Tensor tensor.ResetHolderWithType( diff --git a/paddle/phi/api/lib/kernel_dispatch.cc b/paddle/phi/api/lib/kernel_dispatch.cc index 074da80bbfb6e9d09eb2f5c87094be103bd68c27..73569b387312c7278fd9840dfb362cf752012c0a 100644 --- a/paddle/phi/api/lib/kernel_dispatch.cc +++ b/paddle/phi/api/lib/kernel_dispatch.cc @@ -22,6 +22,9 @@ limitations under the License. 
*/ #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/string_tensor_utils.h" #include "paddle/phi/core/tensor_utils.h" +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/phi/backends/device_manager.h" +#endif namespace paddle { namespace experimental { @@ -54,6 +57,11 @@ bool HasAllocation(const phi::TensorBase& t) { BackendSet GetTensorBackendSet(const phi::TensorBase& t) { if (HasAllocation(t) && t.place().GetType() != AllocationType::UNDEFINED) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (t.place().GetType() == AllocationType::CUSTOM) { + phi::DeviceManager::SetDevice(t.place()); + } +#endif phi::Backend backend_key = phi::TransToPhiBackend(t.place()); BackendSet backend_set(backend_key); if (backend_key == Backend::GPU && phi::DenseTensor::classof(&t) && diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index 277ef04c6888b91a83690b4f2cc2b2e9d390d2b9..23158d794019f6313762e1fcebb9aa09af204104 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -431,6 +431,7 @@ kernel : func : exp_grad inplace : (out_grad -> x_grad) + composite : exp_grad(out, out_grad, x_grad) - backward_op : expm1_grad forward : expm1 (Tensor x) -> Tensor(out) @@ -1085,6 +1086,30 @@ func : selu_grad data_type : out +- backward_op : send_u_recv_grad + forward : send_u_recv (Tensor x, Tensor src_index, Tensor dst_index, str reduce_op = "SUM", IntArray out_size = {0}) -> Tensor(out), Tensor(dst_count) + args : (Tensor x, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str reduce_op = "SUM") + output : Tensor(x_grad) + infer_meta : + func : GeneralUnaryGradInferMeta + param : [x] + kernel : + func : send_u_recv_grad + data_type : out_grad + optional: out, dst_count + +- backward_op : send_ue_recv_grad + forward : send_ue_recv (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op="ADD", str reduce_op="SUM", IntArray out_size={0}) -> Tensor(out), Tensor(dst_count) + args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str message_op, str reduce_op) + output : Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, y] + kernel : + func : send_ue_recv_grad + data_type : out_grad + optional: out, dst_count + - backward_op : send_uv_grad forward : send_uv (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op = "ADD") -> Tensor(out) args: (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, Tensor out_grad, str message_op = "ADD") @@ -1191,6 +1216,17 @@ func : sinh_grad inplace : (out_grad -> x_grad) +- backward_op : slogdet_grad + forward : slogdet (Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : GeneralUnaryGradInferMeta + param : [x] + kernel : + func : slogdet_grad + data_type : out_grad + - backward_op : softplus_grad forward : softplus (Tensor x, float beta, float threshold) -> Tensor(out) args : (Tensor x, Tensor out_grad, float beta, float threshold) diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 3ef309656735de646ffdda08ddc0d9489bc20ab4..f47e206c7ce2fe2742529382ef18092f92571cde 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -316,9 +316,14 @@ - backward_op : cumsum_grad forward : cumsum(Tensor x, Scalar axis, bool flatten, bool exclusive, bool reverse) -> Tensor(out) - args : 
(Tensor out_grad, Scalar axis, bool flatten, bool exclusive, bool reverse) + args : (Tensor x, Tensor out_grad, Scalar axis, bool flatten, bool exclusive, bool reverse) output : Tensor(x_grad) - invoke : cumsum(out_grad, axis, flatten, exclusive, !reverse) + infer_meta : + func : UnchangedInferMeta + param: [x] + kernel : + func : cumsum_grad + data_type: x - backward_op : deformable_conv_grad forward : deformable_conv(Tensor x, Tensor offset, Tensor filter, Tensor mask, int[] strides, int[] paddings, int[] dilations, int deformable_groups, int groups, int im2col_step) -> Tensor(out) @@ -475,6 +480,7 @@ func : expand_grad no_need_buffer : x backward : expand_double_grad + composite: expand_grad(x, out_grad, shape, x_grad_p) - backward_op : exponential__grad forward : exponential_ (Tensor x, float lam) -> Tensor(out) @@ -880,6 +886,7 @@ param : [x, y] kernel : func : multiply_grad + composite: multiply_grad(x, y, out_grad, axis, x_grad, y_grad) backward : multiply_double_grad - backward_op : multiply_triple_grad @@ -1201,30 +1208,6 @@ data_type : x optional : summed_ids -- backward_op : send_u_recv_grad - forward : send_u_recv (Tensor x, Tensor src_index, Tensor dst_index, str reduce_op = "SUM", IntArray out_size = {0}) -> Tensor(out), Tensor(dst_count) - args : (Tensor x, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str reduce_op = "SUM") - output : Tensor(x_grad) - infer_meta : - func : GeneralUnaryGradInferMeta - param : [x] - kernel : - func : send_u_recv_grad - data_type : out_grad - optional: out, dst_count - -- backward_op : send_ue_recv_grad - forward : send_ue_recv (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op, str reduce_op, IntArray out_size) -> Tensor(out), Tensor(dst_count) - args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str message_op, str reduce_op) - output : Tensor(x_grad), Tensor(y_grad) - infer_meta : - func : GeneralBinaryGradInferMeta - param : [x, y] - kernel : - func : send_ue_recv_grad - data_type : out_grad - optional: out, dst_count - - backward_op : sigmoid_cross_entropy_with_logits_grad forward : sigmoid_cross_entropy_with_logits (Tensor x, Tensor label, bool normalize, int ignore_index) -> Tensor(out) args : (Tensor x, Tensor label, Tensor out_grad, bool normalize, int ignore_index) @@ -1260,16 +1243,6 @@ backward : slice_double_grad no_need_buffer : input -- backward_op : slogdet_grad - forward : slogdet (Tensor x) -> Tensor(out) - args : (Tensor x, Tensor out, Tensor out_grad) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : slogdet_grad - - backward_op : softmax_grad forward : softmax (Tensor x, int axis) -> Tensor(out) args : (Tensor out, Tensor out_grad, int axis) diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index 52db798aecc8f97026417d21ada3ca9345415ec6..049d86473cfc5b5c2c4aea411d783914ced2d9c5 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -1298,16 +1298,6 @@ kernel : func : not_equal -- op : numel - args : (Tensor x) - output : Tensor(size) - infer_meta : - func : NumelInferMeta - kernel : - func : numel - data_transform: - skip_transform : x - - op : one_hot args : (Tensor x, Scalar(int) num_classes) output : Tensor(out) @@ -1588,28 +1578,6 @@ data_type : x backward : segment_pool_grad -- op : send_u_recv - args : (Tensor x, Tensor src_index, Tensor dst_index, str reduce_op = "SUM", 
IntArray out_size = {0}) - output : Tensor(out), Tensor(dst_count) - infer_meta : - func : SendURecvInferMeta - kernel : - func : send_u_recv - data_type : x - intermediate : dst_count - backward : send_u_recv_grad - -- op : send_ue_recv - args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op, str reduce_op, IntArray out_size) - output : Tensor(out), Tensor(dst_count) - infer_meta : - func : SendUERecvInferMeta - kernel : - func : send_ue_recv - data_type : x - intermediate : dst_count - backward : send_ue_recv_grad - - op : sgd_ args : (Tensor param, Tensor learning_rate, Tensor grad, Tensor master_param, bool multi_precision) output : Tensor(param_out), Tensor(master_param_out) @@ -1663,15 +1631,6 @@ func : slice backward : slice_grad -- op : slogdet - args : (Tensor x) - output : Tensor - infer_meta : - func : UnchangedInferMeta - kernel : - func : slogdet - backward : slogdet_grad - - op : softmax args : (Tensor x, int axis) output : Tensor(out) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index c2b22ba7af5a341f7e436facedc63e62d925de00..1fc4144849b2ac048488f9cfb284ef3970d6a774 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -993,6 +993,12 @@ outputs : {out : Out, total_weight : Total_weight} +- op : numel(size) + inputs : + x : Input + outputs : + size : Out + - op : overlap_add backward : overlap_add_grad inputs : @@ -1215,6 +1221,28 @@ outputs : out : Out +- op : send_u_recv(graph_send_recv) + backward : send_u_recv_grad(graph_send_recv_grad) + inputs : + {x : X, src_index : Src_index, dst_index : Dst_index} + outputs : + {out : Out, dst_count : Dst_count} + int_array : + out_size: + data_type : int64_t + tensor_name : Out_size + +- op : send_ue_recv(graph_send_ue_recv) + backward : send_ue_recv_grad(graph_send_ue_recv_grad) + inputs : + {x : X, y : Y, src_index : Src_index, dst_index : Dst_index} + outputs : + {out : Out, dst_count : Dst_count} + int_array : + out_size: + data_type : int64_t + tensor_name : Out_size + - op : send_uv (graph_send_uv) backward : send_uv_grad (graph_send_uv_grad) @@ -1286,6 +1314,13 @@ extra : attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] +- op : slogdet(slogdeterminant) + backward : slogdet_grad(slogdeterminant_grad) + inputs : + x : Input + outputs : + out : Out + - op : softmax backward : softmax_grad inputs : diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 8ea815a28d46550180f2bf46518d7b787980cc37..df606ebec0734ea1220bb53ba501624bba94aba0 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -871,6 +871,18 @@ kernel : func : npu_identity +- op : numel + args : (Tensor x) + output : Tensor(size) + infer_meta : + func : NumelInferMeta + kernel : + func : numel + data_type : x + data_transform: + skip_transform : x + no_need_buffer : x + - op : overlap_add args: (Tensor x, int hop_length, int axis=-1) output: Tensor @@ -1029,6 +1041,28 @@ func : selu backward : selu_grad +- op : send_u_recv + args : (Tensor x, Tensor src_index, Tensor dst_index, str reduce_op = "SUM", IntArray out_size = {0}) + output : Tensor(out), Tensor(dst_count) + infer_meta : + func : SendURecvInferMeta + kernel : + func : send_u_recv + data_type : x + intermediate : dst_count + backward : send_u_recv_grad + +- op : send_ue_recv + args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op="ADD", str reduce_op="SUM", IntArray out_size={0}) + output : Tensor(out), 
Tensor(dst_count) + infer_meta : + func : SendUERecvInferMeta + kernel : + func : send_ue_recv + data_type : x + intermediate : dst_count + backward : send_ue_recv_grad + - op : send_uv args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op = "ADD") output : Tensor(out) @@ -1083,6 +1117,15 @@ func : sinh backward : sinh_grad +- op : slogdet + args : (Tensor x) + output : Tensor + infer_meta : + func : UnchangedInferMeta + kernel : + func : slogdet + backward : slogdet_grad + - op : softplus args : (Tensor x, float beta = 1.0, float threshold = 20.0f) output : Tensor diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc index f7888be82b309a2da760661d7020f715fe7e4d63..367231972acbcf6a504ab8dc36b20e5763cf9b0b 100644 --- a/paddle/phi/backends/xpu/xpu2_op_list.cc +++ b/paddle/phi/backends/xpu/xpu2_op_list.cc @@ -360,6 +360,7 @@ XPUOpMap& get_kl2_ops() { {"log_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"log_softmax", XPUKernelSet({phi::DataType::FLOAT32})}, {"log_softmax_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"logical_not", XPUKernelSet({phi::DataType::BOOL})}, {"lookup_table_v2_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"lookup_table_v2", XPUKernelSet({phi::DataType::FLOAT32})}, {"masked_select", diff --git a/paddle/phi/backends/xpu/xpu_context.cc b/paddle/phi/backends/xpu/xpu_context.cc index 4065306abc7989990626b6265025aaf4fe2d2763..7b864198129f9a71feac9b03b3bb083c5e3422ef 100644 --- a/paddle/phi/backends/xpu/xpu_context.cc +++ b/paddle/phi/backends/xpu/xpu_context.cc @@ -61,11 +61,13 @@ struct XPUContext::Impl { ~Impl() { if (owned_ && context_ != nullptr) { backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId()); - // manually destroy XPUStream here until xpu::api integrates this work - // into Context dtor xpu_wait(context_->xpu_stream); - xpu_stream_destroy(context_->xpu_stream); - context_->xpu_stream = nullptr; + if (context_->xpu_stream) { + // manually destroy XPUStream here until xpu::api integrates this work + // into Context dtor + xpu_stream_destroy(context_->xpu_stream); + context_->xpu_stream = nullptr; + } xpu::destroy_context(context_); context_ = nullptr; } @@ -73,11 +75,7 @@ struct XPUContext::Impl { const Place& GetPlace() const { return place_; } - XPUStream stream() const { - auto s = context_->xpu_stream; - PD_CHECK(s != nullptr, "the xpu stream is nullptr."); - return s; - } + XPUStream stream() const { return context_->xpu_stream; } xpu::Context* GetXContext() const { PD_CHECK(context_ != nullptr, "the xpu context is nullptr."); @@ -103,13 +101,20 @@ struct XPUContext::Impl { context_ = xpu::create_context(); xpu_version_ = backends::xpu::get_xpu_version(place_.device); SetL3Cache(); - PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_create(&context_->xpu_stream)); } void SetXContext(xpu::Context* context) { context_ = context; } void SetBkclContext(xpu::BKCLContext_t context) { bkcl_context_ = context; } + void CreateStream() { + if (context_->xpu_stream) { + VLOG(3) << "xpu stream is already created for current context"; + return; + } + PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_create(&context_->xpu_stream)); + } + bool owned_{false}; Place place_; backends::xpu::XPUVersion xpu_version_; @@ -153,6 +158,8 @@ void XPUContext::SetBkclContext(xpu::BKCLContext_t context) { impl_->SetBkclContext(context); } +void XPUContext::CreateStream() { impl_->CreateStream(); } + void XPUContext::Init() { impl_->Init(); } } // namespace phi diff --git a/paddle/phi/backends/xpu/xpu_context.h 
b/paddle/phi/backends/xpu/xpu_context.h index 1c12c7e5fe69a490b19e21ed8a1646422593a433..731a3e16c42621b673c7cef8da3166209cc60150 100644 --- a/paddle/phi/backends/xpu/xpu_context.h +++ b/paddle/phi/backends/xpu/xpu_context.h @@ -46,6 +46,7 @@ class XPUContext : public DeviceContext, // Return bkcl context. xpu::BKCLContext_t bkcl_context() const; void SetBkclContext(xpu::BKCLContext_t context); + void CreateStream(); // Wait for all operations completion in the stream. void Wait() const override; diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index 2ddafdac520eb72b6dee5a0ebb161b78b51b0813..cd9c24436da97eadd772100e99f71da441317d9e 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -134,7 +134,9 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) { default: { size_t device_type_id_ = static_cast(backend) - static_cast(Backend::NUM_BACKENDS); - std::string device_type = phi::GetGlobalDeviceType(device_type_id_); + std::string device_type = + phi::CustomRegisteredDeviceMap::Instance().GetGlobalDeviceType( + device_type_id_); if (!device_type.empty()) { os << device_type; } else { @@ -178,7 +180,8 @@ inline Backend StringToBackend(const char* backend_cstr) { return Backend::IPU; } else { return static_cast(static_cast(Backend::NUM_BACKENDS) + - phi::GetOrRegisterGlobalDeviceTypeId(s)); + phi::CustomRegisteredDeviceMap::Instance() + .GetOrRegisterGlobalDeviceTypeId(s)); } } @@ -207,7 +210,9 @@ inline std::string BackendToString(const Backend& backend) { default: size_t device_type_id_ = static_cast(backend) - static_cast(Backend::NUM_BACKENDS); - std::string device_type = phi::GetGlobalDeviceType(device_type_id_); + std::string device_type = + phi::CustomRegisteredDeviceMap::Instance().GetGlobalDeviceType( + device_type_id_); if (!device_type.empty()) { return device_type; } else { diff --git a/paddle/phi/common/place.cc b/paddle/phi/common/place.cc index d2719f4a0732a4b08b4352bf15bab913703e8a0e..30346d8727f64e3ed0ca8fed10b1b5c194d144c0 100644 --- a/paddle/phi/common/place.cc +++ b/paddle/phi/common/place.cc @@ -16,7 +16,6 @@ limitations under the License. 
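Taken together, the backend.h hunks above and the place.cc/place.h hunks that follow replace two file-static maps and their free functions (GetOrRegisterGlobalDeviceTypeId, GetGlobalDeviceType) with a CustomRegisteredDeviceMap singleton, so the custom-device registry has a single owner across translation units. A minimal Python sketch of the same registry pattern (illustrative only; the names below are not Paddle API):

class DeviceTypeRegistry:
    _instance = None

    @classmethod
    def instance(cls):
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def __init__(self):
        self._type_to_id = {}
        self._id_to_type = {}

    def get_or_register(self, device_type):
        # id 0 is reserved for "no custom device type"
        if not device_type:
            return 0
        if device_type not in self._type_to_id:
            type_id = len(self._type_to_id) + 1
            self._type_to_id[device_type] = type_id
            self._id_to_type[type_id] = device_type
        return self._type_to_id[device_type]

    def device_type(self, type_id):
        return self._id_to_type.get(type_id, "")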
*/ #include #include -#include #include "glog/logging.h" #include "paddle/phi/api/ext/exception.h" @@ -54,7 +53,8 @@ std::string Place::DebugString() const { std::ostringstream os; os << "Place("; if (alloc_type_ == AllocationType::CUSTOM) { - os << GetGlobalDeviceType(device_type_id_); + os << phi::CustomRegisteredDeviceMap::Instance().GetGlobalDeviceType( + device_type_id_); } else { os << AllocationTypeStr(alloc_type_); } @@ -85,25 +85,29 @@ Place GetPinnedPlace(const Place &place) { } } -static std::unordered_map global_registered_device_type_id; -static std::unordered_map global_registered_device_type; +CustomRegisteredDeviceMap &CustomRegisteredDeviceMap::Instance() { + static CustomRegisteredDeviceMap g_custom_registered_device_map; + return g_custom_registered_device_map; +} -size_t GetOrRegisterGlobalDeviceTypeId(const std::string &device_type) { +size_t CustomRegisteredDeviceMap::GetOrRegisterGlobalDeviceTypeId( + const std::string &device_type) { if (device_type.empty()) return 0; - if (global_registered_device_type_id.find(device_type) == - global_registered_device_type_id.end()) { - size_t device_type_id = global_registered_device_type_id.size() + 1; - global_registered_device_type_id[device_type] = device_type_id; - global_registered_device_type[device_type_id] = device_type; + if (registered_device_type_id_.find(device_type) == + registered_device_type_id_.end()) { + size_t device_type_id = registered_device_type_id_.size() + 1; + registered_device_type_id_[device_type] = device_type_id; + registered_device_type_[device_type_id] = device_type; } - return global_registered_device_type_id[device_type]; + return registered_device_type_id_[device_type]; } -std::string GetGlobalDeviceType(size_t device_type_id) { - if (global_registered_device_type.find(device_type_id) == - global_registered_device_type.end()) +std::string CustomRegisteredDeviceMap::GetGlobalDeviceType( + size_t device_type_id) { + if (registered_device_type_.find(device_type_id) == + registered_device_type_.end()) return ""; - return global_registered_device_type[device_type_id]; + return registered_device_type_[device_type_id]; } constexpr static int kAllocationTypeBitLength = 8; @@ -143,7 +147,9 @@ static int8_t GetCorrectDeviceIdByPlaceType( Place::Place(paddle::PlaceType type) : device(detail::GetCorrectDeviceIdByPlaceType(type)), alloc_type_(static_cast(type)), - device_type_id_(GetOrRegisterGlobalDeviceTypeId("")) { + device_type_id_( + CustomRegisteredDeviceMap::Instance().GetOrRegisterGlobalDeviceTypeId( + "")) { LOG_FIRST_N(WARNING, 1) << "The `paddle::PlaceType::kCPU/kGPU` is deprecated since version " "2.3, and will be removed in version 2.4! Please use " diff --git a/paddle/phi/common/place.h b/paddle/phi/common/place.h index 12aa6f90ed91636ca7e554bf13199cc74334ede3..a21a53776b6e11901e7a1c417892a6f5e8317d6f 100644 --- a/paddle/phi/common/place.h +++ b/paddle/phi/common/place.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include +#include #include "paddle/phi/api/include/dll_decl.h" @@ -37,11 +38,21 @@ enum class AllocationType : int8_t { CUSTOM = 9, }; -const char* AllocationTypeStr(AllocationType type); +class CustomRegisteredDeviceMap { + public: + static CustomRegisteredDeviceMap& Instance(); -size_t GetOrRegisterGlobalDeviceTypeId(const std::string& device_type); + size_t GetOrRegisterGlobalDeviceTypeId(const std::string& device_type); -std::string GetGlobalDeviceType(size_t device_type_id_); + std::string GetGlobalDeviceType(size_t device_type_id_); + + private: + CustomRegisteredDeviceMap() = default; + std::unordered_map registered_device_type_id_; + std::unordered_map registered_device_type_; +}; + +const char* AllocationTypeStr(AllocationType type); /// \brief The place is used to specify where the data is stored. class PADDLE_API Place { @@ -53,12 +64,14 @@ class PADDLE_API Place { const std::string& dev_type = "") : device(id), alloc_type_(type), - device_type_id_(GetOrRegisterGlobalDeviceTypeId(dev_type)) {} + device_type_id_(phi::CustomRegisteredDeviceMap::Instance() + .GetOrRegisterGlobalDeviceTypeId(dev_type)) {} explicit Place(AllocationType type, const std::string& dev_type = "") : device(0), alloc_type_(type), - device_type_id_(GetOrRegisterGlobalDeviceTypeId(dev_type)) {} + device_type_id_(phi::CustomRegisteredDeviceMap::Instance() + .GetOrRegisterGlobalDeviceTypeId(dev_type)) {} // See NOTE [ Why need to temporarily adapt to PlaceType? ] Place(paddle::PlaceType type); // NOLINT @@ -69,7 +82,8 @@ class PADDLE_API Place { alloc_type_ = type; device = device_id; if (!dev_type.empty()) { - device_type_id_ = GetOrRegisterGlobalDeviceTypeId(dev_type); + device_type_id_ = phi::CustomRegisteredDeviceMap::Instance() + .GetOrRegisterGlobalDeviceTypeId(dev_type); } } @@ -78,7 +92,8 @@ class PADDLE_API Place { int8_t GetDeviceId() const { return device; } std::string GetDeviceType() const { - return GetGlobalDeviceType(device_type_id_); + return phi::CustomRegisteredDeviceMap::Instance().GetGlobalDeviceType( + device_type_id_); } std::string DebugString() const; diff --git a/paddle/phi/core/compat/arg_map_context.h b/paddle/phi/core/compat/arg_map_context.h index 4e3447fe9eb22c498b4f33524ebff5e56a7e1e79..2dd3ba8b767027009877542bc15fac35ea72075c 100644 --- a/paddle/phi/core/compat/arg_map_context.h +++ b/paddle/phi/core/compat/arg_map_context.h @@ -110,6 +110,7 @@ class ArgumentMappingContext { virtual bool IsSelectedRowsInput(const std::string& name) const = 0; virtual bool IsSelectedRowsInputs(const std::string& name) const = 0; virtual bool IsSparseCooTensorInput(const std::string& name) const = 0; + virtual bool IsSparseCooTensorOutput(const std::string& name) const = 0; virtual bool IsSparseCsrTensorInput(const std::string& name) const = 0; // For compatibility with LoDTensorArray virtual bool IsDenseTensorVectorInput(const std::string& name) const = 0; diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc index 03fb3f3e6b6b668a719a3c0af4976c26a826c2d2..149c62f11677e425c09e4de18836e86f95670c2c 100644 --- a/paddle/phi/core/compat/convert_utils.cc +++ b/paddle/phi/core/compat/convert_utils.cc @@ -46,7 +46,8 @@ Backend TransToPhiBackend(const phi::Place& place) { case AllocationType::CUSTOM: return static_cast( static_cast(Backend::NUM_BACKENDS) + - GetOrRegisterGlobalDeviceTypeId(place.GetDeviceType())); + phi::CustomRegisteredDeviceMap::Instance() + .GetOrRegisterGlobalDeviceTypeId(place.GetDeviceType())); default: 
PADDLE_THROW(phi::errors::InvalidArgument( "Unsupported transform %s to phi Backend.", place)); @@ -91,7 +92,9 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { #ifdef PADDLE_WITH_CUSTOM_DEVICE size_t device_type_id_ = static_cast(backend) - static_cast(Backend::NUM_BACKENDS); - std::string device_type = phi::GetGlobalDeviceType(device_type_id_); + std::string device_type = + phi::CustomRegisteredDeviceMap::Instance().GetGlobalDeviceType( + device_type_id_); if (!device_type.empty()) { return phi::CustomPlace( device_type, diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc index 5b8dc47d6498177124bdc53be2d27f97546fe086..8b5a78575d220d7d04c7f1046237d79053f8cb30 100644 --- a/paddle/phi/core/flags.cc +++ b/paddle/phi/core/flags.cc @@ -1010,6 +1010,18 @@ PADDLE_DEFINE_EXPORTED_bool(enable_cinn_auto_tune, #endif +/* + * CUDA Graph related FLAG + * Name: FLAGS_new_executor_use_cuda_graph + * Since Version: 2.4 + * Value Range: bool, default=false + * Example: FLAGS_new_executor_use_cuda_graph=true would allow + * new executor to use CUDA Graph. + */ +PADDLE_DEFINE_EXPORTED_bool(new_executor_use_cuda_graph, + false, + "Use CUDA Graph in new executor"); + DEFINE_int32(record_pool_max_size, 2000000, "SlotRecordDataset slot record pool max size"); @@ -1181,3 +1193,16 @@ PADDLE_DEFINE_EXPORTED_int32(cudnn_cache_saturation_count, 1, ""); PADDLE_DEFINE_EXPORTED_bool(trt_ibuilder_cache, false, "Add a persistent ibuilder."); + +/** + * mmap_allocator related FLAG + * Name: use_shm_cache + * Since Version: 2.5.0 + * Value Range: bool, default=true + * Example: + * Note: . If True, mmap_allocator will cache shm file to decrease munmap + * operation. + */ +PADDLE_DEFINE_EXPORTED_bool(use_shm_cache, + true, + "Use shm cache in mmap_allocator."); diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index a195f2ad60cbcb5fddc1e712e72c7ce263d75785..dc6f657fee67690b4b406facd1f9257a8b21d306 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -101,6 +101,12 @@ struct KernelArgsParseFunctor { default_tensor_layout, default_key.dtype(), arg_type); + } else if (arg_type == + std::type_index(typeid(const phi::ExtendedTensor&))) { + args_def->AppendInput(default_key.backend(), + default_tensor_layout, + default_key.dtype(), + arg_type); } else if (arg_type == std::type_index(typeid( const std::vector&))) { args_def->AppendInput(default_key.backend(), diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index 1eb3a52aebad11c571e645451415fbeca3edac4f..eb18d0cb98c5b8cc03af3e3804a0af9466d231b8 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -265,6 +265,7 @@ struct KernelImpl { PD_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(DenseTensor); PD_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SelectedRows); PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(DenseTensor); + PD_SPECIALIZE_KernelCallHelper_FOR_INPUT(ExtendedTensor); PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(ExtendedTensor); PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(TensorBase); PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(SelectedRows); diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 002af75c04c1facb11f83d9c2f29374af7d97c41..561938adca80a22cc3700baab3dc58c8bf9a6321 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -381,7 +381,11 @@ void CompareAllInferMeta(const MetaTensor& x, errors::InvalidArgument( "The size of dim_y 
should not be greater than dim_x's.")); out->share_lod(x); - out->set_dims(make_ddim({1})); + if (!x.dims().size() || !y.dims().size()) { + out->set_dims(make_ddim({})); + } else { + out->set_dims(make_ddim({1})); + } out->set_dtype(DataType::BOOL); } diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 04373fa29edf9e58661d8c52cf85d08be0ceaf13..5a7b2cf16a1f8cdc896da44193befe43674b23cd 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -424,10 +424,36 @@ void CumInferMeta(const MetaTensor& x, out->set_dims(phi::make_ddim({phi::product(x_dims)})); out->set_dtype(x.dtype()); } else { + if (x_dims.size() > 0) { + PADDLE_ENFORCE_GE( + axis, + -x_dims.size(), + phi::errors::OutOfRange( + "axis is out of range (expected to be in range of [%ld, " + "%ld), but got %ld).", + -(x_dims.size()), + x_dims.size(), + axis)); + PADDLE_ENFORCE_LT( + axis, + x_dims.size(), + phi::errors::OutOfRange( + "axis is out of range (expected to be in range of [%ld, " + "%ld), but got %ld).", + -(x_dims.size()), + x_dims.size(), + axis)); + } else { + PADDLE_ENFORCE_EQ( + (axis == 0 || axis == -1), + true, + errors::InvalidArgument("The axis must be -1 or 0 in 0D Tensor, " + "but the value given is %d.", + axis)); + } out->set_dims(x_dims); out->set_dtype(x.dtype()); } - out->share_lod(x); } diff --git a/paddle/phi/kernels/cpu/cum_grad_kernel.cc b/paddle/phi/kernels/cpu/cum_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..32be44661348fbf8e3e4e6713637af88c89fe560 --- /dev/null +++ b/paddle/phi/kernels/cpu/cum_grad_kernel.cc @@ -0,0 +1,48 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
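Two user-visible consequences of the infer-meta changes above, sketched with the public ops they back (paddle.equal_all sits on CompareAllInferMeta, paddle.cumsum on CumInferMeta; exact error text may differ by build):

import paddle

# CompareAllInferMeta: a 0-D operand now yields a 0-D bool result
a = paddle.to_tensor(1.0)              # 0-D tensor, shape []
b = paddle.to_tensor(1.0)
print(paddle.equal_all(a, b).shape)    # [] after this change (previously [1])

# CumInferMeta: a 0-D input only accepts axis 0 or -1
print(paddle.cumsum(a, axis=0).shape)  # []
# paddle.cumsum(a, axis=1) now raises the InvalidArgument error added above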
+ +#include "paddle/phi/kernels/cum_grad_kernel.h" +#include "paddle/phi/kernels/cum_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +namespace phi { + +template +void CumsumGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const Scalar& axis, + bool flatten, + bool exclusive, + bool reverse, + DenseTensor* x_grad) { + x_grad->Resize(x.dims()); + CumsumKernel( + dev_ctx, out_grad, axis, flatten, exclusive, !reverse, x_grad); +} + +} // namespace phi + +PD_REGISTER_KERNEL(cumsum_grad, + CPU, + ALL_LAYOUT, + phi::CumsumGradKernel, + float, + double, + int16_t, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/cum_kernel.cc b/paddle/phi/kernels/cpu/cum_kernel.cc index 2b6a9be371afb6f2c64355e241c2611571bca395..f7ec5bbbf9e844fc51538641e180018c32b9e438 100644 --- a/paddle/phi/kernels/cpu/cum_kernel.cc +++ b/paddle/phi/kernels/cpu/cum_kernel.cc @@ -57,6 +57,14 @@ void ScanKernel(const Context& dev_ctx, bool reverse, Reducer reducer, DenseTensor* out) { + dev_ctx.template Alloc(out); + + if (x.numel() == 1) { + auto raw_dims = out->dims(); + phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); + out->Resize(raw_dims); + return; + } auto out_dims = out->dims(); PADDLE_ENFORCE_EQ( @@ -72,8 +80,6 @@ void ScanKernel(const Context& dev_ctx, axis += out_dims.size(); } - dev_ctx.template Alloc(out); - int pre = 1; int post = 1; int mid = out_dims[axis]; diff --git a/paddle/phi/kernels/cum_grad_kernel.h b/paddle/phi/kernels/cum_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..f2428524fe5a2fa87bbe20ce417538e708fa8ab4 --- /dev/null +++ b/paddle/phi/kernels/cum_grad_kernel.h @@ -0,0 +1,32 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void CumsumGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const Scalar& axis, + bool flatten, + bool exclusive, + bool reverse, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h index 81c4faeb8182fe61c3eaac890f3f2ff2bbc76da9..bf4553f3ab7b417fc2fa366336c91b1b92d1a054 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h @@ -381,7 +381,11 @@ void SparseBlas::SPMV(bool transa, &beta, out_descriptor.descriptor(), gpu_type, +#if CUDA_VERSION >= 11040 + CUSPARSE_SPMV_ALG_DEFAULT, +#else CUSPARSE_MV_ALG_DEFAULT, +#endif &buffer_size); }); @@ -399,7 +403,11 @@ void SparseBlas::SPMV(bool transa, &beta, out_descriptor.descriptor(), gpu_type, +#if CUDA_VERSION >= 11040 + CUSPARSE_SPMV_ALG_DEFAULT, +#else CUSPARSE_MV_ALG_DEFAULT, +#endif tmp_buffer_ptr); }); } diff --git a/paddle/phi/kernels/gpu/cum_grad_kernel.cu b/paddle/phi/kernels/gpu/cum_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..6039c313b78ed007f74c093111b76a0efe399f30 --- /dev/null +++ b/paddle/phi/kernels/gpu/cum_grad_kernel.cu @@ -0,0 +1,75 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
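The same identity, checked end to end through autograd with the public API (values match the NumPy sketch above):

import paddle

x = paddle.to_tensor([1., 2., 3.], stop_gradient=False)
y = paddle.cumsum(x)
y.backward(paddle.to_tensor([1., 2., 3.]))
print(x.grad)  # [6., 5., 3.], computed by the cumsum_grad kernels this patch registers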
+ +#include "paddle/phi/kernels/cum_grad_kernel.h" +#include "paddle/phi/kernels/cum_kernel.h" + +#include +#include +#include +#include +#ifdef __NVCC__ +#include +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void CumsumGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const Scalar& axis, + bool flatten, + bool exclusive, + bool reverse, + DenseTensor* x_grad) { + x_grad->Resize(x.dims()); + CumsumKernel( + dev_ctx, out_grad, axis, flatten, exclusive, !reverse, x_grad); +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(cumsum_grad, + GPU, + ALL_LAYOUT, + phi::CumsumGradKernel, + float, + double, + int16_t, + int, + int64_t) {} +#else +PD_REGISTER_KERNEL(cumsum_grad, + GPU, + ALL_LAYOUT, + phi::CumsumGradKernel, + float, + double, + int16_t, + int, + int64_t, + phi::dtype::float16) {} +#endif diff --git a/paddle/phi/kernels/gpu/cum_kernel.cu b/paddle/phi/kernels/gpu/cum_kernel.cu index 0c6cd8b5562af4897238e37dc77901224d6a0621..9bf06d7bf19dcd763263447629d3521516bcf736 100644 --- a/paddle/phi/kernels/gpu/cum_kernel.cu +++ b/paddle/phi/kernels/gpu/cum_kernel.cu @@ -270,6 +270,16 @@ void ScanKernel(const Context& dev_ctx, bool reverse, Op op, DenseTensor* out) { + T* out_data = dev_ctx.template Alloc(out); + + // For 0D Tensor + if (out->numel() == 1) { + auto raw_dims = out->dims(); + phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); + out->Resize(raw_dims); + return; + } + auto out_dims = out->dims(); auto size = x.numel(); @@ -286,7 +296,6 @@ void ScanKernel(const Context& dev_ctx, axis += out_dims.size(); } - T* out_data = dev_ctx.template Alloc(out); const T* in_data = x.data(); // Use thrust for parallel acceleration when the input size is equal to the diff --git a/paddle/phi/kernels/gpu/send_u_recv_kernel.cu b/paddle/phi/kernels/gpu/send_u_recv_kernel.cu index 0f000af536d8b1b7817afbf5acac141d3de9eec7..85ca46d7e07351c712553a695045e1241afafc34 100644 --- a/paddle/phi/kernels/gpu/send_u_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/send_u_recv_kernel.cu @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/send_u_recv_kernel.h" #include +#include #include #include diff --git a/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu b/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu index aaae915f9df3e68c5a24f66ca18aaf1bdc4a9913..834a93d629d064f2095eee1041898ade18c7ec1d 100644 --- a/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/send_ue_recv_kernel.h" #include +#include #include #include #include diff --git a/paddle/phi/kernels/gpu/unique_kernel.cu b/paddle/phi/kernels/gpu/unique_kernel.cu index 316fe1fae711637af9cafb7ec0c06693aae7f0a6..d420c8f438b16b8614c89a2c0640bf5c5446f388 100644 --- a/paddle/phi/kernels/gpu/unique_kernel.cu +++ b/paddle/phi/kernels/gpu/unique_kernel.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include diff --git a/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu b/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu index d499cdf54abef913ef9d8c2012e354f81e2cb4e6..b9089dad71657930ef2e108606446f0dca4d248e 100644 --- a/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu @@ -14,6 +14,9 
@@ limitations under the License. */ #include "paddle/phi/kernels/sparse/coalesce_kernel.h" +#include +#include + #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/xpu/cum_grad_kernel.cc b/paddle/phi/kernels/xpu/cum_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..7b11ba47f0a79c708e9662cd158f024188cbb8f3 --- /dev/null +++ b/paddle/phi/kernels/xpu/cum_grad_kernel.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/cum_kernel.h" + +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/core/kernel_registry.h" +namespace phi { + +template +void CumsumGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const Scalar& axis, + bool flatten, + bool exclusive, + bool reverse, + DenseTensor* x_grad) { + x_grad->Resize(x.dims()); + CumsumKernel( + dev_ctx, out_grad, axis, flatten, exclusive, !reverse, x_grad); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + cumsum_grad, XPU, ALL_LAYOUT, phi::CumsumGradKernel, float, int, int64_t) {} diff --git a/paddle/phi/kernels/xpu/cum_kernel.cc b/paddle/phi/kernels/xpu/cum_kernel.cc index 17eca4008607e65457ccae3b813dc43d5e92ac44..13a1dab66d72f28665ce2d27558230a37e457a0a 100644 --- a/paddle/phi/kernels/xpu/cum_kernel.cc +++ b/paddle/phi/kernels/xpu/cum_kernel.cc @@ -30,6 +30,15 @@ void CumsumKernel(const Context& dev_ctx, using XPUType = typename XPUTypeTrait::Type; dev_ctx.template Alloc(out); + if (x.numel() == 1) { + int r = xpu::copy(dev_ctx.x_context(), + reinterpret_cast(x.data()), + reinterpret_cast(out->data()), + x.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy"); + return; + } + // prepare for call xdnn api std::vector x_shape = phi::vectorize(x.dims()); int axis_as_int = axis.to(); diff --git a/paddle/phi/kernels/xpu/logical_kernel.cc b/paddle/phi/kernels/xpu/logical_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..e6a0ea242d992670131e596d43ec8e16fde1de0d --- /dev/null +++ b/paddle/phi/kernels/xpu/logical_kernel.cc @@ -0,0 +1,33 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
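The CPU, GPU and XPU scan kernels now all short-circuit single-element inputs with a plain copy, allocating the output first and preserving its original dims. For the 0-D case this is the right answer, since the cumulative sum of a single value is the value itself:

import paddle

x = paddle.to_tensor(3.0)   # 0-D tensor, numel() == 1
y = paddle.cumsum(x)
print(y.shape, float(y))    # [] 3.0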
+ +#include "paddle/phi/kernels/logical_kernel.h" + +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void LogicalNotKernel(const Context& ctx, + const DenseTensor& x, + DenseTensor* out) { + ctx.template Alloc(out); + int r = + xpu::logical_not(ctx.x_context(), x.data(), out->data(), x.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "logical_not"); +} +} // namespace phi + +PD_REGISTER_KERNEL(logical_not, XPU, ALL_LAYOUT, phi::LogicalNotKernel, bool) {} diff --git a/paddle/phi/kernels/xpu/masked_select_grad_kernel.cc b/paddle/phi/kernels/xpu/masked_select_grad_kernel.cc index 52a98c63f48987a348e0df4e15d4469f7402f7c3..8e2f56adfa14147c240949c9d1f483098037cc6b 100644 --- a/paddle/phi/kernels/xpu/masked_select_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/masked_select_grad_kernel.cc @@ -33,6 +33,12 @@ void MaskedSelectGradKernel(const Context& dev_ctx, auto mask_shape = phi::vectorize(mask.dims()); auto xshape = phi::vectorize(x_grad->dims()); + if (mask.dims().size() == 0) { + mask_shape = std::vector({1}); + } + if (x_grad->dims().size() == 0) { + xshape = std::vector({1}); + } int r = xpu::masked_select_grad(dev_ctx.x_context(), input_data, diff --git a/paddle/phi/kernels/xpu/masked_select_kernel.cc b/paddle/phi/kernels/xpu/masked_select_kernel.cc index 0f142e852a9a7fd9f77ff10fc439938b89c7b3c4..c572b5c6e4eb741a5b08edac13c227cfaa479663 100644 --- a/paddle/phi/kernels/xpu/masked_select_kernel.cc +++ b/paddle/phi/kernels/xpu/masked_select_kernel.cc @@ -61,6 +61,12 @@ void MaskedSelectKernel(const Context& dev_ctx, auto input_shape = vectorize(input_dim); auto mask_shape = vectorize(mask_dim); + if (input_dim.size() == 0) { + input_shape = std::vector({1}); + } + if (mask_dim.size() == 0) { + mask_shape = std::vector({1}); + } if (out_size_cpu > 0) { PADDLE_ENFORCE_XDNN_SUCCESS(xpu::masked_select(dev_ctx.x_context(), diff --git a/paddle/phi/ops/compat/size_sig.cc b/paddle/phi/ops/compat/cumsum_sig.cc similarity index 57% rename from paddle/phi/ops/compat/size_sig.cc rename to paddle/phi/ops/compat/cumsum_sig.cc index 46177e4ae35b991210b52f2024ba3031c26aff4a..00992b15435d2153ccd38d95689ce9e1ee9f31bc 100644 --- a/paddle/phi/ops/compat/size_sig.cc +++ b/paddle/phi/ops/compat/cumsum_sig.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -11,7 +11,17 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
- #include "paddle/phi/core/compat/op_utils.h" -PD_REGISTER_BASE_KERNEL_NAME(size, numel); +namespace phi { + +KernelSignature CumsumOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("cumsum_grad", + {"X", "Out@GRAD"}, + {"axis", "flatten", "exclusive", "reverse"}, + {"X@GRAD"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(cumsum_grad, phi::CumsumOpArgumentMapping); diff --git a/paddle/phi/ops/compat/slogdeterminant_sig.cc b/paddle/phi/ops/compat/feed_sig.cc similarity index 57% rename from paddle/phi/ops/compat/slogdeterminant_sig.cc rename to paddle/phi/ops/compat/feed_sig.cc index 2e63a90d929085405fd2dbd16647093482094eb9..e28715ce70c63ec06b983a317cbc2eeee6d3346d 100644 --- a/paddle/phi/ops/compat/slogdeterminant_sig.cc +++ b/paddle/phi/ops/compat/feed_sig.cc @@ -16,16 +16,17 @@ namespace phi { -KernelSignature SlogDeterminantGradOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature( - "slogdet_grad", {"Input", "Out", "Out@GRAD"}, {}, {"Input@GRAD"}); +KernelSignature FeedOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.IsDenseTensorOutput("Out")) { + return KernelSignature("feed_dense_tensor", {"X"}, {"col"}, {"Out"}); + } else if (ctx.IsSparseCooTensorOutput("Out")) { + return KernelSignature("feed_sparse_coo_tensor", {"X"}, {"col"}, {"Out"}); + } else { + return KernelSignature("feed_strings", {"X"}, {"col"}, {"Out"}); + } } } // namespace phi -PD_REGISTER_BASE_KERNEL_NAME(slogdeterminant, slogdet); -PD_REGISTER_BASE_KERNEL_NAME(slogdeterminant_grad, slogdet_grad); - -PD_REGISTER_ARG_MAPPING_FN(slogdeterminant_grad, - phi::SlogDeterminantGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(feed, feed_dense_tensor); +PD_REGISTER_ARG_MAPPING_FN(feed, phi::FeedOpArgumentMapping); diff --git a/paddle/phi/ops/compat/graph_send_recv_sig.cc b/paddle/phi/ops/compat/graph_send_recv_sig.cc deleted file mode 100644 index ef8eeae358e097b716bd8510ec4d576da7959607..0000000000000000000000000000000000000000 --- a/paddle/phi/ops/compat/graph_send_recv_sig.cc +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
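slogdeterminant_sig.cc can be repurposed as feed_sig.cc because the slogdet name mapping now lives in op_compat.yaml (the `slogdet(slogdeterminant)` entry above) instead of C++ registration macros. The public API is untouched; a quick check (note that in this Paddle version slogdet returns one tensor stacking sign and log|det|):

import paddle

x = paddle.to_tensor([[2., 0.], [0., 3.]])
res = paddle.linalg.slogdet(x)
print(res)  # [1.0, 1.7918...], i.e. sign = 1 and log|det| = log(6)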
*/ - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature GraphSendRecvOpArgumentMapping( - const ArgumentMappingContext& ctx) { - if (ctx.HasInput("Out_size")) { - return KernelSignature("send_u_recv", - {"X", "Src_index", "Dst_index"}, - {"reduce_op", "Out_size"}, - {"Out", "Dst_count"}); - } else { - return KernelSignature("send_u_recv", - {"X", "Src_index", "Dst_index"}, - {"reduce_op", "out_size"}, - {"Out", "Dst_count"}); - } -} - -KernelSignature GraphSendRecvGradOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature( - "send_u_recv_grad", - {"X", "Src_index", "Dst_index", "Out", "Dst_count", "Out@GRAD"}, - {"reduce_op"}, - {"X@GRAD"}); -} - -} // namespace phi - -PD_REGISTER_BASE_KERNEL_NAME(graph_send_recv, send_u_recv); -PD_REGISTER_BASE_KERNEL_NAME(graph_send_recv_grad, send_u_recv_grad); - -PD_REGISTER_ARG_MAPPING_FN(graph_send_recv, - phi::GraphSendRecvOpArgumentMapping); - -PD_REGISTER_ARG_MAPPING_FN(graph_send_recv_grad, - phi::GraphSendRecvGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/graph_send_ue_recv_sig.cc b/paddle/phi/ops/compat/graph_send_ue_recv_sig.cc deleted file mode 100644 index aab850831ae334cc1e57f4614b9af7da1b2a014e..0000000000000000000000000000000000000000 --- a/paddle/phi/ops/compat/graph_send_ue_recv_sig.cc +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
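Likewise, the deleted graph_send_recv_sig.cc is superseded by the `send_u_recv(graph_send_recv)` entry in op_compat.yaml, including the Out_size tensor-attribute handling the C++ mapping used to branch on. Usage through the Python wrapper (assuming paddle.geometric.send_u_recv, the public entry point for this op in 2.4+):

import paddle

x = paddle.to_tensor([[0., 2.], [1., 3.], [2., 4.]])
src_index = paddle.to_tensor([0, 1, 2, 0], dtype="int32")
dst_index = paddle.to_tensor([1, 2, 1, 0], dtype="int32")
# row i of the result reduces x[src_index[j]] over every edge j with dst_index[j] == i
out = paddle.geometric.send_u_recv(x, src_index, dst_index, reduce_op="sum")
print(out)
# [[0., 2.],   node 0 <- src 0
#  [2., 6.],   node 1 <- src 0 and src 2
#  [1., 3.]]   node 2 <- src 1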
*/ - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature GraphSendUERecvOpArgumentMapping( - const ArgumentMappingContext& ctx) { - if (ctx.HasInput("Out_size")) { - return KernelSignature("send_ue_recv", - {"X", "Y", "Src_index", "Dst_index"}, - {"message_op", "reduce_op", "Out_size"}, - {"Out", "Dst_count"}); - } else { - return KernelSignature("send_ue_recv", - {"X", "Y", "Src_index", "Dst_index"}, - {"message_op", "reduce_op", "out_size"}, - {"Out", "Dst_count"}); - } -} - -KernelSignature GraphSendUERecvGradOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature( - "send_ue_recv_grad", - {"X", "Y", "Src_index", "Dst_index", "Out", "Dst_count", "Out@GRAD"}, - {"message_op", "reduce_op"}, - {"X@GRAD", "Y@GRAD"}); -} - -} // namespace phi - -PD_REGISTER_BASE_KERNEL_NAME(graph_send_ue_recv, send_ue_recv); -PD_REGISTER_BASE_KERNEL_NAME(graph_send_ue_recv_grad, send_ue_recv_grad); - -PD_REGISTER_ARG_MAPPING_FN(graph_send_ue_recv, - phi::GraphSendUERecvOpArgumentMapping); - -PD_REGISTER_ARG_MAPPING_FN(graph_send_ue_recv_grad, - phi::GraphSendUERecvGradOpArgumentMapping); diff --git a/paddle/phi/tests/common/test_backend.cc b/paddle/phi/tests/common/test_backend.cc index 415c1f21465edd37ed96285d07ed74b8f46012f1..791167ffe62a6449d7eafa75eb5fd976c909874c 100644 --- a/paddle/phi/tests/common/test_backend.cc +++ b/paddle/phi/tests/common/test_backend.cc @@ -70,8 +70,10 @@ TEST(Backend, StringToBackend) { #else EXPECT_EQ(phi::Backend::KPS, pexp::StringToBackend("KPS")); #endif - EXPECT_EQ(static_cast( - static_cast(phi::Backend::NUM_BACKENDS) + 1), + EXPECT_EQ(static_cast( + static_cast(Backend::NUM_BACKENDS) + + phi::CustomRegisteredDeviceMap::Instance() + .GetOrRegisterGlobalDeviceTypeId("CustomBackend")), pexp::StringToBackend("CustomBackend")); } diff --git a/paddle/phi/tests/ops/test_op_signature.h b/paddle/phi/tests/ops/test_op_signature.h index 7f89fb34994fcd8186289cea3a7aee9ad565f08d..c267e60509120be24fa470e835df0f84336c1195 100644 --- a/paddle/phi/tests/ops/test_op_signature.h +++ b/paddle/phi/tests/ops/test_op_signature.h @@ -94,6 +94,10 @@ class TestArgumentMappingContext : public phi::ArgumentMappingContext { return false; } + bool IsSparseCooTensorOutput(const std::string& name) const override { + return false; + } + bool IsDenseTensorOutput(const std::string& name) const override { return dense_tensor_outputs.count(name) > 0; } diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 8449b7158cd2c82b9ad9f89d26a2144bbd7c83e1..4c48154b80a4b21a063af640b122e5b1b296284e 100644 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -3461,7 +3461,10 @@ function trt_convert_test() { function build_pr_and_develop() { run_setup ${PYTHON_ABI:-""} bdist_wheel ${parallel_number} - mkdir ${PADDLE_ROOT}/build/python/dist/ && mv ${PADDLE_ROOT}/dist/*.whl ${PADDLE_ROOT}/build/python/dist/ + if [ ! -d "${PADDLE_ROOT}/build/python/dist/" ]; then + mkdir ${PADDLE_ROOT}/build/python/dist/ + fi + mv ${PADDLE_ROOT}/dist/*.whl ${PADDLE_ROOT}/build/python/dist/ cmake_change=`git diff --name-only upstream/$BRANCH | grep "cmake/external" || true` cp ${PADDLE_ROOT}/python/requirements.txt /tmp generate_api_spec "$1" "PR" @@ -3483,7 +3486,10 @@ function build_pr_and_develop() { else git checkout -b develop_base_pr upstream/$BRANCH run_setup ${PYTHON_ABI:-""} bdist_wheel ${parallel_number} - mkdir ${PADDLE_ROOT}/build/python/dist/ && mv ${PADDLE_ROOT}/dist/*.whl ${PADDLE_ROOT}/build/python/dist/ + if [ ! 
-d "${PADDLE_ROOT}/build/python/dist/" ]; then + mkdir ${PADDLE_ROOT}/build/python/dist/ + fi + mv ${PADDLE_ROOT}/dist/*.whl ${PADDLE_ROOT}/build/python/dist/ generate_api_spec "$1" "DEV" mkdir ${PADDLE_ROOT}/build/dev_whl && cp ${PADDLE_ROOT}/build/python/dist/*.whl ${PADDLE_ROOT}/build/dev_whl fi diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index cef75772283166f052befe0cadccc949a7e276a3..3e78b716faabb1ba5dfaa44a10b808e0e1a9d3eb 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -55,7 +55,7 @@ from .framework.dtype import bool # noqa: F401 from .framework.dtype import complex64 # noqa: F401 from .framework.dtype import complex128 # noqa: F401 -if fluid.framework._in_eager_mode_: +if fluid.framework.global_var._in_eager_mode_: Tensor = framework.core.eager.Tensor else: from .framework import VarBase as Tensor # noqa: F401 diff --git a/python/paddle/autograd/backward_mode.py b/python/paddle/autograd/backward_mode.py index 64c2423d4395e8708f673fc8fad83b5a645f83b9..ce017ef98540d3165a0e82e469f3bb303eb88b98 100644 --- a/python/paddle/autograd/backward_mode.py +++ b/python/paddle/autograd/backward_mode.py @@ -107,7 +107,7 @@ def backward(tensors, grad_tensors=None, retain_graph=False): each_tensor, (paddle.Tensor, core.eager.Tensor) ), "The argument 'grad_tensors' of paddle.autograd.backward is invalid, it can be 'None', 'paddle.Tensor' or 'list[None/paddle.Tensor]'." else: - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: grad_tensors = [] else: grad_tensors = [None] * len(tensors) @@ -119,7 +119,7 @@ def backward(tensors, grad_tensors=None, retain_graph=False): assert isinstance(retain_graph, bool), "retain_graph must be True or False" - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: core.eager.run_backward(tensors, grad_tensors, retain_graph) else: core.dygraph_run_backward( diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index df975d06a45d4d568f136dcdaf777887066db8de..76401d5c47a9aba70631a88ad05edf4b37db2f79 100755 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -385,12 +385,13 @@ def _create_op_desc_(op_type, inputs, outputs, attrs): def _create_loss_op_desc_(loss): + create_shape = [] if len(loss.shape) == 0 else [1] op_desc = _create_op_desc_( "fill_constant", {}, {"Out": [_append_grad_suffix_(loss.name)]}, { - "shape": [1], + "shape": create_shape, "value": 1.0, "dtype": loss.dtype, "force_cpu": False, @@ -1491,11 +1492,15 @@ def _append_backward_ops_( ) # remove some backward ops - not_need_ops = _find_not_need_ops(grad_op_descs, ops, input_grad_names_set) + # TODO(Jiabin): Support this in prime later, it will prune add_grad, fix this problem + if not core.is_prim_enabled(): + not_need_ops = _find_not_need_ops( + grad_op_descs, ops, input_grad_names_set + ) - grad_op_descs = [ - op_desc for op_desc in grad_op_descs if op_desc not in not_need_ops - ] + grad_op_descs = [ + op_desc for op_desc in grad_op_descs if op_desc not in not_need_ops + ] # append op_desc in grad_op_descs to target_block op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName() diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py index 299ca1e36614b6189f484818419bca81d60fbf58..771caa4ef3c4fa822dd40d0077b90e778bbbb5d6 100644 --- a/python/paddle/fluid/core.py +++ b/python/paddle/fluid/core.py @@ -317,6 +317,7 @@ try: from .libpaddle import _array_to_share_memory_tensor from .libpaddle import _cleanup_mmap_fds from .libpaddle import 
_remove_tensor_list_mmap_fds + from .libpaddle import _set_max_memory_map_allocation_pool_size except Exception as e: if has_paddle_dy_lib: sys.stderr.write( @@ -371,3 +372,37 @@ def set_paddle_lib_path(): set_paddle_lib_path() + + +def set_prim_forward(value): + """set flag FLAGS_prim_forward.""" + flag = str(value) + if flag.lower() not in ["true", "false", "debug"]: + raise TypeError(f"flag {flag} should be string of bool or 'debug'.") + os.environ["FLAGS_prim_forward"] = flag + return + + +def enable_prim_forward(): + flag = os.getenv("FLAGS_prim_forward", "true").lower() + if flag == "false": + return False + if flag == "debug": + return "debug" + return True + + +def set_prim_backward(value): + """set flag FLAGS_prim_backward,""" + flag = str(value) + if flag.lower() not in ["true", "false"]: + raise TypeError(f"flag {flag} should be bool or string of bool.") + os.environ["FLAGS_prim_backward"] = flag + return + + +def enable_prim_backward(): + flag = os.getenv("FLAGS_prim_backward", "true") + if flag.lower() == "false": + return False + return True diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index fc1effbd89c7afd704a3b6bb657dc9c11d395312..c7c49c794a1017b1788a489cde6030d4ca374c1b 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -20,6 +20,7 @@ import numbers import logging import itertools import threading +import warnings import numpy as np from collections import namedtuple from paddle.fluid.framework import ( @@ -406,6 +407,20 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): self._base_seed = np.random.randint(low=0, high=sys.maxsize) + # Note(zhangbo): shm_buffer_size is used for MemoryMapAllocationPool. + # MemoryMapAllocationPool is used to cache and reuse shm, thus reducing munmap in dataloader. + # For more details, please see: paddle/fluid/memory/allocation/mmap_allocator.h + try: + self._worker_shm_buffer_size = (2 + 1) * len(self._dataset[0]) + except: + self._worker_shm_buffer_size = 0 + warnings.warn( + "Setting the shm cache buffer size to 0, equivalent to not using the shm cache policy." 
+ ) + self._main_thread_shm_buffer_size = ( + (self._worker_shm_buffer_size) * 2 * self._num_workers + ) + # init workers and indices queues and put 2 indices in each indices queue self._init_workers() for _ in range(self._outstanding_capacity): @@ -450,6 +465,7 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): self._num_workers, self._use_shared_memory, self._base_seed, + self._worker_shm_buffer_size, ), ) worker.daemon = True @@ -481,6 +497,9 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): self._blocking_queue = core.init_lod_tensor_blocking_queue( core.Variable(), self._outstanding_capacity, len(self._places) > 1 ) + core._set_max_memory_map_allocation_pool_size( + self._main_thread_shm_buffer_size + ) self._reader = core.create_py_reader( self._blocking_queue, self._var_names, diff --git a/python/paddle/fluid/dataloader/worker.py b/python/paddle/fluid/dataloader/worker.py index e63a5d5f9344142626cf6f8093e509183b92d732..f486e80d746ea7de44750abea89b1ebece6f62f3 100644 --- a/python/paddle/fluid/dataloader/worker.py +++ b/python/paddle/fluid/dataloader/worker.py @@ -275,6 +275,7 @@ def _worker_loop( num_workers, use_shared_memory, base_seed, + shm_cahce_size=0, ): try: # NOTE: [ mmap files clear ] When the child process exits unexpectedly, @@ -286,6 +287,8 @@ def _worker_loop( # set signal handler core._set_process_signal_handler() + core._set_max_memory_map_allocation_pool_size(shm_cahce_size) + # set different numpy seed for each worker try: import numpy as np diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index d064c5194e031d31ed2bbd77e63c8f86b2f7b517..df500a129787da93ff7e0c687074118cad2c0d31 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -20,6 +20,7 @@ import sys import numpy as np from paddle.fluid import core from paddle.fluid import framework +from paddle.fluid.framework import global_var from paddle.fluid.multiprocess_utils import CleanupFuncRegistrar from .tracer import Tracer import logging @@ -44,7 +45,6 @@ __all__ = [ ] # Flag that indicates whether running code under `@to_static` -_in_declarative_mode_ = False def in_declarative_mode(): @@ -52,7 +52,7 @@ def in_declarative_mode(): Return a bool value that indicates whether running code under `@to_static` """ - return _in_declarative_mode_ + return global_var._in_declarative_mode_ def declarative_unsupport_argument_warning( @@ -86,11 +86,11 @@ switch_to_static_graph = wrap_decorator(_switch_to_static_graph_) @signature_safe_contextmanager def _switch_declarative_mode_guard_(is_declarative=True): - global _in_declarative_mode_ - original_val = _in_declarative_mode_ - _in_declarative_mode_ = is_declarative + global global_var + original_val = global_var._in_declarative_mode_ + global_var._in_declarative_mode_ = is_declarative yield - _in_declarative_mode_ = original_val + global_var._in_declarative_mode_ = original_val @signature_safe_contextmanager @@ -106,9 +106,6 @@ def program_desc_tracing_guard(enable): tracer._enable_program_desc_tracing = original_val -_functional_dygraph_context_manager = None - - @signature_safe_contextmanager def param_guard(parameters): # Note: parameters is a reference of self._parameters or self._buffers @@ -228,12 +225,12 @@ def enable_dygraph(place=None): print(paddle.in_dynamic_mode()) # True, Now we are in dynamic mode """ - global _functional_dygraph_context_manager - if _functional_dygraph_context_manager is None: - _functional_dygraph_context_manager = guard( + global global_var + if 
global_var._functional_dygraph_context_manager is None: + global_var._functional_dygraph_context_manager = guard( place=_get_paddle_place(place) ) - _functional_dygraph_context_manager.__enter__() + global_var._functional_dygraph_context_manager.__enter__() # call disable_dygraph when Python exit CleanupFuncRegistrar.register(disable_dygraph) @@ -263,10 +260,10 @@ def disable_dygraph(): print(paddle.in_dynamic_mode()) # True, Now we are in dynamic mode """ - global _functional_dygraph_context_manager - if _functional_dygraph_context_manager is not None: - _functional_dygraph_context_manager.__exit__(*sys.exc_info()) - _functional_dygraph_context_manager = None + global global_var + if global_var._functional_dygraph_context_manager is not None: + global_var._functional_dygraph_context_manager.__exit__(*sys.exc_info()) + global_var._functional_dygraph_context_manager = None @signature_safe_contextmanager diff --git a/python/paddle/fluid/dygraph/math_op_patch.py b/python/paddle/fluid/dygraph/math_op_patch.py index cb78b8b9d5932fa3778ed2cd77db7a6dd53f102f..74a174674f64c028a34b7f6a8c66ffbf6712a86e 100644 --- a/python/paddle/fluid/dygraph/math_op_patch.py +++ b/python/paddle/fluid/dygraph/math_op_patch.py @@ -74,7 +74,7 @@ def monkey_patch_math_varbase(): @no_grad def create_tensor(value, dtype, shape): - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: out = _C_ops.full( shape, value, dtype, framework._current_expected_place() ) @@ -251,7 +251,7 @@ def monkey_patch_math_varbase(): # 2. create varbase for scalar lhs_dtype = self.dtype - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: other_var_should_be = core.eager.Tensor else: other_var_should_be = core.VarBase @@ -486,7 +486,7 @@ def monkey_patch_math_varbase(): global _already_patch_varbase global _already_patch_eager_tensor - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: local_already_patch = _already_patch_eager_tensor _already_patch_eager_tensor = True local_tensor = core.eager.Tensor @@ -496,7 +496,7 @@ def monkey_patch_math_varbase(): local_tensor = core.VarBase if not local_already_patch: - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: for method_name in eager_cpp_level_patch: method_impl = getattr(local_tensor, method_name, None) if method_impl: diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 2aa9a822aa99041c3c226dd8fada6fe5a1a5a59f..9f0d8297f349b4a5b1df64e0983ed29aeb19899d 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -54,7 +54,9 @@ class TensorHookRemoveHelper: def __init__(self, tensor, hook_id): self._tensor = ( - tensor if framework._in_eager_mode_ else weakref.ref(tensor) + tensor + if framework.global_var._in_eager_mode_ + else weakref.ref(tensor) ) self._hook_id = hook_id @@ -65,7 +67,11 @@ class TensorHookRemoveHelper: Returns: bool: Return True if removed successfully """ - tensor = self._tensor if framework._in_eager_mode_ else self._tensor() + tensor = ( + self._tensor + if framework.global_var._in_eager_mode_ + else self._tensor() + ) if tensor is not None: res = tensor._remove_grad_hook(self._hook_id) if res is True: @@ -178,7 +184,7 @@ def monkey_patch_varbase(): out = linear(t) # call with different weight """ - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: base_tensor = core.eager.Tensor else: base_tensor = core.VarBase @@ -282,7 
+288,7 @@ def monkey_patch_varbase(): ) record_event.begin() if grad_tensor is not None: - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: assert isinstance( grad_tensor, core.eager.Tensor ), "The type of grad_tensor must be paddle.Tensor" @@ -296,7 +302,7 @@ def monkey_patch_varbase(): grad_tensor.name, grad_tensor.shape, self.name, self.shape ) - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: if grad_tensor is None: grad_tensor = [] else: @@ -311,7 +317,7 @@ def monkey_patch_varbase(): ): # TODO(liuyuhui): Currently only for xpu. Will be removed in the future. scaled_loss = scale_loss(self) - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: core.eager.run_backward( [scaled_loss], grad_tensor, retain_graph ) @@ -323,7 +329,7 @@ def monkey_patch_varbase(): framework._dygraph_tracer(), ) else: - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: core.eager.run_backward([self], grad_tensor, retain_graph) else: core.dygraph_run_backward( @@ -368,7 +374,7 @@ def monkey_patch_varbase(): # [500.] """ - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: if self.grad is None: return None if self.grad.is_selected_rows(): @@ -673,7 +679,7 @@ def monkey_patch_varbase(): # [[0.30574632, 0.55739117, 0.30902600, 0.39413780, 0.44830436], # [0.79010487, 0.53972793, 0.09495186, 0.44267157, 0.72112119]]) """ - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: from paddle.tensor.to_string import tensor_to_string return tensor_to_string(self) @@ -707,7 +713,7 @@ def monkey_patch_varbase(): raise RuntimeError( "Only Leaf Tensor support the deepcopy at the moment, non-Leaf Tensors contains graph information that does't support deepcopy" ) - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: new_varbase = core.eager.Tensor() else: new_varbase = core.VarBase() @@ -725,7 +731,7 @@ def monkey_patch_varbase(): assert ( numel == 1 ), "When Variable is used as the condition of if/while , Variable can only contain one element." - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: assert self._is_initialized(), "tensor not initialized" return bool(np.all(self.numpy() > 0)) else: @@ -850,7 +856,7 @@ def monkey_patch_varbase(): return _setitem_impl_(self, item, value) else: - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: return self.__setitem_eager_tensor__(item, value) else: # Call c++ func __setitem_varbase__ to speedup. 
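All of these mechanical edits swap module-level flags (framework._in_eager_mode_ and friends) for attributes on framework.global_var, an instance of the GlobalThreadLocal class introduced in the framework.py hunk below. A minimal sketch of the underlying pattern (illustrative, not Paddle code):

import threading

class ThreadState(threading.local):
    def __init__(self):
        # threading.local runs __init__ once per thread on first access,
        # so each thread gets an independent copy of these fields
        self.in_eager_mode = True

state = ThreadState()

def worker():
    state.in_eager_mode = False   # visible only inside this thread
    print(state.in_eager_mode)    # False

t = threading.Thread(target=worker)
t.start()
t.join()
print(state.in_eager_mode)        # still True in the main thread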
@@ -1020,7 +1026,7 @@ def monkey_patch_varbase(): def __hash__(self): return hash(id(self)) - if framework._in_eager_mode_ and not hasattr(core, "eager"): + if framework.global_var._in_eager_mode_ and not hasattr(core, "eager"): return for method_name, method in ( @@ -1047,12 +1053,12 @@ def monkey_patch_varbase(): ("to_dense", to_dense), ("to_sparse_coo", to_sparse_coo), ): - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: setattr(core.eager.Tensor, method_name, method) else: setattr(core.VarBase, method_name, method) - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: setattr(core.eager.Tensor, "_set_grad_ivar", _set_grad_ivar) setattr(core.eager.Tensor, "value", value) setattr(core.eager.Tensor, "cpu", cpu) diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index d5db9a7f72c0de77c7f8ee6a3467bf6f721bcf01..e3376d8446586607947bf2143fe5c9fe32115dac 100755 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -26,6 +26,7 @@ from .framework import convert_np_dtype_to_dtype_, _apply_pass from . import core from . import unique_name from . import compiler +from . import set_flags from .trainer_factory import TrainerFactory from .trainer_factory import FetchHandlerMonitor import copy @@ -510,6 +511,16 @@ def _is_dy2st_enable_standalone_executor(): ] +def _is_cuda_graph_enable_standalone_executor(): + return framework._cuda_graph_enable_standalone_executor_ in [ + 1, + '1', + True, + 'True', + 'true', + ] + + def _prepare_fleet_executor(): from ..distributed.fleet.proto import fleet_executor_desc_pb2 @@ -844,7 +855,19 @@ class _ExecutorCache: ) build_strategy = compiled_program._build_strategy # print(f"Program before convert:\n {inner_program}", flush=True) + use_cuda_graph = False + # When using cuda graph, the cuda graph preparation logic in PE is not + # executed, but it is processed in the constructor of new executor. + if ( + build_strategy is not None + and build_strategy.allow_cuda_graph_capture + ): + use_cuda_graph = True + build_strategy.allow_cuda_graph_capture = False + set_flags({"FLAGS_new_executor_use_cuda_graph": True}) compiled_program._compile(scope, place) + if use_cuda_graph: + build_strategy.allow_cuda_graph_capture = True ir_graph = framework.IrGraph(compiled_program._graph) converted_program = ir_graph.to_program() @@ -1746,24 +1769,25 @@ class Executor: ) return False - # Unsupported case 4: CUDA Graph + # Unsupported case 4: async mode if ( compiled_program._build_strategy is not None - and compiled_program._build_strategy.allow_cuda_graph_capture + and compiled_program._build_strategy.async_mode ): warnings.warn( - "Standalone executor is not used for CUDA Graph", + "Standalone executor is not used for async mode", UserWarning, ) return False - # Unsupported case 5: async mode + # Unsupported case 5: CUDA Graph if ( compiled_program._build_strategy is not None - and compiled_program._build_strategy.async_mode + and compiled_program._build_strategy.allow_cuda_graph_capture + and not _is_cuda_graph_enable_standalone_executor() ): warnings.warn( - "Standalone executor is not used for async mode", + "Standalone executor is not used for CUDA Graph when FLAGS_CUDA_GRAPH_USE_STANDALONE_EXECUTOR=0", UserWarning, ) return False @@ -1811,8 +1835,13 @@ class Executor: tensor = core.get_variable_tensor(scope, lr_sheduler._var_name) # NOTE(dev): `tensor.set(data, self.place)` always call TensorCopySync that is a blocking behavior. So we use `_copy_from` to replace it. 
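With the executor changes above, CUDA Graph capture no longer rules out the standalone executor outright: the executor enables FLAGS_new_executor_use_cuda_graph itself when a compiled program requests capture, and only falls back when FLAGS_CUDA_GRAPH_USE_STANDALONE_EXECUTOR=0. The flag can also be toggled from user code (a sketch):

import paddle

# let the new (standalone) executor own CUDA Graph preparation
paddle.set_flags({"FLAGS_new_executor_use_cuda_graph": True})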
cpu_tensor = _as_lodtensor(data, core.CPUPlace()) - # for ipu, tensor is allocated on cpu - if core.is_compiled_with_ipu(): + if core.is_cuda_graph_capturing(): + warnings.warn( + "Caution!!! When capturing CUDA Graph, the learning rate scheduler would not " + "take any effect! Please set the learning rate manually before each batch!" + ) + elif core.is_compiled_with_ipu(): + # for ipu, tensor is allocated on cpu tensor._copy_from(cpu_tensor, tensor._place()) else: tensor._copy_from(cpu_tensor, self.place) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 41b9b8bbb2deb74882180263e94cb3eb70709dc9..da2fa96c758bae003952b6c2181b35974231c0bd 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -36,6 +36,7 @@ import paddle.version as fluid_version import warnings import functools from .variable_index import _getitem_impl_, _setitem_impl_ +import threading __all__ = [ 'Program', @@ -70,8 +71,42 @@ GRAD_VAR_SUFFIX = core.kGradVarSuffix() ZERO_VAR_SUFFIX = core.kZeroVarSuffix() CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName() +# use thread local to create thread save global variables. +class GlobalThreadLocal(threading.local): + def __init__(self): + """ + init the thread local data. + TODO(xiongkun): how to access another thread local data ? + """ + global _dygraph_tracer_ + self._in_declarative_mode_ = False + self._functional_dygraph_context_manager = None + self._dygraph_tracer_ = _dygraph_tracer_ + self._in_eager_mode_ = True + + def __str__(self): + strings = [] + strings.append( + "_in_declarative_mode_:" + str(self._in_declarative_mode_) + ) + strings.append( + "_functional_dygraph_context_manager:" + + str(self._functional_dygraph_context_manager) + ) + strings.append("_dygraph_tracer_:" + str(self._dygraph_tracer_)) + strings.append("_in_eager_mode_:" + str(self._in_eager_mode_)) + return "\n".join(strings) + + def __setattr__(self, name, val): + if name == '_dygraph_tracer_': + global _dygraph_tracer_ + _dygraph_tracer_ = val + self.__dict__[name] = val + + _dygraph_tracer_ = None -_in_eager_mode_ = True +global_var = GlobalThreadLocal() + _global_expected_place_ = None _current_device = None global_prog_seed = 0 @@ -86,6 +121,9 @@ _enable_standalone_executor_ = os.environ.get( _dy2st_enable_standalone_executor_ = os.environ.get( 'FLAGS_DY2ST_USE_STANDALONE_EXECUTOR', 1 ) +_cuda_graph_enable_standalone_executor_ = os.environ.get( + 'FLAGS_CUDA_GRAPH_USE_STANDALONE_EXECUTOR', 0 +) # Some explanation of our execution system 2022.03 # For now we have 3 kinds of execution system, since we refactored dygraph mode to @@ -152,20 +190,17 @@ def _switch_tensor_bind_type(is_eager): def _enable_legacy_dygraph(): - global _in_eager_mode_ - _in_eager_mode_ = False + global_var._in_eager_mode_ = False _update_monkey_methods(is_eager=False) def _disable_legacy_dygraph(): - global _in_eager_mode_ - _in_eager_mode_ = True + global_var._in_eager_mode_ = True _update_monkey_methods(is_eager=True) def _in_eager_without_dygraph_check(): - global _in_eager_mode_ - return _in_eager_mode_ + return global_var._in_eager_mode_ # FIXME(dev): We haven't fully verified eager mode on XPU/NPU et.al but @@ -174,7 +209,6 @@ _is_first_import_ = True def _fallback_legacy_dygraph(): - global _in_eager_mode_ global _is_first_import_ need_fallback = False # Only enable eager on CPU/GPU/XPU @@ -184,12 +218,12 @@ def _fallback_legacy_dygraph(): or core.is_compiled_with_mlu() ) - if _in_eager_mode_ and is_not_support: + if global_var._in_eager_mode_ and 
is_not_support: # switch into legacy dygraph mode warnings.warn( "We will fallback into legacy dygraph on NPU/XPU/MLU/IPU/ROCM devices. Because we only support new eager dygraph mode on CPU/GPU currently. " ) - _in_eager_mode_ = False + global_var._in_eager_mode_ = False if not _is_first_import_: _enable_legacy_dygraph() need_fallback = True @@ -231,11 +265,13 @@ def in_dygraph_mode(): print(paddle.in_dynamic_mode()) # True, Now we are in dynamic mode """ - return (_dygraph_tracer_ is not None) and _in_eager_mode_ + return ( + global_var._dygraph_tracer_ is not None + ) and global_var._in_eager_mode_ def _non_static_mode(): - return _dygraph_tracer_ is not None + return global_var._dygraph_tracer_ is not None @signature_safe_contextmanager @@ -600,7 +636,7 @@ non_static_only = wrap_decorator(_non_static_only_) def _dygraph_tracer(): - return _dygraph_tracer_ + return global_var._dygraph_tracer_ def _global_flags(): @@ -668,9 +704,8 @@ def _current_expected_place(): def _set_dygraph_tracer_expected_place(place): - global _dygraph_tracer_ - if _dygraph_tracer_ is not None: - _dygraph_tracer_._expected_place = place + if global_var._dygraph_tracer_ is not None: + global_var._dygraph_tracer_._expected_place = place def _set_expected_place(place): @@ -1312,7 +1347,7 @@ def _varbase_creator( if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) - if _in_eager_mode_: + if global_var._in_eager_mode_: eager_tensor = core.eager.Tensor( dtype if dtype else core.VarDesc.VarType.FP32, list(shape) if shape else [], @@ -7457,16 +7492,17 @@ def _get_var(name, program=None): @signature_safe_contextmanager def _dygraph_guard(tracer): - global _dygraph_tracer_ - tmp_tracer = _dygraph_tracer_ - _dygraph_tracer_ = tracer - core._switch_tracer(tracer) + tmp_tracer = global_var._dygraph_tracer_ + global_var._dygraph_tracer_ = tracer + if tracer is not None: + core._switch_tracer(tracer) try: yield finally: - core._switch_tracer(tmp_tracer) - _dygraph_tracer_ = tmp_tracer + if tmp_tracer is not None: + core._switch_tracer(tmp_tracer) + global_var._dygraph_tracer_ = tmp_tracer @signature_safe_contextmanager diff --git a/python/paddle/fluid/lazy_init.py b/python/paddle/fluid/lazy_init.py index 54755c0787947feb040b95c008d141c3f64452b4..1851056f2c2ce4d08adf6ce255231278e531ebce 100644 --- a/python/paddle/fluid/lazy_init.py +++ b/python/paddle/fluid/lazy_init.py @@ -59,8 +59,8 @@ class LazyInitHelper: self.enable() if self._in_guard: return - self._tracer = framework._dygraph_tracer_ - framework._dygraph_tracer_ = None + self._tracer = framework.global_var._dygraph_tracer_ + framework.global_var._dygraph_tracer_ = None self._in_guard = True def __exit__(self, *args, **kwargs): @@ -71,7 +71,7 @@ class LazyInitHelper: if not self._in_guard: return assert self._tracer is not None - framework._dygraph_tracer_ = self._tracer + framework.global_var._dygraph_tracer_ = self._tracer self._tracer = None self._in_guard = False diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py index 8d8046f19aa79f83ccb1f8757a3099b03b7ec61d..833993c621612f28a3acc895b544e9cc084b4486 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py @@ -21,7 +21,6 @@ import numpy as np import paddle import paddle.static as static -from paddle import fluid from paddle.utils.cpp_extension.extension_utils import run_cmd from 
paddle.vision.transforms import Compose, Normalize @@ -146,8 +145,10 @@ def custom_relu_double_grad_dynamic(func, device, dtype, np_x, use_func=True): paddle.set_device(device) t = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=False) + t.retain_grads() out = func(t) if use_func else paddle.nn.functional.relu(t) + out.retain_grads() dx = paddle.grad( outputs=out, inputs=t, @@ -259,7 +260,6 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase): ) def test_dynamic(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) for device in self.devices: for dtype in self.dtypes: if device == 'cpu' and dtype == 'float16': @@ -286,7 +286,6 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase): x_grad, pd_x_grad ), ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_static_save_and_load_inference_model(self): paddle.enable_static() @@ -354,7 +353,6 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase): paddle.disable_static() def test_double_grad_dynamic(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) for device in self.devices: for dtype in self.dtypes: if device == 'cpu' and dtype == 'float16': @@ -380,7 +378,6 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase): dx_grad, pd_dx_grad ), ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_with_dataloader(self): for device in self.devices: diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_xpu_setup.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_xpu_setup.py index 7b251e8063a05e7d4a09238feaf1efef04739fe4..ef0f52d5c3f2dafbc6a480aa2c1497c87b793666 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_xpu_setup.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_xpu_setup.py @@ -30,8 +30,10 @@ def custom_relu_dynamic(func, device, dtype, np_x, use_func=True): t = paddle.to_tensor(np_x, dtype=dtype) t.stop_gradient = False + t.retain_grads() out = func(t) if use_func else paddle.nn.functional.relu(t) + out.retain_grads() out.stop_gradient = False out.backward() @@ -142,14 +144,14 @@ def custom_relu_static_inference(func, device, np_data, np_label, path_prefix): def custom_relu_double_grad_dynamic(func, device, dtype, np_x, use_func=True): - import paddle.fluid as fluid - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) paddle.set_device(device) t = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=False) + t.retain_grads() out = func(t) if use_func else paddle.nn.functional.relu(t) + out.retain_grads() dx = paddle.grad( outputs=out, inputs=t, @@ -164,7 +166,6 @@ def custom_relu_double_grad_dynamic(func, device, dtype, np_x, use_func=True): grad_outputs=paddle.ones_like(t), create_graph=False, ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) assert ddout[0].numpy() is not None return dx[0].numpy(), ddout[0].numpy() diff --git a/python/paddle/fluid/tests/custom_op/test_custom_tanh_double_grad.py b/python/paddle/fluid/tests/custom_op/test_custom_tanh_double_grad.py index bedaf36832f91ee600ea7f789ea0ff6b73366a78..ad139e84a90c4dedd49d28567ed9cb83f62ff7b5 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_tanh_double_grad.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_tanh_double_grad.py @@ -19,7 +19,6 @@ import numpy as np from utils import extra_cc_args, extra_nvcc_args, paddle_includes import paddle -import paddle.fluid as fluid from paddle.utils.cpp_extension import get_build_directory, load from paddle.utils.cpp_extension.extension_utils import run_cmd @@ -41,24 
+40,25 @@ custom_ops = load( def custom_tanh_double_grad_dynamic(func, device, dtype, np_x): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) paddle.set_device(device) t = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=False) + t.retain_grads() out = func(t) out.stop_gradient = False + out.retain_grads() dx = paddle.grad( outputs=[out], inputs=[t], create_graph=True, retain_graph=True ) + dx[0].retain_grads() dx[0].backward() assert out.grad is not None assert dx[0].grad is not None return dx[0].numpy(), dx[0].grad.numpy(), out.grad.numpy() - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) class TestCustomTanhDoubleGradJit(unittest.TestCase): @@ -68,7 +68,6 @@ class TestCustomTanhDoubleGradJit(unittest.TestCase): self.devices = ['cpu'] def test_double_grad_dynamic(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) for device in self.devices: for dtype in self.dtypes: x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) @@ -102,7 +101,6 @@ class TestCustomTanhDoubleGradJit(unittest.TestCase): dout, pd_dout ), ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/custom_runtime/test_custom_op_setup.py b/python/paddle/fluid/tests/custom_runtime/test_custom_op_setup.py index 969fdb2f8a6b2229d77c805fcf977ee6dbcc926a..b347ee139728abf9029a34af605d966285301d27 100644 --- a/python/paddle/fluid/tests/custom_runtime/test_custom_op_setup.py +++ b/python/paddle/fluid/tests/custom_runtime/test_custom_op_setup.py @@ -24,11 +24,11 @@ import numpy as np def custom_relu_dynamic(func, device, dtype, np_x, use_func=True): import paddle - paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) paddle.set_device(device) t = paddle.to_tensor(np_x, dtype=dtype) t.stop_gradient = False + t.retain_grads() sys.stdout.flush() out = func(t) if use_func else paddle.nn.functional.relu(t) @@ -36,7 +36,6 @@ def custom_relu_dynamic(func, device, dtype, np_x, use_func=True): out.backward() - paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) if t.grad is None: return out.numpy(), t.grad else: @@ -105,11 +104,12 @@ def custom_relu_double_grad_dynamic(func, device, dtype, np_x, use_func=True): import paddle paddle.set_device(device) - paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) t = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=False) + t.retain_grads() out = func(t) if use_func else paddle.nn.functional.relu(t) + out.retain_grads() dx = paddle.grad( outputs=out, inputs=t, @@ -125,7 +125,6 @@ def custom_relu_double_grad_dynamic(func, device, dtype, np_x, use_func=True): create_graph=False, ) - paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) assert ddout[0].numpy() is not None return dx[0].numpy(), ddout[0].numpy() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 6d99deb2bfc58019de279f3b41a8c910a1aab5a1..2eea2070befe39a230c4ac77306983abe0be4ca9 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1259,3 +1259,7 @@ set_tests_properties(test_parallel_executor_dry_run PROPERTIES ENVIRONMENT "FLAGS_USE_STANDALONE_EXECUTOR=0") set_tests_properties(test_parallel_executor_drop_scope PROPERTIES ENVIRONMENT "FLAGS_USE_STANDALONE_EXECUTOR=0") + +set_tests_properties( + test_cuda_graph_static_mode + PROPERTIES ENVIRONMENT "FLAGS_CUDA_GRAPH_USE_STANDALONE_EXECUTOR=1") diff --git 
a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single.py b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single.py index dea1d1ee2d9e1f71a175d664864aac4493eefb51..13ebddbd786da06a5b7930b533db864f69d4b937 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single.py @@ -30,8 +30,6 @@ class TestCollectiveAllToAllSingle(unittest.TestCase): paddle.distributed.is_initialized() ), "The distributed environment has been initialized." - paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) - def test_collective_alltoall_single(self): rank = dist.get_rank() size = dist.get_world_size() diff --git a/python/paddle/fluid/tests/unittests/collective/collective_batch_isend_irecv.py b/python/paddle/fluid/tests/unittests/collective/collective_batch_isend_irecv.py index ec3b2ad5e4a4b622c111e857dcda5ce73e97d574..13dbd974a1bca81746fbd2e19b7950740409a0c2 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_batch_isend_irecv.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_batch_isend_irecv.py @@ -23,7 +23,6 @@ import paddle.distributed as dist class TestCollectiveBatchIsendIrecv(unittest.TestCase): def setUp(self): dist.init_parallel_env() - paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) def test_collective_batch_isend_irecv(self): rank = dist.get_rank() diff --git a/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter.py b/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter.py index c2cf243ee02cd7edda79a14a8c578fa825a34790..7017237cacd1e5be5f4e304270c5521233d73962 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter.py @@ -24,7 +24,6 @@ from paddle.distributed.communication.reduce_scatter import _reduce_scatter_base class TestCollectiveReduceScatter(unittest.TestCase): def setUp(self): dist.init_parallel_env() - paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) def test_collective_reduce_scatter_sum(self): rank = dist.get_rank() diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_margin_cross_entropy.py b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_margin_cross_entropy.py index 78e119fa8fb424693e5cc28d16170a535af4541b..97dd4e39395914d83ac559f3335967fb25743395 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_margin_cross_entropy.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_margin_cross_entropy.py @@ -34,7 +34,6 @@ class TestParallelMarginSoftmaxCrossEntropyOp(unittest.TestCase): def setUp(self): strategy = fleet.DistributedStrategy() fleet.init(is_collective=True, strategy=strategy) - paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) def test_parallel_margin_softmax_cross_entropy(self): margin1s = [1.0, 1.0, 1.35] @@ -93,6 +92,7 @@ class TestParallelMarginSoftmaxCrossEntropyOp(unittest.TestCase): norm_weight = paddle.divide(weight, weight_l2) data = paddle.matmul(norm_input, norm_weight) + data.retain_grads() data.stop_gradient = False sta = ( @@ -118,6 +118,7 @@ class TestParallelMarginSoftmaxCrossEntropyOp(unittest.TestCase): group=check_group, ) integral_data = integral_data.detach().clone() + integral_data.retain_grads() integral_data.stop_gradient = False # add arcface margin to logit diff --git 
a/python/paddle/fluid/tests/unittests/composite_ops/test_composite_softmax.py b/python/paddle/fluid/tests/unittests/composite_ops/test_composite_softmax.py index 2ac671962c0389a97e32e15851c12e2478d04ea9..c7c876b8f8fea6c79bfdf91c95e4dc16d7cefb93 100644 --- a/python/paddle/fluid/tests/unittests/composite_ops/test_composite_softmax.py +++ b/python/paddle/fluid/tests/unittests/composite_ops/test_composite_softmax.py @@ -57,8 +57,7 @@ attrs = Attr() def fn(x): - y = paddle.tan(x) - return F.softmax(y, axis=attrs.axis, dtype=attrs.dtype) + return F.softmax(x, axis=attrs.axis, dtype=attrs.dtype) def expect_forward(inputs): @@ -81,8 +80,17 @@ class TestCompositeSoftmax(unittest.TestCase): ) y = fn(x) blocks = main_program.blocks + + fwd_ops = [op.type for op in blocks[0].ops] + # Ensure that softmax is in the original block + self.assertTrue('softmax' in fwd_ops) + paddle.incubate.autograd.to_prim(blocks) + fwd_ops_new = [op.type for op in blocks[0].ops] + # Ensure that softmax is split into small ops + self.assertTrue('softmax' not in fwd_ops_new) + exe = paddle.static.Executor() exe.run(startup_program) res = exe.run(main_program, feed={'x': inputs}, fetch_list=[y]) @@ -97,7 +105,7 @@ class TestCompositeSoftmax(unittest.TestCase): actual = self.cal_composite(np_data)[0] assert expect.dtype == actual.dtype - assert np.allclose( + np.testing.assert_allclose( expect, actual, rtol=attrs.get_rtol("forward"), diff --git a/python/paddle/fluid/tests/unittests/composite_ops/test_composite_softmax_grad.py b/python/paddle/fluid/tests/unittests/composite_ops/test_composite_softmax_grad.py index c47399ba5a983034bc5c1ac91cd7b57577f0ef18..808c5f8324b65a87efa5c46005c553f5f58703fb 100644 --- a/python/paddle/fluid/tests/unittests/composite_ops/test_composite_softmax_grad.py +++ b/python/paddle/fluid/tests/unittests/composite_ops/test_composite_softmax_grad.py @@ -19,6 +19,7 @@ from utils import TOLERANCE import paddle import paddle.nn.functional as F +from paddle.fluid import core def generate_data(shape, dtype="float32"): @@ -57,11 +58,11 @@ attrs = Attr() def fn(x): - y = paddle.tan(x) - return F.softmax(y, axis=attrs.axis, dtype=attrs.dtype) + return F.softmax(x, axis=attrs.axis, dtype=attrs.dtype) def expect_grad(inputs): + paddle.disable_static() inputs.stop_gradient = False res = fn(inputs) @@ -86,8 +87,22 @@ class TestCompositeSoftmax(unittest.TestCase): x.stop_gradient = False y = fn(x) blocks = main_program.blocks + + fwd_ops = [op.type for op in blocks[0].ops] + # Ensure that softmax is in the original block + self.assertTrue('softmax' in fwd_ops) + paddle.incubate.autograd.to_prim(blocks) + + fwd_ops_new = [op.type for op in blocks[0].ops] + # Ensure that softmax is split into small ops + self.assertTrue('softmax' not in fwd_ops_new) + z = paddle.static.gradients([y], x) + fwd_ops_grad = [op.type for op in blocks[0].ops] + # Ensure that softmax_grad is not in the grad block + + self.assertTrue('softmax_grad' not in fwd_ops_grad) exe = paddle.static.Executor() exe.run(startup_program) @@ -103,7 +118,7 @@ class TestCompositeSoftmax(unittest.TestCase): actual = self.cal_composite_grad(np_data)[0] assert expect.dtype == actual.dtype - assert np.allclose( + np.testing.assert_allclose( expect, actual, rtol=attrs.get_rtol("backward"), @@ -120,5 +135,59 @@ class TestCompositeSoftmax(unittest.TestCase): self.compare_backward() +class TestCompositeSoftmaxPrimBackward(unittest.TestCase): + "test composite softmax and prim backward" + + def setUp(self): + core.set_prim_enabled(True) + self.dtypes = ["float32"] + self.shapes = 
[[2, 3, 4], [2, 3]] + self.axes = [-1, 0, 1] + + def cal_composite_grad(self, inputs): + paddle.enable_static() + startup_program = paddle.static.Program() + main_program = paddle.static.Program() + with paddle.static.program_guard(main_program, startup_program): + x = paddle.static.data( + 'x', shape=inputs.shape, dtype=str(inputs.dtype) + ) + x.stop_gradient = False + y = fn(x) + blocks = main_program.blocks + paddle.incubate.autograd.to_prim(blocks) + z = paddle.static.gradients([y], x) + + exe = paddle.static.Executor() + exe.run(startup_program) + res = exe.run(main_program, feed={'x': inputs}, fetch_list=[z]) + paddle.disable_static() + return res + + def compare_backward(self): + np_data = generate_data(attrs.shape) + tensor_data = paddle.to_tensor(np_data) + + expect = expect_grad(tensor_data)[0].numpy() + actual = self.cal_composite_grad(np_data)[0] + + assert expect.dtype == actual.dtype + np.testing.assert_allclose( + expect, + actual, + rtol=attrs.get_rtol("prim_backward"), + atol=attrs.get_rtol("prim_backward"), + ) + + def test_prim_backward(self): + for i in self.axes: + for j in self.dtypes: + for t in self.shapes: + attrs.set_axis(i) + attrs.set_dtype(j) + attrs.set_shape(t) + self.compare_backward() + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/composite_ops/utils.py b/python/paddle/fluid/tests/unittests/composite_ops/utils.py index c43f79d1c053f6b4ce365dd751f3dd034737f836..798da50a1c4367cd30ec58b9b73be1647578e07d 100644 --- a/python/paddle/fluid/tests/unittests/composite_ops/utils.py +++ b/python/paddle/fluid/tests/unittests/composite_ops/utils.py @@ -15,11 +15,13 @@ TOLERANCE = { "float32": { - "forward": {"rtol": 1e-6, "atol": 1e-6}, - "backward": {"rtol": 1e-6, "atol": 1e-6}, - }, - "float64": { "forward": {"rtol": 1e-7, "atol": 1e-7}, "backward": {"rtol": 1e-7, "atol": 1e-7}, + "prim_backward": {"rtol": 1e-6, "atol": 1e-6}, + }, + "float64": { + "forward": {"rtol": 1e-16, "atol": 1e-16}, + "backward": {"rtol": 1e-15, "atol": 1e-15}, + "prim_backward": {"rtol": 1e-15, "atol": 1e-15}, }, } diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_backward_without_params.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_backward_without_params.py new file mode 100644 index 0000000000000000000000000000000000000000..b1924d84db5898cf68f8aea899f1b74094ff1b67 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_backward_without_params.py @@ -0,0 +1,45 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
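# --- Editor's note (illustrative sketch, not part of the patch) ---------------
# The TOLERANCE table above keys rtol/atol by dtype and by phase
# ("forward", "backward", "prim_backward"). The tests read it through
# attrs.get_rtol(...); the Attr class itself is outside this diff, so the
# helper below is a hypothetical stand-in that matches the observed call sites:
def get_rtol(dtype, phase):
    # e.g. get_rtol("float32", "prim_backward") -> 1e-6 with the values above
    return TOLERANCE[dtype][phase]["rtol"]
# ------------------------------------------------------------------------------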
+ +import unittest + +import numpy as np + +import paddle + + +class Net(paddle.nn.Layer): + def __init__(self): + super(Net, self).__init__() + + @paddle.jit.to_static + def forward(self, x): + out = x + 1 + return out + + +class TestBackwardWithoutParams(unittest.TestCase): + def test_run(self): + net = Net() + + x = paddle.ones([2, 2]) + x.stop_gradient = False + out = net(x) + loss = paddle.mean(out) + loss.backward() + np.testing.assert_equal(x.grad.numpy(), np.full(x.shape, 0.25)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py index 499f7285f29aad5eb25435b4ad9401b05f392bbf..03f89bb84fc4293eba5d7b1d9a83f6d1e323cc4a 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py @@ -36,7 +36,7 @@ class TestDy2staticException(unittest.TestCase): with self.assertRaisesRegex(Dygraph2StaticException, self.error): paddle.jit.enable_to_static(True) self.assertTrue(to_static(self.dyfunc)(self.x)) - paddle.fluid.dygraph.base._in_declarative_mode_ = False + paddle.fluid.dygraph.base.global_var._in_declarative_mode_ = False paddle.jit.enable_to_static(False) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cinn_prim.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cinn_prim.py new file mode 100644 index 0000000000000000000000000000000000000000..2811a348f46561423449eb9f646e750c7935e3cc --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cinn_prim.py @@ -0,0 +1,151 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import platform +import unittest + +import numpy as np + +import paddle +import paddle.nn.functional as F +from paddle.fluid import core + + +def apply_to_static(net, use_cinn): + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = use_cinn + return paddle.jit.to_static(net, build_strategy=build_strategy) + + +class PrimeNet(paddle.nn.Layer): + def __init__(self): + super(PrimeNet, self).__init__() + self.fc = paddle.nn.Linear(4, 4) + + def forward(self, x): + y = self.fc(x) + out = F.softmax(y) + return out + + +class TestPrimForward(unittest.TestCase): + """ + This case only tests prim_forward + to_static + cinn. Thus we need to + set this flag as False to avoid prim_backward. 
+ core.set_prim_backward(False) + """ + + def setUp(self): + core.set_prim_backward(False) + paddle.seed(2022) + self.x = paddle.randn([2, 4]) + self.x.stop_gradient = False + + def train(self, use_prim): + paddle.seed(2022) + net = PrimeNet() + sgd = paddle.optimizer.SGD( + learning_rate=0.1, parameters=net.parameters() + ) + if use_prim: + net = apply_to_static(net, use_prim) + + res = [] + for _ in range(10): + out = net(self.x) + loss = paddle.mean(out) + loss.backward() + sgd.step() + sgd.clear_grad() + + res.append(out.numpy()) + + self.check_prim(net, use_prim) + + return res + + def check_prim(self, net, use_prim): + if not use_prim: + return + fwd_ops = [op.type for op in net.forward.main_program.block(0).ops] + # Ensure that softmax is split into small ops + self.assertTrue('softmax' not in fwd_ops) + + def test_cinn_prim_forward(self): + dy_res = self.train(use_prim=False) + cinn_res = self.train(use_prim=True) + + for i in range(len(dy_res)): + np.testing.assert_allclose( + cinn_res[i], dy_res[i], rtol=1e-7, atol=1e-7 + ) + + +class TestPrimForwardAndBackward(unittest.TestCase): + """ + Test PrimeNet with @to_static + prim forward + prim backward + cinn vs. Dygraph + """ + + def setUp(self): + paddle.seed(2022) + self.x = paddle.randn([2, 4]) + self.x.stop_gradient = False + + def train(self, use_prim): + core.set_prim_backward(True) + paddle.seed(2022) + net = PrimeNet() + sgd = paddle.optimizer.SGD( + learning_rate=0.1, parameters=net.parameters() + ) + if use_prim: + net = apply_to_static(net, use_prim) + + res = [] + for _ in range(10): + out = net(self.x) + loss = paddle.mean(out) + loss.backward() + sgd.step() + sgd.clear_grad() + + res.append(out.numpy()) + + self.check_prim(net, use_prim) + + return res + + def check_prim(self, net, use_prim): + if not use_prim: + return + fwd_ops = [op.type for op in net.forward.main_program.block(0).ops] + # Ensure that softmax is split into small ops + self.assertTrue('softmax' not in fwd_ops) + + def test_cinn_prim(self): + plat = platform.system() + if plat == "Linux": + dy_res = self.train(use_prim=False) + cinn_res = self.train(use_prim=True) + + for i in range(len(dy_res)): + np.testing.assert_allclose( + cinn_res[i], dy_res[i], rtol=1e-6, atol=1e-6 + ) + else: + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py index daf6f0a9aca688e81834eb6e2367b7c7e127ba17..42704cfe289b0ff73db1ae43ae684feee655d6e4 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py @@ -377,6 +377,7 @@ class TestTransform(TestTransformBase): if not isinstance(dy_outs, (tuple, list)): dy_outs = (dy_outs,) + self.dygraph_func.eval() st_outs = self.get_static_output() if not isinstance(st_outs, (tuple, list)): st_outs = (st_outs,) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py index 72ed077af3339ec3d8088677aab6f74553d2cf98..d400f15285f8006c31f013d77b8f58cf861942c7 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py @@ -65,7 +65,7 @@ class TestDy2staticException(unittest.TestCase): with self.assertRaisesRegex(Dygraph2StaticException, self.error): 
paddle.jit.enable_to_static(True) self.assertTrue(paddle.jit.to_static(self.dyfunc)(self.x)) - paddle.fluid.dygraph.base._in_declarative_mode_ = False + paddle.fluid.dygraph.base.global_var._in_declarative_mode_ = False paddle.jit.enable_to_static(False) @@ -463,7 +463,7 @@ class TestDy2StIfElseRetInt4(TestDy2StIfElseRetInt1): # that the code block is under @to_static, but in this UT # an exception is thrown during Dy2St, making the `_in_declarative_mode_` # a wrong value. So We need set `_in_declarative_mode_` to False manually. - paddle.fluid.dygraph.base._in_declarative_mode_ = False + paddle.fluid.dygraph.base.global_var._in_declarative_mode_ = False paddle.jit.enable_to_static(False) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py index 40919edbce6d4212f5ddc4dac1710c9f480bc737..b195c7d342a724598bfa175c60442d3bca418048 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py @@ -14,6 +14,7 @@ import math import os +import platform import tempfile import time import unittest @@ -450,5 +451,67 @@ class TestResnet(unittest.TestCase): fluid.set_flags({'FLAGS_use_mkldnn': False}) +class TestResnetPrim(unittest.TestCase): + "test prim forward + prim backward + to_static" + + def setUp(self): + self.resnet_helper = ResNetHelper() + + def train(self, to_static): + paddle.jit.enable_to_static(to_static) + return self.resnet_helper.train(to_static) + + def verify_predict(self): + image = np.random.random([1, 3, 224, 224]).astype('float32') + dy_pre = self.resnet_helper.predict_dygraph(image) + st_pre = self.resnet_helper.predict_static(image) + dy_jit_pre = self.resnet_helper.predict_dygraph_jit(image) + predictor_pre = self.resnet_helper.predict_analysis_inference(image) + np.testing.assert_allclose( + dy_pre, + st_pre, + rtol=1e-05, + err_msg='dy_pre:\n {}\n, st_pre: \n{}.'.format(dy_pre, st_pre), + ) + np.testing.assert_allclose( + dy_jit_pre, + st_pre, + rtol=1e-05, + err_msg='dy_jit_pre:\n {}\n, st_pre: \n{}.'.format( + dy_jit_pre, st_pre + ), + ) + np.testing.assert_allclose( + predictor_pre, + st_pre, + rtol=1e-05, + err_msg='predictor_pre:\n {}\n, st_pre: \n{}.'.format( + predictor_pre, st_pre + ), + ) + + def test_resnet_composite(self): + plat = platform.system() + if plat == "Linux": + print("=================== origin resnet ===================") + core.set_prim_enabled(False) + static_loss = self.train(to_static=True) + print("======= resnet with prim forward and backward =======") + core.set_prim_enabled(True) + core.set_prim_forward("debug") + dygraph_loss = self.train(to_static=True) + np.testing.assert_allclose( + static_loss, + dygraph_loss, + rtol=1e-02, + err_msg='static_loss: {} \n dygraph_loss: {}'.format( + static_loss, dygraph_loss + ), + ) + core.set_prim_enabled(False) + else: + pass + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt index d5541666e9e99763447484d059fade5e72c59149..d456a86aa9d28cdbded3da5b40be6036cf5f9777 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt @@ -232,7 +232,7 @@ if(WITH_GPU AND TENSORRT_FOUND) set_tests_properties(test_mkldnn_conv_elementwise_add_fuse_pass PROPERTIES TIMEOUT 120) 
set_tests_properties(test_mkldnn_depthwise_conv_pass PROPERTIES TIMEOUT 120) - set_tests_properties(test_mkldnn_reshape_transpose_matmul_fuse_pass + set_tests_properties(test_onednn_reshape_transpose_matmul_fuse_pass PROPERTIES TIMEOUT 100) set_tests_properties(test_mkldnn_mish_op PROPERTIES TIMEOUT 300) set_tests_properties(test_mkldnn_conv3d_op PROPERTIES TIMEOUT 300) @@ -240,7 +240,7 @@ if(WITH_GPU AND TENSORRT_FOUND) set_tests_properties(test_conv_act_mkldnn_fuse_pass PROPERTIES TIMEOUT 120) set_tests_properties(test_conv_transpose_eltwiseadd_bn_fuse_pass PROPERTIES TIMEOUT 250) - set_tests_properties(test_mkldnn_matmul_transpose_reshape_fuse_pass + set_tests_properties(test_onednn_matmul_transpose_reshape_fuse_pass PROPERTIES TIMEOUT 100) set_tests_properties(test_conv_transpose_bn_fuse_pass PROPERTIES TIMEOUT 300) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_matmul_transpose_reshape_fuse_pass.py similarity index 70% rename from python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py rename to python/paddle/fluid/tests/unittests/ir/inference/test_onednn_matmul_transpose_reshape_fuse_pass.py index a2d2260683020777431c572d5ed8104a3991afce..85cdfd314a7cf456fc938b8d602fa748490080fc 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_matmul_transpose_reshape_fuse_pass.py @@ -21,7 +21,7 @@ from auto_scan_test import PassAutoScanTest from program_config import OpConfig, ProgramConfig, TensorConfig -class TestMatmulTransposeReshapeMkldnnFusePass(PassAutoScanTest): +class TestOneDNNMatmulTransposeReshapeFusePass(PassAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: attrs = [ program_config.ops[i].attrs for i in range(len(program_config.ops)) @@ -57,42 +57,42 @@ class TestMatmulTransposeReshapeMkldnnFusePass(PassAutoScanTest): shape_x = [batch_size, channel, 32, input_dim] shape_y = [batch_size, channel, input_dim, 16] - if type == "x": + if type == 'x': return np.random.random(shape_x).astype(np.float32) else: return np.random.random(shape_y).astype(np.float32) matmul_op = OpConfig( - type="matmul", - inputs={"X": ["input_data1"], "Y": ["input_data2"]}, - outputs={"Out": ["matmul_output"]}, + type='matmul', + inputs={'X': ['input_data1'], 'Y': ['input_data2']}, + outputs={'Out': ['matmul_output']}, attrs={ - "transpose_X": transpose_X, - "transpose_Y": transpose_Y, - "alpha": alpha, - "fused_reshape_X": [], - "fused_reshape_Y": [], - "fused_transpose_X": [], - "fused_transpose_Y": [], - "fused_reshape_Out": [], - "fused_transpose_Out": [], + 'transpose_X': transpose_X, + 'transpose_Y': transpose_Y, + 'alpha': alpha, + 'fused_reshape_X': [], + 'fused_reshape_Y': [], + 'fused_transpose_X': [], + 'fused_transpose_Y': [], + 'fused_reshape_Out': [], + 'fused_transpose_Out': [], }, ) transpose2_op = OpConfig( - type="transpose2", - inputs={"X": ["matmul_output"]}, + type='transpose2', + inputs={'X': ['matmul_output']}, outputs={ - "Out": ["transpose2_output"], - "XShape": ["transpose2_xshape"], + 'Out': ['transpose2_output'], + 'XShape': ['transpose2_xshape'], }, attrs={'axis': axis}, ) reshape2_op = OpConfig( - type="reshape2", - inputs={"X": ["transpose2_output"]}, - outputs={"Out": ["reshape2_output"], "XShape": ["reshape2_xshape"]}, + type='reshape2', + inputs={'X': 
['transpose2_output']}, + outputs={'Out': ['reshape2_output'], 'XShape': ['reshape2_xshape']}, attrs={'shape': shape}, ) @@ -102,27 +102,27 @@ class TestMatmulTransposeReshapeMkldnnFusePass(PassAutoScanTest): ops=model_net, weights={}, inputs={ - "input_data1": TensorConfig( - data_gen=partial(generate_input, "x") + 'input_data1': TensorConfig( + data_gen=partial(generate_input, 'x') ), - "input_data2": TensorConfig( - data_gen=partial(generate_input, "y") + 'input_data2': TensorConfig( + data_gen=partial(generate_input, 'y') ), }, - outputs=["reshape2_output"], + outputs=['reshape2_output'], ) return program_config def sample_predictor_configs(self, program_config): config = self.create_inference_config(use_mkldnn=True) - yield config, ["matmul"], (1e-5, 1e-5) + yield config, ['matmul'], (1e-5, 1e-5) def test(self): self.run_and_statis( - quant=False, passes=["matmul_transpose_reshape_mkldnn_fuse_pass"] + quant=False, passes=['matmul_transpose_reshape_mkldnn_fuse_pass'] ) -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_reshape_transpose_matmul_fuse_pass.py similarity index 70% rename from python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_fuse_pass.py rename to python/paddle/fluid/tests/unittests/ir/inference/test_onednn_reshape_transpose_matmul_fuse_pass.py index fc4d80060756c75e7b15131e8147047a138cd24d..2f9051fe16b5c34546a1eb35e9b85ab725918d8c 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_reshape_transpose_matmul_fuse_pass.py @@ -20,10 +20,11 @@ import numpy as np from auto_scan_test import PassAutoScanTest from program_config import ProgramConfig, TensorConfig -num = 32 * 64 +class TestOneDNNReshapeTransposeMatmulFusePass(PassAutoScanTest): + def setUp(self): + self.num = 32 * 64 -class TestReshapeTransposeMatmulMkldnnFusePass(PassAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: return True @@ -40,11 +41,11 @@ class TestReshapeTransposeMatmulMkldnnFusePass(PassAutoScanTest): input_dim = draw(st.sampled_from([32, 64])) def generate_input1(attrs): - shape_x = [attrs[3]['batch_size'], attrs[3]['channel'], num] + shape_x = [attrs[3]['batch_size'], attrs[3]['channel'], self.num] return np.random.random(shape_x).astype(np.float32) def generate_input2(attrs): - shape_x = [attrs[3]['batch_size'], attrs[3]['channel'], num] + shape_x = [attrs[3]['batch_size'], attrs[3]['channel'], self.num] input_volume = reduce(lambda x, y: x * y, shape_x) matmul_shape = [i for i in attrs[0]['shape']] if 0 in matmul_shape: @@ -66,7 +67,7 @@ class TestReshapeTransposeMatmulMkldnnFusePass(PassAutoScanTest): matmul_shape[0], matmul_shape[1], matmul_shape[-1], - int(num / matmul_shape[-1]), + int(self.num / matmul_shape[-1]), ] elif attrs[2]['transpose_X']: shape_y = matmul_shape @@ -77,17 +78,17 @@ class TestReshapeTransposeMatmulMkldnnFusePass(PassAutoScanTest): matmul_shape[0], matmul_shape[1], matmul_shape[-1], - int(num / matmul_shape[-1]), + int(self.num / matmul_shape[-1]), ] return np.random.random(shape_y).astype(np.float32) attrs = [ - {"shape": shape}, - {"axis": axis}, + {'shape': shape}, + {'axis': axis}, { - "transpose_X": transpose_X, - "transpose_Y": transpose_Y, - "alpha": alpha, + 'transpose_X': 
transpose_X, + 'transpose_Y': transpose_Y, + 'alpha': alpha, }, { 'batch_size': batch_size, @@ -98,37 +99,37 @@ class TestReshapeTransposeMatmulMkldnnFusePass(PassAutoScanTest): ops_config = [ { - "op_type": "reshape2", - "op_inputs": {"X": ["input_data1"]}, - "op_outputs": { - "Out": ["reshape2_output"], - "XShape": ["reshape2_xshape"], + 'op_type': 'reshape2', + 'op_inputs': {'X': ['input_data1']}, + 'op_outputs': { + 'Out': ['reshape2_output'], + 'XShape': ['reshape2_xshape'], }, - "op_attrs": {'shape': attrs[0]['shape']}, + 'op_attrs': {'shape': attrs[0]['shape']}, }, { - "op_type": "transpose2", - "op_inputs": {"X": ["reshape2_output"]}, - "op_outputs": { - "Out": ["transpose2_output"], - "XShape": ["transpose2_xshape"], + 'op_type': 'transpose2', + 'op_inputs': {'X': ['reshape2_output']}, + 'op_outputs': { + 'Out': ['transpose2_output'], + 'XShape': ['transpose2_xshape'], }, - "op_attrs": {'axis': attrs[1]['axis']}, + 'op_attrs': {'axis': attrs[1]['axis']}, }, { - "op_type": "matmul", - "op_inputs": {"X": ["transpose2_output"], "Y": ["input_data2"]}, - "op_outputs": {"Out": ["matmul_output"]}, - "op_attrs": { + 'op_type': 'matmul', + 'op_inputs': {'X': ['transpose2_output'], 'Y': ['input_data2']}, + 'op_outputs': {'Out': ['matmul_output']}, + 'op_attrs': { 'transpose_X': attrs[2]['transpose_X'], 'transpose_Y': attrs[2]['transpose_Y'], 'alpha': attrs[2]['alpha'], - "fused_reshape_X": [], - "fused_reshape_Y": [], - "fused_transpose_X": [], - "fused_transpose_Y": [], - "fused_reshape_Out": [], - "fused_transpose_Out": [], + 'fused_reshape_X': [], + 'fused_reshape_Y': [], + 'fused_transpose_X': [], + 'fused_transpose_Y': [], + 'fused_reshape_Out': [], + 'fused_transpose_Out': [], }, }, ] @@ -139,27 +140,27 @@ class TestReshapeTransposeMatmulMkldnnFusePass(PassAutoScanTest): ops=ops, weights={}, inputs={ - "input_data1": TensorConfig( + 'input_data1': TensorConfig( data_gen=partial(generate_input1, attrs) ), - "input_data2": TensorConfig( + 'input_data2': TensorConfig( data_gen=partial(generate_input2, attrs) ), }, - outputs=["matmul_output"], + outputs=['matmul_output'], ) return program_config def sample_predictor_configs(self, program_config): config = self.create_inference_config(use_mkldnn=True) - yield config, ["matmul"], (1e-5, 1e-5) + yield config, ['matmul'], (1e-5, 1e-5) def test(self): self.run_and_statis( - quant=False, passes=["reshape_transpose_matmul_mkldnn_fuse_pass"] + quant=False, passes=['reshape_transpose_matmul_mkldnn_fuse_pass'] ) -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_run_program_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_run_program_op_npu.py index 94faec4d530a63b874e53c16904699787357b067..69769bbdc1f08e36b0b82a931f15f277c667c209 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_run_program_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_run_program_op_npu.py @@ -25,7 +25,7 @@ from paddle import _C_ops, _legacy_C_ops import paddle.fluid as fluid from paddle.fluid import core, framework, executor from paddle.fluid.layers.utils import _hash_with_id -from paddle.fluid.framework import _in_eager_mode_ +from paddle.fluid.framework import global_var paddle.enable_static() np.random.seed(1243) @@ -135,7 +135,7 @@ class RunProgramNPUOpTest(unittest.TestCase): def prepare_dygraph_input(self, place, return_param_list=False): def create_var_base(is_input, name, np_value, stop_gradient): - if _in_eager_mode_: + if global_var._in_eager_mode_: var = 
core.eager.Tensor( value=np_value, name=name, place=place, zero_copy=True ) @@ -176,7 +176,7 @@ class RunProgramNPUOpTest(unittest.TestCase): for name in self.output_names['Out']: outputs['Out'].append(create_var_base(False, name)) - if _in_eager_mode_: + if global_var._in_eager_mode_: outputs['OutScope'] = [core.Scope()] else: outputs['OutScope'] = framework._varbase_creator( diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/eager/CMakeLists.txt b/python/paddle/fluid/tests/unittests/prim/prim/vjp/eager/CMakeLists.txt index 7d5fc1006d1e8ef175256a2a196482c199b98c48..863a484c466f189d9ae31f4f3a0c9b7cb84373ec 100644 --- a/python/paddle/fluid/tests/unittests/prim/prim/vjp/eager/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/eager/CMakeLists.txt @@ -8,10 +8,3 @@ set(GC_ENVS FLAGS_eager_delete_tensor_gb=0.0) foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${GC_ENVS}) endforeach() - -set_tests_properties(test_comp_eager_tanh_grad PROPERTIES TIMEOUT 60) -set_tests_properties(test_comp_eager_div_grad PROPERTIES TIMEOUT 60) -set_tests_properties(test_comp_eager_sum_grad PROPERTIES TIMEOUT 60) -set_tests_properties(test_comp_eager_add_grad PROPERTIES TIMEOUT 60) -set_tests_properties(test_comp_eager_sub_grad PROPERTIES TIMEOUT 60) -set_tests_properties(test_comp_eager_sqrt_grad PROPERTIES TIMEOUT 60) diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/eager/test_comp_eager_exp_grad.py b/python/paddle/fluid/tests/unittests/prim/prim/vjp/eager/test_comp_eager_exp_grad.py new file mode 100644 index 0000000000000000000000000000000000000000..e81314ba041ef7f91f0bdb4f1c266d4bcc92bb72 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/eager/test_comp_eager_exp_grad.py @@ -0,0 +1,77 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
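# --- Editor's note (illustrative sketch, not part of the patch) ---------------
# The test below compares paddle's composite gradient of exp against autograd's
# VJP. The identity it relies on: d/dx exp(x) = exp(x), so the VJP is simply
# cotangent * exp(primal). A standalone NumPy restatement (variable names are
# hypothetical):
import numpy as np

_primal = np.random.rand(3, 3).astype(np.float32)
_cotangent = np.random.rand(3, 3).astype(np.float32)
_expected_vjp = _cotangent * np.exp(_primal)  # what both libraries should agree on
# ------------------------------------------------------------------------------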
+ +import unittest + +import autograd +import autograd.numpy +import numpy as np +import parameterized as param + +import paddle +from paddle.fluid import core + + +@param.parameterized_class( + ('primal', 'cotangent', 'dtype'), + [ + (np.random.rand(10, 10), np.random.rand(10, 10), np.float32), + ], +) +class TestExpGradComp(unittest.TestCase): + @classmethod + def setUpClass(cls): + core.set_prim_enabled(True) + cls.primal = cls.primal.astype(cls.dtype) + if cls.cotangent is not None: + cls.cotangent = cls.cotangent.astype(cls.dtype) + + @classmethod + def tearDownClass(cls): + core.set_prim_enabled(False) + + def test_exp_grad_comp(self): + def actual(primal, cotangent): + primal = paddle.to_tensor(primal) + primal.stop_gradient = False + return paddle.grad( + paddle.exp(primal), primal, paddle.to_tensor(cotangent) + )[0] + + def desired(primal, cotangent): + cotangent = ( + np.ones_like(cotangent, dtype=primal.dtype) + if cotangent is None + else cotangent + ) + return autograd.make_vjp(autograd.numpy.exp)(primal)[0](cotangent) + + np.testing.assert_allclose( + actual=actual(self.primal, self.cotangent), + desired=desired(self.primal, self.cotangent), + rtol=1e-6, + atol=0, + ) + + def test_stop_gradients(self): + with self.assertRaises(ValueError): + primal = paddle.to_tensor(self.primal) + primal.stop_gradient = True + return paddle.grad( + paddle.exp(primal), primal, paddle.to_tensor(self.cotangent) + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/eager/test_comp_eager_expand_grad.py b/python/paddle/fluid/tests/unittests/prim/prim/vjp/eager/test_comp_eager_expand_grad.py new file mode 100644 index 0000000000000000000000000000000000000000..c4de565dc504f73c9e505d091c4e05ec06798d57 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/eager/test_comp_eager_expand_grad.py @@ -0,0 +1,93 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
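# --- Editor's note (illustrative sketch, not part of the patch) ---------------
# The expand-grad cases below ('same_shape', 'same_rank', 'diff_rank') all hinge
# on one rule: the gradient of a broadcast/expand sum-reduces the cotangent back
# to the input shape. A hedged NumPy sketch of that reduction (the helper name
# expand_vjp is hypothetical, not the composite rule's actual implementation):
import numpy as np

def expand_vjp(cotangent, primal_shape):
    grad = cotangent
    # Sum away leading axes added by rank promotion ('diff_rank' cases).
    while grad.ndim > len(primal_shape):
        grad = grad.sum(axis=0)
    # Sum over axes that were broadcast from size 1 ('same_rank' cases).
    for axis, size in enumerate(primal_shape):
        if size == 1 and grad.shape[axis] != 1:
            grad = grad.sum(axis=axis, keepdims=True)
    return grad
# ------------------------------------------------------------------------------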
+ +import unittest + +import numpy as np +import parameterized as param + +import paddle +from paddle.fluid import core + + +@param.parameterized_class( + ('name', 'primal', 'cotangent', 'shape', 'dtype'), + ( + ( + 'same_shape', + np.random.rand(10, 10), + np.random.rand(10, 10), + (10, 10), + np.float32, + ), + ( + 'same_rank', + np.random.rand(1, 10), + np.random.rand(10, 10), + (10, 10), + np.float32, + ), + ( + 'same_rank', + np.random.rand(10, 1, 10, 1), + np.random.rand(10, 10, 10, 10), + (10, 10, 10, 10), + np.float32, + ), + ( + 'diff_rank', + np.random.rand(1, 10, 1), + np.random.rand(10, 10, 10, 10), + (10, 10, 10, 10), + np.float32, + ), + ), +) +class TestExpandGradComp(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.primal = cls.primal.astype(cls.dtype) + cls.cotangent = cls.cotangent.astype(cls.dtype) + + @classmethod + def tearDownClass(cls): + core.set_prim_enabled(False) + + def test_comp(self): + def func(primal, cotangent, shape): + primal = paddle.to_tensor(primal) + primal.stop_gradient = False + cotangent = paddle.to_tensor(cotangent) + return paddle.grad(paddle.expand(primal, shape), primal, cotangent)[ + 0 + ] + + def actual(primal, cotangent, shape): + core.set_prim_enabled(True) + return func(primal, cotangent, shape) + + def desired(primal, cotangent, shape): + core.set_prim_enabled(False) + return func(primal, cotangent, shape) + + np.testing.assert_allclose( + actual=actual(self.primal, self.cotangent, self.shape), + desired=desired(self.primal, self.cotangent, self.shape), + rtol=1e-6, + atol=0, + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/eager/test_comp_eager_multiply_grad.py b/python/paddle/fluid/tests/unittests/prim/prim/vjp/eager/test_comp_eager_multiply_grad.py new file mode 100644 index 0000000000000000000000000000000000000000..59daf91ab8b84b391e971ae6c28b75ea7e05b89f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/eager/test_comp_eager_multiply_grad.py @@ -0,0 +1,100 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
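# --- Editor's note (illustrative sketch, not part of the patch) ---------------
# The multiply-grad cases below follow from the product rule plus the same
# broadcast reduction sketched for expand above:
#   dx = reduce_to_shape(cotangent * y, x.shape)
#   dy = reduce_to_shape(cotangent * x, y.shape)
# Expressed with the hypothetical expand_vjp helper from the previous note
# (so this snippet is not self-contained on its own):
def multiply_vjp(cotangent, x, y):
    # Each partial is the cotangent scaled by the other operand, then
    # sum-reduced back to that operand's shape when broadcasting occurred.
    return (
        expand_vjp(cotangent * y, x.shape),
        expand_vjp(cotangent * x, y.shape),
    )
# ------------------------------------------------------------------------------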
+ +import unittest + +import numpy as np +import parameterized as param + +import paddle +from paddle.fluid import core + + +@param.parameterized_class( + ('name', 'primals', 'stop_gradients', 'cotangents', 'dtype'), + ( + ( + 'test_normal_case', + (np.random.rand(2, 3, 4), np.random.rand(2, 3, 4)), + (False, False), + (np.random.rand(2, 3, 4),), + np.float32, + ), + ( + 'test_broadcast_diff_rank', + (np.random.rand(2, 3, 1, 4), np.random.rand(3, 3, 4)), + (False, False), + (np.random.rand(2, 3, 3, 4),), + np.float32, + ), + ( + 'test_broadcast_same_rank', + (np.random.rand(2, 3, 1, 4), np.random.rand(2, 1, 3, 4)), + (False, False), + (np.random.rand(2, 3, 3, 4),), + np.float32, + ), + ( + 'test_stop_gradient', + (np.random.rand(2, 3, 1, 4), np.random.rand(2, 1, 3, 4)), + (False, True), + (np.random.rand(2, 3, 3, 4),), + np.float32, + ), + ( + 'test_reduce_axe_empty', + (np.random.rand(2, 3, 3, 4), np.random.rand(2, 1, 3, 4)), + (False, False), + (np.random.rand(2, 3, 3, 4),), + np.float32, + ), + ), +) +class TestMultiplyGradComp(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.primals = tuple(primal.astype(cls.dtype) for primal in cls.primals) + cls.cotangents = tuple(co.astype(cls.dtype) for co in cls.cotangents) + + def as_tuple(self, x): + return (x,) if isinstance(x, paddle.Tensor) else x + + def vjp(self): + primals, cotangents = self.primals, self.cotangents + primals = tuple(paddle.to_tensor(primal) for primal in primals) + for primal, flag in zip(primals, self.stop_gradients): + primal.stop_gradient = flag + cotangents = tuple(paddle.to_tensor(co) for co in cotangents) + out = self.as_tuple(paddle.multiply(*primals)) + grads = paddle.grad(out, primals, cotangents, allow_unused=True) + return [g for g in grads if g is not None] + + def test_comp(self): + core.set_prim_enabled(True) + actual = self.vjp() + + core.set_prim_enabled(False) + desired = self.vjp() + + for i, j in zip(actual, desired): + np.testing.assert_allclose( + i, + j, + rtol=1e-6, + atol=0, + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_add_grad.py b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_add_grad.py index 9c392663be093e966f04783172e4698a73073a8c..b7d7969d9aa0469d98e8d460c25bc17058235648 100644 --- a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_add_grad.py +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_add_grad.py @@ -21,6 +21,23 @@ import paddle from paddle.fluid import core +def apply_to_static(net, use_cinn): + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = use_cinn + return paddle.jit.to_static(net, build_strategy=build_strategy) + + +class PrimeNet(paddle.nn.Layer): + def __init__(self): + super(PrimeNet, self).__init__() + self.fc = paddle.nn.Linear(4, 4) + + def forward(self, x, y): + tmp = self.fc(x) + out = paddle.add(tmp, y) + return out + + @param.parameterized_class( ('primal0', 'primal1', 'dtype'), [ @@ -51,17 +68,39 @@ from paddle.fluid import core ), ], ) -class TestDivGradComp(unittest.TestCase): +class TestAddGradComp(unittest.TestCase): @classmethod def setUpClass(cls): cls.primal0 = cls.primal0.astype(cls.dtype) cls.primal1 = cls.primal1.astype(cls.dtype) - def setUp(self): - paddle.enable_static() + def train(self, use_prim, use_cinn): + paddle.seed(2022) + self.x = paddle.randn([2, 4]) + self.y = paddle.randn([2, 4]) + self.x.stop_gradient = False + self.y.stop_gradient = False + net 
= PrimeNet() + core.set_prim_enabled(use_prim) + net = apply_to_static(net, use_cinn) + out = net(self.x, self.y) + res = paddle.autograd.grad(out, [self.x, self.y]) + + return res - def tearDown(self): + def test_cinn(self): paddle.disable_static() + dy_res = self.train(use_prim=False, use_cinn=False) + comp_st_cinn_res = self.train(use_prim=True, use_cinn=False) + + for i in range(len(dy_res)): + np.testing.assert_allclose( + comp_st_cinn_res[i].numpy(), + dy_res[i].numpy(), + rtol=1e-7, + atol=1e-7, + ) + paddle.enable_static() def test_tanh_grad_comp(self): def actual(primal0, primal1): @@ -73,8 +112,7 @@ class TestDivGradComp(unittest.TestCase): x.stop_gradient = False y.stop_gradient = False z = paddle.add(x, y) - out = paddle.tanh(z) - res = paddle.static.gradients([out], [x, y]) + res = paddle.static.gradients([z], [x, y]) exe = paddle.static.Executor() exe.run(sp) out = exe.run( @@ -100,8 +138,7 @@ class TestDivGradComp(unittest.TestCase): x.stop_gradient = False y.stop_gradient = False z = paddle.add(x, y) - out = paddle.tanh(z) - res = paddle.static.gradients([out], [x, y]) + res = paddle.static.gradients([z], [x, y]) exe = paddle.static.Executor() exe.run(sp) out = exe.run( diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_add_tanh_grad.py b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_add_tanh_grad.py index 0325768917e43099cc8848fbe8a726e6630b0359..45cae351a73ebb98d93e526efc26e148a96ef764 100644 --- a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_add_tanh_grad.py +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_add_tanh_grad.py @@ -21,6 +21,24 @@ import paddle from paddle.fluid import core +def apply_to_static(net, use_cinn): + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = use_cinn + return paddle.jit.to_static(net, build_strategy=build_strategy) + + +class PrimeNet(paddle.nn.Layer): + def __init__(self): + super(PrimeNet, self).__init__() + self.fc = paddle.nn.Linear(4, 4) + + def forward(self, x, y): + tmp = self.fc(x) + out = paddle.add(tmp, y) + res = paddle.tanh(out) + return res + + @param.parameterized_class( ('primal0', 'primal1', 'dtype'), [ @@ -57,13 +75,37 @@ class TestDivGradComp(unittest.TestCase): cls.primal0 = cls.primal0.astype(cls.dtype) cls.primal1 = cls.primal1.astype(cls.dtype) - def setUp(self): - paddle.enable_static() + def train(self, use_prim, use_cinn): + paddle.seed(2022) + self.x = paddle.randn([2, 4]) + self.y = paddle.randn([2, 4]) + self.x.stop_gradient = False + self.y.stop_gradient = False + net = PrimeNet() + core.set_prim_enabled(use_prim) + net = apply_to_static(net, use_cinn) + out = net(self.x, self.y) + res = paddle.autograd.grad(out, [self.x, self.y]) + + return res - def tearDown(self): + def test_cinn(self): paddle.disable_static() + dy_res = self.train(use_prim=False, use_cinn=False) + comp_st_cinn_res = self.train(use_prim=True, use_cinn=False) + + for i in range(len(dy_res)): + np.testing.assert_allclose( + comp_st_cinn_res[i].numpy(), + dy_res[i].numpy(), + rtol=1e-7, + atol=1e-7, + ) + paddle.enable_static() def test_tanh_grad_comp(self): + paddle.enable_static() + def actual(primal0, primal1): core.set_prim_enabled(True) mp, sp = paddle.static.Program(), paddle.static.Program() @@ -73,7 +115,8 @@ class TestDivGradComp(unittest.TestCase): x.stop_gradient = False y.stop_gradient = False z = paddle.add(x, y) - res = paddle.static.gradients([z], [x, y]) + out = paddle.tanh(z) + res = 
paddle.static.gradients([out], [x, y]) exe = paddle.static.Executor() exe.run(sp) out = exe.run( @@ -99,7 +142,8 @@ class TestDivGradComp(unittest.TestCase): x.stop_gradient = False y.stop_gradient = False z = paddle.add(x, y) - res = paddle.static.gradients([z], [x, y]) + out = paddle.tanh(z) + res = paddle.static.gradients([out], [x, y]) exe = paddle.static.Executor() exe.run(sp) out = exe.run( @@ -129,6 +173,7 @@ class TestDivGradComp(unittest.TestCase): atol=0, ) core.set_prim_enabled(False) + paddle.disable_static() if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_div_grad.py b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_div_grad.py index fde1f4549d6934d5fdc4e8a41c0bdb65e7a5641e..1d675e8bd097968ed660f52de0c1f658803837c6 100644 --- a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_div_grad.py +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_div_grad.py @@ -21,6 +21,23 @@ import paddle from paddle.fluid import core +def apply_to_static(net, use_cinn): + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = use_cinn + return paddle.jit.to_static(net, build_strategy=build_strategy) + + +class PrimeNet(paddle.nn.Layer): + def __init__(self): + super(PrimeNet, self).__init__() + self.fc = paddle.nn.Linear(4, 4) + + def forward(self, x, y): + tmp = self.fc(x) + out = paddle.divide(tmp, y) + return out + + @param.parameterized_class( ('primal0', 'primal1', 'dtype'), [ @@ -57,11 +74,33 @@ class TestDivGradComp(unittest.TestCase): cls.primal0 = cls.primal0.astype(cls.dtype) cls.primal1 = cls.primal1.astype(cls.dtype) - def setUp(self): - paddle.enable_static() + def train(self, use_prim, use_cinn): + paddle.seed(2022) + self.x = paddle.randn([2, 4]) + self.y = paddle.randn([2, 4]) + self.x.stop_gradient = False + self.y.stop_gradient = False + net = PrimeNet() + core.set_prim_enabled(use_prim) + net = apply_to_static(net, use_cinn) + out = net(self.x, self.y) + res = paddle.autograd.grad(out, [self.x, self.y]) + + return res - def tearDown(self): + def test_cinn(self): paddle.disable_static() + dy_res = self.train(use_prim=False, use_cinn=False) + comp_st_cinn_res = self.train(use_prim=True, use_cinn=False) + + for i in range(len(dy_res)): + np.testing.assert_allclose( + comp_st_cinn_res[i].numpy(), + dy_res[i].numpy(), + rtol=1e-6, + atol=1e-6, + ) + paddle.enable_static() def test_tanh_grad_comp(self): def actual(primal0, primal1): diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_exp_grad.py b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_exp_grad.py new file mode 100644 index 0000000000000000000000000000000000000000..c1c76631232c007f4a2a81cb2227035910bb57d8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_exp_grad.py @@ -0,0 +1,122 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import autograd +import autograd.numpy +import numpy as np +import parameterized as param + +import paddle +from paddle.fluid import core + + +@param.parameterized_class( + ('primal', 'cotangent', 'dtype'), + [ + (np.random.rand(10, 10), np.random.rand(10, 10), np.float32), + (np.random.rand(10, 10), None, np.float32), + ], +) +class TestExpGradComp(unittest.TestCase): + @classmethod + def setUpClass(cls): + core.set_prim_enabled(True) + cls.primal = cls.primal.astype(cls.dtype) + if cls.cotangent is not None: + cls.cotangent = cls.cotangent.astype(cls.dtype) + + @classmethod + def tearDownClass(cls): + core.set_prim_enabled(False) + + def setUp(self): + paddle.enable_static() + + def tearDown(self): + paddle.disable_static() + + def test_exp_grad_comp(self): + def actual(primal, cotangent): + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x = paddle.static.data('primal', primal.shape, primal.dtype) + x.stop_gradient = False + v = ( + None + if cotangent is None + else paddle.static.data( + 'cotangent', cotangent.shape, cotangent.dtype + ) + ) + y = paddle.exp(x) + x_cotangent = paddle.static.gradients(y, x, v) + exe = paddle.static.Executor() + exe.run(sp) + return exe.run( + program=mp, + feed={'primal': primal, 'cotangent': cotangent}, + fetch_list=x_cotangent, + )[0] + + def desired(primal, cotangent): + cotangent = ( + np.ones_like(cotangent, dtype=primal.dtype) + if cotangent is None + else cotangent + ) + return autograd.make_vjp(autograd.numpy.exp)(primal)[0](cotangent) + + np.testing.assert_allclose( + actual=actual(self.primal, self.cotangent), + desired=desired(self.primal, self.cotangent), + rtol=1e-6, + atol=0, + ) + + def test_stop_gradient(self): + def actual(primal, cotangent): + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x = paddle.static.data('primal', primal.shape, primal.dtype) + x.stop_gradient = True + v = ( + None + if cotangent is None + else paddle.static.data( + 'cotangent', cotangent.shape, cotangent.dtype + ) + ) + y = paddle.exp(x) + x_cotangent = paddle.static.gradients(y, x, v) + exe = paddle.static.Executor() + exe.run(sp) + return exe.run( + program=mp, + feed={'primal': primal, 'cotangent': cotangent}, + fetch_list=x_cotangent, + ) + + def desired(primal, cotangent): + return [] + + self.assertEqual( + actual(self.primal, self.cotangent), + desired(self.primal, self.cotangent), + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_expand_grad.py b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_expand_grad.py new file mode 100644 index 0000000000000000000000000000000000000000..c322074d34d88715aa9aec84ca5ca6e05c88aba8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_expand_grad.py @@ -0,0 +1,112 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import parameterized as param + +import paddle +from paddle.fluid import core + + +@param.parameterized_class( + ('name', 'primal', 'cotangent', 'shape', 'dtype'), + ( + ( + 'same_shape', + np.random.rand(10, 10), + np.random.rand(10, 10), + (10, 10), + np.float32, + ), + ( + 'same_rank', + np.random.rand(1, 10), + np.random.rand(10, 10), + (10, 10), + np.float32, + ), + ( + 'same_rank', + np.random.rand(10, 1, 10, 1), + np.random.rand(10, 10, 10, 10), + (10, 10, 10, 10), + np.float32, + ), + ( + 'diff_rank', + np.random.rand(1, 10, 1), + np.random.rand(10, 10, 10, 10), + (10, 10, 10, 10), + np.float32, + ), + ( + 'single_direction_broadcast', + np.random.rand(10, 10, 10, 10), + np.random.rand(1, 10, 1), + (10, 10, 10, 10), + np.float32, + ), + ), +) +class TestExpandGradComp(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.primal = cls.primal.astype(cls.dtype) + cls.cotangent = cls.cotangent.astype(cls.dtype) + paddle.enable_static() + + @classmethod + def tearDownClass(cls): + paddle.disable_static() + core.set_prim_enabled(False) + + def test_comp(self): + def func(primal, cotangent, shape): + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x = paddle.static.data('primal', primal.shape, primal.dtype) + x.stop_gradient = False + v = paddle.static.data( + 'cotangent', cotangent.shape, cotangent.dtype + ) + y = paddle.expand(x, shape) + x_cotangent = paddle.static.gradients(y, x) + exe = paddle.static.Executor() + exe.run(sp) + return exe.run( + program=mp, + feed={'primal': primal, 'cotangent': cotangent}, + fetch_list=x_cotangent, + )[0] + + def actual(primal, cotangent, shape): + core.set_prim_enabled(True) + return func(primal, cotangent, shape) + + def desired(primal, cotangent, shape): + core.set_prim_enabled(False) + return func(primal, cotangent, shape) + + np.testing.assert_allclose( + actual=actual(self.primal, self.cotangent, self.shape), + desired=desired(self.primal, self.cotangent, self.shape), + rtol=1e-6, + atol=0, + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_multiply_grad.py b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_multiply_grad.py new file mode 100644 index 0000000000000000000000000000000000000000..63e8a4f1bbf3451bed5c9402a40ffa13a0bbd319 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_multiply_grad.py @@ -0,0 +1,128 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
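A note for readers tracing these composite-rule tests: every broadcast case in this patch (the multiply cases below, the expand cases above) ultimately reduces the output cotangent back to the shape of the corresponding primal. A minimal NumPy sketch of that reduction, standalone and illustrative only — `reduce_to` and `multiply_vjp` are hypothetical helper names, not part of this patch:

```python
import numpy as np

def reduce_to(grad, shape):
    """Reduce a broadcast gradient back to the primal's `shape`."""
    # Sum out the leading axes introduced by rank promotion.
    extra = grad.ndim - len(shape)
    grad = grad.sum(axis=tuple(range(extra)))
    # Sum (keeping dims) over axes where the primal had size 1 but was broadcast.
    axes = tuple(i for i, n in enumerate(shape) if n == 1 and grad.shape[i] != 1)
    return grad.sum(axis=axes, keepdims=True)

def multiply_vjp(x, y, v):
    # VJP of z = x * y: grad_x = v * y and grad_y = v * x,
    # each reduced back to its primal's shape.
    return reduce_to(v * y, x.shape), reduce_to(v * x, y.shape)

# Shapes taken from the 'test_broadcast_diff_rank' case.
x, y = np.random.rand(2, 3, 1, 4), np.random.rand(3, 3, 4)
v = np.random.rand(2, 3, 3, 4)
gx, gy = multiply_vjp(x, y, v)
assert gx.shape == x.shape and gy.shape == y.shape
```

The `test_reduce_axes_empty` case exercises the degenerate path where one operand's gradient needs no reduction at all and the cotangent must pass through unchanged.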
+ +import unittest + +import numpy as np +import parameterized as param + +import paddle +from paddle.fluid import core, framework + + +@param.parameterized_class( + ('name', 'primals', 'stop_gradients', 'cotangents', 'dtype'), + ( + ( + 'test_normal_case', + (np.random.rand(2, 3, 4), np.random.rand(2, 3, 4)), + (False, False), + (np.random.rand(2, 3, 4),), + np.float32, + ), + ( + 'test_broadcast_diff_rank', + (np.random.rand(2, 3, 1, 4), np.random.rand(3, 3, 4)), + (False, False), + (np.random.rand(2, 3, 3, 4),), + np.float32, + ), + ( + 'test_broadcast_same_rank', + (np.random.rand(2, 3, 1, 4), np.random.rand(2, 1, 3, 4)), + (False, False), + (np.random.rand(2, 3, 3, 4),), + np.float32, + ), + ( + 'test_stop_gradient', + (np.random.rand(2, 3, 1, 4), np.random.rand(2, 1, 3, 4)), + (False, True), + (np.random.rand(2, 3, 3, 4),), + np.float32, + ), + ( + 'test_reduce_axes_empty', + (np.random.rand(2, 3, 3, 4), np.random.rand(2, 1, 3, 4)), + (False, False), + (np.random.rand(2, 1, 3, 1),), + np.float32, + ), + ), +) +class TestMultiplyGradComp(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.primals = tuple(primal.astype(cls.dtype) for primal in cls.primals) + cls.cotangents = tuple(co.astype(cls.dtype) for co in cls.cotangents) + + def setUp(self): + paddle.enable_static() + + def tearDown(self): + paddle.disable_static() + + def as_tuple(self, x): + return (x,) if isinstance(x, framework.Variable) else x + + def vjp(self): + primals, cotangents = self.primals, self.cotangents + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + primals = tuple( + paddle.static.data(f'primal{i}', primal.shape, primal.dtype) + for i, primal in enumerate(primals) + ) + for primal, flag in zip(primals, self.stop_gradients): + primal.stop_gradient = flag + cotangents = tuple( + paddle.static.data(f'cotangent{i}', co.shape, co.dtype) + for i, co in enumerate(cotangents) + ) + out = self.as_tuple(paddle.multiply(*primals)) + grads = paddle.static.gradients(out, primals) + exe = paddle.static.Executor() + exe.run(sp) + return exe.run( + program=mp, + feed={ + **{ + f'primal{i}': primal + for i, primal in enumerate(self.primals) + }, + **{f'cotangent{i}': co for i, co in enumerate(self.cotangents)}, + }, + fetch_list=[g for g in grads if g is not None], + ) + + def test_comp(self): + + core.set_prim_enabled(True) + actual = self.vjp() + + core.set_prim_enabled(False) + desired = self.vjp() + + self.assertEqual(len(actual), len(desired)) + for i, j in zip(actual, desired): + np.testing.assert_allclose( + i, + j, + rtol=1e-6, + atol=0, + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_sqrt_grad.py b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_sqrt_grad.py index 2eae9c86e25fba10f97d5e64ba3e4098abb2a671..505a4391138e95adb376924499cb95bc43fcb5cb 100644 --- a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_sqrt_grad.py +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_sqrt_grad.py @@ -26,6 +26,23 @@ import parameterized as param import paddle + +def apply_to_static(net, use_cinn): + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = use_cinn + return paddle.jit.to_static(net, build_strategy=build_strategy) + + +class PrimeNet(paddle.nn.Layer): + def __init__(self): + super(PrimeNet, self).__init__() + self.fc = paddle.nn.Linear(4, 4) + + def forward(self, x): + tmp = self.fc(x) +
out = paddle.sqrt(tmp) + return out + + @param.parameterized_class( ('primal', 'cotangent', 'dtype'), [ @@ -38,11 +55,31 @@ class TestSqrtGradComp(unittest.TestCase): cls.primal = cls.primal.astype(cls.dtype) cls.cotangent = cls.cotangent.astype(cls.dtype) - def setUp(self): - paddle.enable_static() + def train(self, use_prim, use_cinn): + paddle.seed(2022) + self.x = paddle.randn([2, 4]) + self.x.stop_gradient = False + net = PrimeNet() + core.set_prim_enabled(use_prim) + net = apply_to_static(net, use_cinn) + out = net(self.x) + res = paddle.autograd.grad(out, [self.x]) + + return res - def tearDown(self): + def test_cinn(self): paddle.disable_static() + dy_res = self.train(use_prim=False, use_cinn=False) + comp_st_cinn_res = self.train(use_prim=True, use_cinn=False) + + for i in range(len(dy_res)): + np.testing.assert_allclose( + comp_st_cinn_res[i].numpy(), + dy_res[i].numpy(), + rtol=1e-7, + atol=1e-7, + ) + paddle.enable_static() def test_sqrt_grad_comp(self): def actual(primal, cotangent): diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_sub_grad.py b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_sub_grad.py index d7cab193a9910e2ad2999efb3743514229445400..f98a6af621f96f32e99f6d5f46afd5c297e6a528 100644 --- a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_sub_grad.py +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_sub_grad.py @@ -21,6 +21,23 @@ import paddle from paddle.fluid import core +def apply_to_static(net, use_cinn): + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = use_cinn + return paddle.jit.to_static(net, build_strategy=build_strategy) + + +class PrimeNet(paddle.nn.Layer): + def __init__(self): + super(PrimeNet, self).__init__() + self.fc = paddle.nn.Linear(4, 4) + + def forward(self, x, y): + tmp = self.fc(x) + out = paddle.subtract(tmp, y) + return out + + @param.parameterized_class( ('primal0', 'primal1', 'dtype'), [ @@ -39,14 +56,15 @@ from paddle.fluid import core np.random.rand(2, 3, 1, 4), np.float32, ), + (np.random.rand(2, 3, 3, 4), np.random.rand(2, 3, 1, 4), np.float32), ( - np.random.rand(2, 3, 3, 4), + np.random.rand(2, 1, 3, 4), np.random.rand(2, 3, 1, 4), np.float32, ), ( np.random.rand(2, 3, 3, 4), - np.random.rand(2, 3, 1, 1), + np.random.rand(2, 1, 1, 4), np.float32, ), ], @@ -57,11 +75,33 @@ class TestDivGradComp(unittest.TestCase): cls.primal0 = cls.primal0.astype(cls.dtype) cls.primal1 = cls.primal1.astype(cls.dtype) - def setUp(self): - paddle.enable_static() + def train(self, use_prim, use_cinn): + paddle.seed(2022) + self.x = paddle.randn([2, 4]) + self.y = paddle.randn([2, 4]) + self.x.stop_gradient = False + self.y.stop_gradient = False + net = PrimeNet() + core.set_prim_enabled(use_prim) + net = apply_to_static(net, use_cinn) + out = net(self.x, self.y) + res = paddle.autograd.grad(out, [self.x, self.y]) + + return res - def tearDown(self): + def test_cinn(self): paddle.disable_static() + dy_res = self.train(use_prim=False, use_cinn=False) + comp_st_cinn_res = self.train(use_prim=True, use_cinn=False) + + for i in range(len(dy_res)): + np.testing.assert_allclose( + comp_st_cinn_res[i].numpy(), + dy_res[i].numpy(), + rtol=1e-7, + atol=1e-7, + ) + paddle.enable_static() def test_tanh_grad_comp(self): def actual(primal0, primal1): diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_tanh_grad.py b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_tanh_grad.py index 
445b371b0a5a71711cc95a4f060a65fecf8dde11..c7c9109eeaab0403b87e74a0d9edea8ebe995e21 100644 --- a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_tanh_grad.py +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_tanh_grad.py @@ -26,6 +26,23 @@ import parameterized as param import paddle +def apply_to_static(net, use_cinn): + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = use_cinn + return paddle.jit.to_static(net, build_strategy=build_strategy) + + +class PrimeNet(paddle.nn.Layer): + def __init__(self): + super(PrimeNet, self).__init__() + self.fc = paddle.nn.Linear(4, 4) + + def forward(self, x): + tmp = self.fc(x) + out = paddle.tanh(tmp) + return out + + @param.parameterized_class( ('primal', 'cotangent', 'dtype'), [ @@ -38,11 +55,31 @@ class TestTanhGradComp(unittest.TestCase): cls.primal = cls.primal.astype(cls.dtype) cls.cotangent = cls.cotangent.astype(cls.dtype) - def setUp(self): - paddle.enable_static() + def train(self, use_prim, use_cinn): + paddle.seed(2022) + self.x = paddle.randn([2, 4]) + self.x.stop_gradient = False + net = PrimeNet() + core.set_prim_enabled(use_prim) + net = apply_to_static(net, use_cinn) + out = net(self.x) + res = paddle.autograd.grad(out, [self.x]) + + return res - def tearDown(self): + def test_cinn(self): paddle.disable_static() + dy_res = self.train(use_prim=False, use_cinn=False) + comp_st_cinn_res = self.train(use_prim=True, use_cinn=False) + + for i in range(len(dy_res)): + np.testing.assert_allclose( + comp_st_cinn_res[i].numpy(), + dy_res[i].numpy(), + rtol=1e-7, + atol=1e-7, + ) + paddle.enable_static() def test_tanh_grad_comp(self): def actual(primal, cotangent): diff --git a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py index 5026ae9fc96d478e78d5596a22a507c07dde18b1..48349cfe910b35f35e1e19399c39bd40a0b63726 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py @@ -96,11 +96,9 @@ class TestTanhTripleGradCheck(unittest.TestCase): gradient_checker.triple_grad_check( [x], y, x_init=x_arr, place=place, eps=eps ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) gradient_checker.triple_grad_check_for_dygraph( self.tanh_wrapper, [x], y, x_init=x_arr, place=place ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_grad(self): paddle.enable_static() @@ -128,11 +126,9 @@ class TestTanhDoubleGradCheck(unittest.TestCase): gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) gradient_checker.double_grad_check_for_dygraph( self.tanh_wrapper, [x], y, x_init=x_arr, place=place ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_grad(self): paddle.enable_static() @@ -160,11 +156,9 @@ class TestAbsDoubleGradCheck(unittest.TestCase): gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) gradient_checker.double_grad_check_for_dygraph( self.abs_wrapper, [x], y, x_init=x_arr, place=place ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_grad(self): paddle.enable_static() @@ -256,11 +250,9 @@ class TestELUDoubleGradCheck(unittest.TestCase): gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps ) - 
fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) gradient_checker.double_grad_check_for_dygraph( self.elu_wrapper, [x], y, x_init=x_arr, place=place ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_grad(self): paddle.enable_static() @@ -292,11 +284,9 @@ class TestCELUDoubleGradCheck(unittest.TestCase): gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) gradient_checker.double_grad_check_for_dygraph( self.celu_wrapper, [x], y, x_init=x_arr, place=place ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_grad(self): paddle.enable_static() @@ -390,11 +380,9 @@ class TestSquareDoubleGradCheck(unittest.TestCase): gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) gradient_checker.double_grad_check_for_dygraph( self.square_wrapper, [x], y, x_init=x_arr, place=place ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_grad(self): paddle.enable_static() @@ -424,11 +412,9 @@ class TestLogDoubleGradCheck(unittest.TestCase): gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) gradient_checker.double_grad_check_for_dygraph( self.log_wrapper, [x], y, x_init=x_arr, place=place ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_grad(self): paddle.enable_static() @@ -456,11 +442,9 @@ class TestSinDoubleGradCheck(unittest.TestCase): gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) gradient_checker.double_grad_check_for_dygraph( self.sin_wrapper, [x], y, x_init=x_arr, place=place ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_grad(self): paddle.enable_static() @@ -488,11 +472,9 @@ class TestCosDoubleGradCheck(unittest.TestCase): gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) gradient_checker.double_grad_check_for_dygraph( self.cos_wrapper, [x], y, x_init=x_arr, place=place ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_grad(self): paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_assign_op.py b/python/paddle/fluid/tests/unittests/test_assign_op.py index 078811b4969995f7db2fb112f10ab63a2404b881..02f649b39bfab231fe0691c39b217e017ebf3a9a 100644 --- a/python/paddle/fluid/tests/unittests/test_assign_op.py +++ b/python/paddle/fluid/tests/unittests/test_assign_op.py @@ -37,16 +37,12 @@ class TestAssignOp(op_test.OpTest): def test_forward(self): paddle.enable_static() - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) self.check_output(check_eager=True) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) paddle.disable_static() def test_backward(self): paddle.enable_static() - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) self.check_grad(['X'], 'Out', check_eager=True) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) paddle.disable_static() @@ -60,23 +56,18 @@ class TestAssignFP16Op(op_test.OpTest): def test_forward(self): paddle.enable_static() - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) self.check_output(check_eager=True) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) paddle.disable_static() def 
test_backward(self): paddle.enable_static() - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) self.check_grad(['X'], 'Out', check_eager=True) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) paddle.disable_static() class TestAssignOpWithLoDTensorArray(unittest.TestCase): def test_assign_LoDTensorArray(self): paddle.enable_static() - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) main_program = Program() startup_program = Program() with program_guard(main_program): @@ -92,7 +83,6 @@ class TestAssignOpWithLoDTensorArray(unittest.TestCase): sums = paddle.tensor.array_read(array=init_array, i=i) mean = paddle.mean(sums) append_backward(mean) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) place = ( fluid.CUDAPlace(0) @@ -207,12 +197,13 @@ class TestAssignOApi(unittest.TestCase): np.testing.assert_allclose(result3.numpy(), np.array([1]), rtol=1e-05) def test_clone(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) self.python_api = paddle.clone x = paddle.ones([2]) x.stop_gradient = False + x.retain_grads() clone_x = paddle.clone(x) + clone_x.retain_grads() y = clone_x**3 y.backward() @@ -220,7 +211,6 @@ class TestAssignOApi(unittest.TestCase): np.testing.assert_array_equal(x, [1, 1]) np.testing.assert_array_equal(clone_x.grad.numpy(), [3, 3]) np.testing.assert_array_equal(x.grad.numpy(), [3, 3]) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) paddle.enable_static() with program_guard(Program(), Program()): @@ -241,7 +231,6 @@ class TestAssignOApi(unittest.TestCase): class TestAssignOpErrorApi(unittest.TestCase): def test_errors(self): paddle.enable_static() - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) with program_guard(Program(), Program()): # The type of input must be Variable or numpy.ndarray. x1 = fluid.create_lod_tensor( @@ -251,7 +240,6 @@ class TestAssignOpErrorApi(unittest.TestCase): # When the type of input is numpy.ndarray, the dtype of input must be float32, int32. 
x2 = np.array([[2.5, 2.5]], dtype='uint8') self.assertRaises(TypeError, paddle.assign, x2) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) paddle.disable_static() def test_type_error(self): @@ -281,7 +269,6 @@ class TestAssignDoubleGradCheck(unittest.TestCase): gradient_checker.double_grad_check( [data], out, x_init=[data_arr], place=place, eps=eps ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) gradient_checker.double_grad_check_for_dygraph( self.assign_wrapper, [data], out, x_init=[data_arr], place=place ) @@ -313,7 +300,6 @@ class TestAssignTripleGradCheck(unittest.TestCase): gradient_checker.triple_grad_check( [data], out, x_init=[data_arr], place=place, eps=eps ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) gradient_checker.triple_grad_check_for_dygraph( self.assign_wrapper, [data], out, x_init=[data_arr], place=place ) diff --git a/python/paddle/fluid/tests/unittests/test_case.py b/python/paddle/fluid/tests/unittests/test_case.py index 9123b4b009d1866d7ce9385777e4ddc5ffc0d5e0..675b51cf0a0535e0cc34f900477e3e2aaf8ee76a 100644 --- a/python/paddle/fluid/tests/unittests/test_case.py +++ b/python/paddle/fluid/tests/unittests/test_case.py @@ -22,6 +22,7 @@ import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.layers as layers import paddle.fluid.optimizer as optimizer +from paddle.fluid.backward import append_backward from paddle.fluid.framework import Program, program_guard paddle.enable_static() @@ -145,10 +146,101 @@ class TestAPICase(unittest.TestCase): ) np.testing.assert_allclose(res[0], 1, rtol=1e-05) + self.assertEqual(res[0].shape, ()) np.testing.assert_allclose(res[1], 2, rtol=1e-05) + self.assertEqual(res[1].shape, ()) np.testing.assert_allclose(res[2], 3, rtol=1e-05) + self.assertEqual(res[2].shape, ()) np.testing.assert_allclose(res[3], 2, rtol=1e-05) + self.assertEqual(res[3].shape, ()) np.testing.assert_allclose(res[4], 2, rtol=1e-05) + self.assertEqual(res[4].shape, ()) + + def test_0d_tensor_backward(self): + main_program = Program() + startup_program = Program() + with program_guard(main_program, startup_program): + x = paddle.full(shape=[], dtype='float32', fill_value=-2.0) + x.stop_gradient = False + pred = paddle.full(shape=[], dtype='bool', fill_value=0) + # pred is False, so out = -x + out = paddle.static.nn.case( + pred_fn_pairs=[(pred, lambda: x)], default=lambda: -x + ) + append_backward(out) + + place = ( + fluid.CUDAPlace(0) + if core.is_compiled_with_cuda() + else fluid.CPUPlace() + ) + exe = fluid.Executor(place) + + res = exe.run(main_program, fetch_list=[out.name, x.grad_name]) + np.testing.assert_allclose( + np.asarray(res[0]), np.array(2.0), rtol=1e-05 + ) + self.assertEqual(res[0].shape, ()) + np.testing.assert_allclose( + np.asarray(res[1]), np.array(-1.0), rtol=1e-05 + ) + self.assertEqual(res[1].shape, ()) + + def test_0d_tensor_dygraph(self): + paddle.disable_static() + + def fn_1(): + return paddle.full(shape=[], dtype='int32', fill_value=1) + + def fn_2(): + return paddle.full(shape=[], dtype='int32', fill_value=2) + + def fn_3(): + return paddle.full(shape=[], dtype='int32', fill_value=3) + + x = paddle.full(shape=[], dtype='float32', fill_value=0.3) + y = paddle.full(shape=[], dtype='float32', fill_value=0.1) + z = paddle.full(shape=[], dtype='float32', fill_value=0.2) + pred_2 = paddle.less_than(x, y) # false: 0.3 < 0.1 + pred_1 = paddle.less_than(z, x) # true: 0.2 < 0.3 + + # call fn_1 + out_0 = paddle.static.nn.control_flow.case( + pred_fn_pairs=[(pred_1, fn_1), (pred_1, 
fn_2)], default=fn_3 + ) + + # call fn_2 + out_1 = paddle.static.nn.control_flow.case( + pred_fn_pairs=[(pred_2, fn_1), (pred_1, fn_2)], default=fn_3 + ) + + # call default fn_3 + out_2 = paddle.static.nn.control_flow.case( + pred_fn_pairs=((pred_2, fn_1), (pred_2, fn_2)), default=fn_3 + ) + + # no default, call fn_2 + out_3 = paddle.static.nn.control_flow.case( + pred_fn_pairs=[(pred_1, fn_2)] + ) + + # no default; pred_2 is False, so the last fn (fn_2) serves as the fallback + out_4 = paddle.static.nn.control_flow.case( + pred_fn_pairs=[(pred_2, fn_2)] + ) + + np.testing.assert_allclose(out_0, 1, rtol=1e-05) + self.assertEqual(out_0.shape, []) + np.testing.assert_allclose(out_1, 2, rtol=1e-05) + self.assertEqual(out_1.shape, []) + np.testing.assert_allclose(out_2, 3, rtol=1e-05) + self.assertEqual(out_2.shape, []) + np.testing.assert_allclose(out_3, 2, rtol=1e-05) + self.assertEqual(out_3.shape, []) + np.testing.assert_allclose(out_4, 2, rtol=1e-05) + self.assertEqual(out_4.shape, []) + + paddle.enable_static() def test_return_var_tuple(self): def fn_1(): @@ -394,8 +486,11 @@ class TestAPICase_Nested(unittest.TestCase): res = exe.run(main_program, fetch_list=[out_1, out_2, out_3]) np.testing.assert_allclose(res[0], 1, rtol=1e-05) + self.assertEqual(res[0].shape, ()) np.testing.assert_allclose(res[1], 2, rtol=1e-05) + self.assertEqual(res[1].shape, ()) np.testing.assert_allclose(res[2], 3, rtol=1e-05) + self.assertEqual(res[2].shape, ()) class TestAPICase_Error(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_cond.py b/python/paddle/fluid/tests/unittests/test_cond.py index 9769aa8df430e70e46ae84fc3a12ea7221c1abe7..1b1c08e51e2cd16c7d85a052f21101ff1f5efd99 100644 --- a/python/paddle/fluid/tests/unittests/test_cond.py +++ b/python/paddle/fluid/tests/unittests/test_cond.py @@ -103,6 +103,7 @@ class TestCondInputOutput(unittest.TestCase): exe = fluid.Executor(place) (ret,) = exe.run(main_program, fetch_list=[out.name]) np.testing.assert_allclose(np.asarray(ret), np.array(2), rtol=1e-05) + self.assertEqual(ret.shape, ()) def test_0d_tensor_as_cond(self): """ @@ -129,7 +130,7 @@ class TestCondInputOutput(unittest.TestCase): y = paddle.full(shape=[], dtype='float32', fill_value=0.23) pred = paddle.greater_equal(y, x) out = paddle.static.nn.cond(pred, true_func, false_func) - # out is one tensor + # out is a tensor place = ( fluid.CUDAPlace(0) if core.is_compiled_with_cuda() else fluid.CPUPlace() ) @@ -168,14 +169,41 @@ class TestCondInputOutput(unittest.TestCase): if core.is_compiled_with_cuda() else fluid.CPUPlace() ) + exe = fluid.Executor(place) ret = exe.run(main_program, fetch_list=[out.name, a.grad_name]) np.testing.assert_allclose( np.asarray(ret[0]), np.array(2.0), rtol=1e-05 ) + self.assertEqual(ret[0].shape, ()) np.testing.assert_allclose( np.asarray(ret[1]), np.array(-1.0), rtol=1e-05 ) + self.assertEqual(ret[1].shape, ()) + + def test_0d_tensor_dygraph(self): + """ + pseudocode: + + a = -2.0 + if a >= 0: + return a + else: + return -a + """ + paddle.disable_static() + a = paddle.full(shape=[], dtype='float32', fill_value=-2.0) + a.stop_gradient = False + out = paddle.static.nn.cond(a >= 0, lambda: a, lambda: -a) + out.backward() + + np.testing.assert_allclose(np.asarray(out), np.array(2.0), rtol=1e-05) + self.assertEqual(out.shape, []) + + np.testing.assert_allclose( + np.asarray(a.grad), np.array(-1.0), rtol=1e-05 + ) + self.assertEqual(a.grad.shape, []) def test_return_var_tuple(self): """ @@ -527,9 +555,11 @@ class TestCondNestedControlFlow(unittest.TestCase): np.testing.assert_allclose( np.asarray(ret[0]), np.array(7.0), rtol=1e-05 ) +
self.assertEqual(ret[0].shape, ()) np.testing.assert_allclose( np.asarray(ret[1]), np.array(2.0), rtol=1e-05 ) + self.assertEqual(ret[1].shape, ()) def test_cond_op_in_condition(self): paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_cuda_graph.py b/python/paddle/fluid/tests/unittests/test_cuda_graph.py index d8ba91bad7b8a530fe3137b00b670f5e0878cf04..edfa7665882ca6e8180a5302114b2b5972585f9b 100644 --- a/python/paddle/fluid/tests/unittests/test_cuda_graph.py +++ b/python/paddle/fluid/tests/unittests/test_cuda_graph.py @@ -18,18 +18,16 @@ import shutil import unittest import numpy as np -from simple_nets import simple_fc_net_with_inputs import paddle from paddle.device.cuda.graphs import CUDAGraph -from paddle.fluid.dygraph.base import switch_to_static_graph def can_use_cuda_graph(): return paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm() -class TestCUDAGraph(unittest.TestCase): +class TestCUDAGraphInDygraphMode(unittest.TestCase): def setUp(self): if can_use_cuda_graph(): paddle.set_flags( @@ -46,94 +44,6 @@ class TestCUDAGraph(unittest.TestCase): np.random.randint(low=0, high=10, size=shape).astype("float32") ) - @switch_to_static_graph - def test_cuda_graph_static_graph(self): - if not can_use_cuda_graph(): - return - - seed = 100 - loss_cuda_graph = self.cuda_graph_static_graph_main( - seed, use_cuda_graph=True - ) - loss_no_cuda_graph = self.cuda_graph_static_graph_main( - seed, use_cuda_graph=False - ) - self.assertEqual(loss_cuda_graph, loss_no_cuda_graph) - - def cuda_graph_static_graph_main(self, seed, use_cuda_graph): - batch_size = 1 - class_num = 10 - image_shape = [batch_size, 784] - label_shape = [batch_size, 1] - - paddle.seed(seed) - np.random.seed(seed) - startup = paddle.static.Program() - main = paddle.static.Program() - with paddle.static.program_guard(main, startup): - image = paddle.static.data( - name="image", shape=image_shape, dtype='float32' - ) - label = paddle.static.data( - name="label", shape=label_shape, dtype='int64' - ) - image.persistable = True - label.persistable = True - loss = simple_fc_net_with_inputs(image, label, class_num) - loss.persistable = True - lr = paddle.optimizer.lr.PiecewiseDecay( - boundaries=[2, 3, 4], values=[0.01, 0.02, 0.03, 0.04] - ) - optimizer = paddle.optimizer.SGD(learning_rate=lr) - optimizer.minimize(loss) - place = paddle.CUDAPlace(0) - exe = paddle.static.Executor(place) - scope = paddle.static.Scope() - with paddle.static.scope_guard(scope): - exe.run(startup) - build_strategy = paddle.static.BuildStrategy() - build_strategy.allow_cuda_graph_capture = True - build_strategy.fix_op_run_order = True - build_strategy.fuse_all_optimizer_ops = True - compiled_program = paddle.static.CompiledProgram( - main - ).with_data_parallel( - loss_name=loss.name, build_strategy=build_strategy, places=place - ) - image_t = scope.var(image.name).get_tensor() - label_t = scope.var(label.name).get_tensor() - loss_t = scope.var(loss.name).get_tensor() - lr_var = main.global_block().var(lr._var_name) - self.assertTrue(lr_var.persistable) - lr_t = scope.var(lr_var.name).get_tensor() - cuda_graph = None - for batch_id in range(20): - image_t.set( - np.random.rand(*image_shape).astype('float32'), place - ) - label_t.set( - np.random.randint( - low=0, high=class_num, size=label_shape, dtype='int64' - ), - place, - ) - - if batch_id == 1 and use_cuda_graph: - cuda_graph = CUDAGraph(place, mode="global") - cuda_graph.capture_begin() - exe.run(compiled_program) - cuda_graph.capture_end() - - if cuda_graph: 
- lr_t.set(np.array([lr()], dtype='float32'), place) - cuda_graph.replay() - else: - exe.run(compiled_program) - lr.step() - if cuda_graph: - cuda_graph.reset() - return np.array(loss_t) - def test_cuda_graph_dynamic_graph(self): if not can_use_cuda_graph(): return diff --git a/python/paddle/fluid/tests/unittests/test_cuda_graph_static_mode.py b/python/paddle/fluid/tests/unittests/test_cuda_graph_static_mode.py new file mode 100644 index 0000000000000000000000000000000000000000..e159334c87a6492e50d51d057c6c5ab8513a9e96 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_cuda_graph_static_mode.py @@ -0,0 +1,139 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +from simple_nets import simple_fc_net_with_inputs + +import paddle +from paddle.device.cuda.graphs import CUDAGraph +from paddle.fluid.dygraph.base import switch_to_static_graph + + +def can_use_cuda_graph(): + return paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm() + + +class TestCUDAGraphInStaticMode(unittest.TestCase): + def setUp(self): + if can_use_cuda_graph(): + # The behavior of `FLAGS_use_stream_safe_cuda_allocator` in static + # mode is inconsistent with that in dygraph mode. + # In static mode, FLAGS_use_stream_safe_cuda_allocator must be True. + # In dygraph mode, FLAGS_use_stream_safe_cuda_allocator must be False. + # These two types of unittests need to be written separately, because + # the allocator may only be initialized once, and the flag + # `FLAGS_use_stream_safe_cuda_allocator` only takes effect during + # initialization. 
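The comment above captures the key constraint, and it generalizes: allocator-related flags are read once, when the allocator is first constructed. A standalone sketch of the ordering this implies, assuming a CUDA build (illustrative only, not part of the patch):

```python
import paddle

# Set allocator flags before anything allocates device memory; once the
# first allocation constructs the allocator, later flag changes are ignored.
paddle.set_flags({'FLAGS_use_stream_safe_cuda_allocator': True})
print(paddle.get_flags(['FLAGS_use_stream_safe_cuda_allocator']))
# {'FLAGS_use_stream_safe_cuda_allocator': True}

x = paddle.ones([2, 2])  # the first allocation pins the allocator behavior
```

This is also why the dygraph and static CUDA-graph tests cannot share a process: each side needs the flag in a different state at initialization time.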
+ paddle.set_flags( + { + 'FLAGS_allocator_strategy': 'auto_growth', + 'FLAGS_sync_nccl_allreduce': False, + 'FLAGS_cudnn_deterministic': True, + 'FLAGS_use_stream_safe_cuda_allocator': True, + } + ) + + @switch_to_static_graph + def test_cuda_graph_static_graph(self): + if not can_use_cuda_graph(): + return + + seed = 100 + loss_cuda_graph = self.cuda_graph_static_graph_main( + seed, use_cuda_graph=True + ) + loss_no_cuda_graph = self.cuda_graph_static_graph_main( + seed, use_cuda_graph=False + ) + self.assertEqual(loss_cuda_graph, loss_no_cuda_graph) + + def cuda_graph_static_graph_main(self, seed, use_cuda_graph): + batch_size = 1 + class_num = 10 + image_shape = [batch_size, 784] + label_shape = [batch_size, 1] + + paddle.seed(seed) + np.random.seed(seed) + startup = paddle.static.Program() + main = paddle.static.Program() + with paddle.static.program_guard(main, startup): + image = paddle.static.data( + name="image", shape=image_shape, dtype='float32' + ) + label = paddle.static.data( + name="label", shape=label_shape, dtype='int64' + ) + image.persistable = True + label.persistable = True + loss = simple_fc_net_with_inputs(image, label, class_num) + loss.persistable = True + lr = paddle.optimizer.lr.PiecewiseDecay( + boundaries=[2, 3, 4], values=[0.01, 0.02, 0.03, 0.04] + ) + optimizer = paddle.optimizer.SGD(learning_rate=lr) + optimizer.minimize(loss) + place = paddle.CUDAPlace(0) + exe = paddle.static.Executor(place) + scope = paddle.static.Scope() + with paddle.static.scope_guard(scope): + exe.run(startup) + build_strategy = paddle.static.BuildStrategy() + build_strategy.allow_cuda_graph_capture = True + build_strategy.fix_op_run_order = True + build_strategy.fuse_all_optimizer_ops = True + compiled_program = paddle.static.CompiledProgram( + main + ).with_data_parallel( + loss_name=loss.name, build_strategy=build_strategy, places=place + ) + image_t = scope.var(image.name).get_tensor() + label_t = scope.var(label.name).get_tensor() + loss_t = scope.var(loss.name).get_tensor() + lr_var = main.global_block().var(lr._var_name) + self.assertTrue(lr_var.persistable) + lr_t = scope.var(lr_var.name).get_tensor() + cuda_graph = None + for batch_id in range(20): + image_t.set( + np.random.rand(*image_shape).astype('float32'), place + ) + label_t.set( + np.random.randint( + low=0, high=class_num, size=label_shape, dtype='int64' + ), + place, + ) + + if batch_id == 1 and use_cuda_graph: + cuda_graph = CUDAGraph(place, mode="global") + cuda_graph.capture_begin() + exe.run(compiled_program) + cuda_graph.capture_end() + + if cuda_graph: + lr_t.set(np.array([lr()], dtype='float32'), place) + cuda_graph.replay() + else: + exe.run(compiled_program) + lr.step() + if cuda_graph: + cuda_graph.reset() + return np.array(loss_t) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_cumsum_op.py b/python/paddle/fluid/tests/unittests/test_cumsum_op.py index 4b0cae035b06dd2c5552f84bfe136a311c1a3055..3e21537cdecf01840980172dd1b502b168fb12b9 100644 --- a/python/paddle/fluid/tests/unittests/test_cumsum_op.py +++ b/python/paddle/fluid/tests/unittests/test_cumsum_op.py @@ -16,15 +16,12 @@ import os import tempfile import unittest -import gradient_checker import numpy as np -from decorator_helper import prog_scope from op_test import OpTest import paddle import paddle.fluid as fluid import paddle.fluid.core as core -import paddle.fluid.layers as layers import paddle.inference as paddle_infer @@ -230,7 +227,7 @@ class TestSumOpExclusive1(OpTest): def setUp(self): 
self.op_type = "cumsum" self.attrs = {'axis': 2, "exclusive": True} - a = np.random.random((4, 5, 65)).astype("float64") + a = np.random.random((4, 5, 20)).astype("float64") self.inputs = {'X': a} self.outputs = { 'Out': np.concatenate( @@ -245,12 +242,15 @@ class TestSumOpExclusive1(OpTest): def test_check_output(self): self.check_output() + def test_check_grad(self): + self.check_grad(['X'], 'Out') + class TestSumOpExclusive2(OpTest): def setUp(self): self.op_type = "cumsum" self.attrs = {'axis': 2, "exclusive": True} - a = np.random.random((1, 1, 888)).astype("float64") + a = np.random.random((1, 1, 100)).astype("float64") self.inputs = {'X': a} self.outputs = { 'Out': np.concatenate( @@ -265,12 +265,15 @@ class TestSumOpExclusive2(OpTest): def test_check_output(self): self.check_output() + def test_check_grad(self): + self.check_grad(['X'], 'Out') + class TestSumOpExclusive3(OpTest): def setUp(self): self.op_type = "cumsum" self.attrs = {'axis': 2, "exclusive": True} - a = np.random.random((4, 5, 888)).astype("float32") + a = np.random.random((4, 5, 20)).astype("float64") self.inputs = {'X': a} self.outputs = { 'Out': np.concatenate( @@ -285,12 +288,15 @@ class TestSumOpExclusive3(OpTest): def test_check_output(self): self.check_output() + def test_check_grad(self): + self.check_grad(['X'], 'Out') + class TestSumOpExclusive4(OpTest): def setUp(self): self.op_type = "cumsum" self.attrs = {'axis': 2, "exclusive": True} - a = np.random.random((1, 1, 3049)).astype("float64") + a = np.random.random((1, 1, 100)).astype("float64") self.inputs = {'X': a} self.outputs = { 'Out': np.concatenate( @@ -305,12 +311,15 @@ class TestSumOpExclusive4(OpTest): def test_check_output(self): self.check_output() + def test_check_grad(self): + self.check_grad(['X'], 'Out') + class TestSumOpExclusive5(OpTest): def setUp(self): self.op_type = "cumsum" self.attrs = {'axis': 2, "exclusive": True} - a = np.random.random((4, 5, 3096)).astype("float64") + a = np.random.random((4, 5, 40)).astype("float64") self.inputs = {'X': a} self.outputs = { 'Out': np.concatenate( @@ -325,12 +334,15 @@ class TestSumOpExclusive5(OpTest): def test_check_output(self): self.check_output() + def test_check_grad(self): + self.check_grad(['X'], 'Out') + class TestSumOpExclusiveFP16(OpTest): def setUp(self): self.op_type = "cumsum" self.attrs = {'axis': 2, "exclusive": True, "dtype": "float16"} - a = np.random.random((4, 5, 3096)).astype("float64") + a = np.random.random((4, 5, 20)).astype("float64") self.inputs = {'X': a} self.outputs = { 'Out': np.concatenate( @@ -345,6 +357,9 @@ class TestSumOpExclusiveFP16(OpTest): def test_check_output(self): self.check_output() + def test_check_grad(self): + self.check_grad(['X'], 'Out') + class TestSumOpReverseExclusive(OpTest): def setUp(self): @@ -366,6 +381,9 @@ class TestSumOpReverseExclusive(OpTest): def test_check_output(self): self.check_output() + def test_check_grad(self): + self.check_grad(['X'], 'Out') + class BadInputTest(unittest.TestCase): def test_error(self): @@ -407,7 +425,6 @@ class TestTensorAxis(unittest.TestCase): with paddle.static.program_guard(main_prog, starup_prog): # run static x = paddle.static.data(shape=np_x.shape, name='x', dtype=np_x.dtype) - print(x) linear = paddle.nn.Linear(np_x.shape[-1], np_x.shape[-1]) linear_out = linear(x) relu_out = paddle.nn.functional.relu(linear_out) @@ -444,67 +461,5 @@ class TestTensorAxis(unittest.TestCase): np.testing.assert_allclose(static_out[0], infer_out) -class TestCumsumDoubleGradCheck(unittest.TestCase): - def 
cumsum_wrapper(self, x): - return paddle.cumsum(x[0], 0) - - @prog_scope() - def func(self, place): - # the shape of input variable should be clearly specified, not inlcude -1. - eps = 0.005 - dtype = np.float64 - - data = layers.data('data', [3, 4], False, dtype) - data.persistable = True - out = paddle.cumsum(data, 0) - data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) - - gradient_checker.double_grad_check( - [data], out, x_init=[data_arr], place=place, eps=eps - ) - gradient_checker.double_grad_check_for_dygraph( - self.cumsum_wrapper, [data], out, x_init=[data_arr], place=place - ) - - def test_grad(self): - paddle.enable_static() - places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(fluid.CUDAPlace(0)) - for p in places: - self.func(p) - - -class TestCumsumTripleGradCheck(unittest.TestCase): - def cumsum_wrapper(self, x): - return paddle.cumsum(x[0], 0) - - @prog_scope() - def func(self, place): - # the shape of input variable should be clearly specified, not inlcude -1. - eps = 0.005 - dtype = np.float32 - - data = layers.data('data', [2, 3], False, dtype) - data.persistable = True - out = paddle.cumsum(data, 0) - data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) - - gradient_checker.triple_grad_check( - [data], out, x_init=[data_arr], place=place, eps=eps - ) - gradient_checker.triple_grad_check_for_dygraph( - self.cumsum_wrapper, [data], out, x_init=[data_arr], place=place - ) - - def test_grad(self): - paddle.enable_static() - places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(fluid.CUDAPlace(0)) - for p in places: - self.func(p) - - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py index f1a790d8e89c11e9b7ff0400196e5a9a160f7e0e..5d0b06d02f80cf5513bd6a3f47bf2b9148450145 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py @@ -436,7 +436,7 @@ class TestFleetBase(unittest.TestCase): ) if tr0_ret != 0 or tr1_ret != 0: - if is_listen_failed(ps0_err) or is_listen_failed(ps1_err): + if is_listen_failed(ps0_err_log) or is_listen_failed(ps1_err_log): print("find parameter server port bind failed, skip the error") tr0_ret, tr1_ret = 0, 0 else: diff --git a/python/paddle/fluid/tests/unittests/test_histogram_op.py b/python/paddle/fluid/tests/unittests/test_histogram_op.py index 71283aceaa4030cc79751677dba9fd98dc680d3a..dc52df4226a29733def185d9afa080a115191aac 100644 --- a/python/paddle/fluid/tests/unittests/test_histogram_op.py +++ b/python/paddle/fluid/tests/unittests/test_histogram_op.py @@ -153,6 +153,14 @@ class TestHistogramOp(OpTest): self.check_output(check_eager=True) +class TestHistogramOp_ZeroDim(TestHistogramOp): + def init_test_case(self): + self.in_shape = [] + self.bins = 5 + self.min = 1 + self.max = 5 + + if __name__ == "__main__": paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py index 3f1283864dfbe047fabe9fb5cfd08b23bdb64f7a..63322b3f6d868f1f89557be4e4b38bc4b838df69 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py @@ -166,6 +166,7 @@ class TestDygraphTripleGrad(TestCase): @dygraph_guard def func_example_with_gradient_and_create_graph(self): x = 
random_var(self.shape) + x.retain_grads() x_np = x.numpy() x.stop_gradient = False @@ -222,10 +223,8 @@ class TestDygraphTripleGrad(TestCase): np.testing.assert_allclose(dddx_grad_actual, dddx_expected, rtol=1e-05) def test_all_cases(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) self.func_exception() self.func_example_with_gradient_and_create_graph() - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) class TestDygraphTripleGradBradcastCase(TestCase): @@ -259,6 +258,7 @@ class TestDygraphTripleGradBradcastCase(TestCase): @dygraph_guard def func_example_with_gradient_and_create_graph(self): x = random_var(self.x_shape) + x.retain_grads() x_np = x.numpy() x.stop_gradient = False @@ -316,9 +316,7 @@ class TestDygraphTripleGradBradcastCase(TestCase): np.testing.assert_allclose(dddx_grad_actual, dddx_expected, rtol=1e-05) def test_all_cases(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) self.func_example_with_gradient_and_create_graph() - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) # d_ddout is none, dtype is float32 diff --git a/python/paddle/fluid/tests/unittests/test_op_function_generator.py b/python/paddle/fluid/tests/unittests/test_op_function_generator.py index eff73a4548f0eec2c67661a3332611ea25e9ab58..54356f4f8e999d357f10eab994b50da54fe5283b 100644 --- a/python/paddle/fluid/tests/unittests/test_op_function_generator.py +++ b/python/paddle/fluid/tests/unittests/test_op_function_generator.py @@ -72,7 +72,6 @@ class TestVariable(unittest.TestCase): np.testing.assert_array_equal(res1.numpy(), res2.numpy()) def test_trace_backward(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) with fluid.dygraph.guard(): a = np.random.uniform(0.1, 1, self.shape).astype(self.dtype) b = np.random.uniform(0.1, 1, self.shape).astype(self.dtype) @@ -80,8 +79,11 @@ class TestVariable(unittest.TestCase): y = fluid.dygraph.to_variable(b) x.stop_gradient = False y.stop_gradient = False + x.retain_grads() + y.retain_grads() loss = _legacy_C_ops.elementwise_mul(x, y) + loss.retain_grads() loss.backward() x_grad = x.gradient() @@ -89,7 +91,6 @@ class TestVariable(unittest.TestCase): np.testing.assert_array_equal(x_grad, loss.gradient() * b) np.testing.assert_array_equal(y_grad, loss.gradient() * a) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py index 887ce9ff3f7411bb01115e8d648b53be0ec7de31..0fb23bf73d58e11e65c99c57bf9dec05a9b5838c 100755 --- a/python/paddle/fluid/tests/unittests/test_reshape_op.py +++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py @@ -505,17 +505,18 @@ class TestReshapeZeroTensor(unittest.TestCase): class TestReshapeAPI_ZeroDim(unittest.TestCase): def test_dygraph(self): paddle.disable_static() - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) x = paddle.rand([]) x.stop_gradient = False out = paddle.reshape(x, [1]) + out.retain_grads() out.backward() self.assertEqual(x.grad.shape, []) self.assertEqual(out.shape, [1]) self.assertEqual(out.grad.shape, [1]) out = paddle.reshape(x, [-1, 1]) + out.retain_grads() out.backward() self.assertEqual(x.grad.shape, []) self.assertEqual(out.shape, [1, 1]) @@ -524,6 +525,7 @@ class TestReshapeAPI_ZeroDim(unittest.TestCase): x = paddle.rand([1]) x.stop_gradient = False out = paddle.reshape(x, []) + out.retain_grads() out.backward() self.assertEqual(x.grad.shape, [1]) 
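The `retain_grads()` calls threaded through these tests replace the old process-wide `FLAGS_retain_grad_for_all_tensor` switch with per-tensor opt-in: non-leaf tensors drop their `.grad` after `backward()` unless explicitly asked to keep it. A standalone dygraph sketch of the pattern (illustrative only, not part of the patch):

```python
import paddle

x = paddle.rand([2, 3])
x.stop_gradient = False   # leaf tensor that requires grad
y = x * 2                 # intermediate (non-leaf) tensor
y.retain_grads()          # opt in to keeping y.grad after backward()
z = y.sum()
z.backward()

assert x.grad is not None  # leaf gradients are always retained
assert y.grad is not None  # retained only because of retain_grads()
```

Without the `retain_grads()` call, `y.grad` would be `None` after `backward()`, which is why these tests attach it to every intermediate whose gradient they assert on.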
self.assertEqual(out.shape, []) diff --git a/python/paddle/fluid/tests/unittests/test_run_program_op.py b/python/paddle/fluid/tests/unittests/test_run_program_op.py index 7538fffb808ef17caf10096cb1dde41ed2fd1ef8..35686f843dec6f84ba4e698318404dfb40a99e94 100644 --- a/python/paddle/fluid/tests/unittests/test_run_program_op.py +++ b/python/paddle/fluid/tests/unittests/test_run_program_op.py @@ -26,7 +26,7 @@ from paddle.fluid.executor import ( _is_dy2st_enable_standalone_executor, _is_enable_standalone_executor, ) -from paddle.fluid.framework import _in_eager_mode_ +from paddle.fluid.framework import global_var from paddle.fluid.layers.utils import _hash_with_id paddle.enable_static() @@ -177,7 +177,7 @@ class RunProgramOpTest(unittest.TestCase): def prepare_dygraph_input(self, place, return_param_list=False): def create_var_base(is_input, name, np_value, stop_gradient): - if _in_eager_mode_: + if global_var._in_eager_mode_: var = core.eager.Tensor( value=np_value, name=name, place=place, zero_copy=True ) @@ -218,7 +218,7 @@ class RunProgramOpTest(unittest.TestCase): for name in self.output_names['Out']: outputs['Out'].append(create_var_base(False, name)) - if _in_eager_mode_: + if global_var._in_eager_mode_: outputs['OutScope'] = [core.Scope()] else: outputs['OutScope'] = framework._varbase_creator( diff --git a/python/paddle/fluid/tests/unittests/test_slice_op.py b/python/paddle/fluid/tests/unittests/test_slice_op.py index af76b09047bce619ae6bfadd7dcdccf5ee807f6a..12838b218b43eadb6c2d8e45cfde4c7094a33a2d 100644 --- a/python/paddle/fluid/tests/unittests/test_slice_op.py +++ b/python/paddle/fluid/tests/unittests/test_slice_op.py @@ -895,7 +895,6 @@ class TestSliceDoubleGradCheck(unittest.TestCase): gradient_checker.double_grad_check( [data], out, x_init=[data_arr], place=place, eps=eps ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) gradient_checker.double_grad_check_for_dygraph( self.slice_wrapper, [data], out, x_init=[data_arr], place=place ) @@ -931,7 +930,6 @@ class TestSliceTripleGradCheck(unittest.TestCase): gradient_checker.triple_grad_check( [data], out, x_init=[data_arr], place=place, eps=eps ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) gradient_checker.triple_grad_check_for_dygraph( self.slice_wrapper, [data], out, x_init=[data_arr], place=place ) diff --git a/python/paddle/fluid/tests/unittests/test_sparse_elementwise_op.py b/python/paddle/fluid/tests/unittests/test_sparse_elementwise_op.py index e2a98b170e91ca013a1756b82460ec05bd53f826..96ea87dd1b9e19559655e5ef0970f5b2fe38153a 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_elementwise_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_elementwise_op.py @@ -43,7 +43,6 @@ class TestSparseElementWiseAPI(unittest.TestCase): """ def setUp(self): - paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) np.random.seed(2022) self.op_list = op_list self.csr_shape = [128, 256] @@ -109,7 +108,9 @@ class TestSparseElementWiseAPI(unittest.TestCase): y, dtype=dtype, stop_gradient=False ) coo_x = s_dense_x.to_sparse_coo(sparse_dim) + coo_x.retain_grads() coo_y = s_dense_y.to_sparse_coo(sparse_dim) + coo_y.retain_grads() actual_res = get_actual_res(coo_x, coo_y, op) actual_res.backward(actual_res) @@ -157,9 +158,12 @@ class TestSparseElementWiseAPI(unittest.TestCase): sp_a = sparse.sparse_coo_tensor( indices_data, values1_data, shape, stop_gradient=False ) + sp_a.retain_grads() + sp_b = sparse.sparse_coo_tensor( indices_data, values2_data, shape, stop_gradient=False ) + 
sp_b.retain_grads() values1 = paddle.to_tensor(values1_data, stop_gradient=False) values2 = paddle.to_tensor(values2_data, stop_gradient=False) @@ -185,6 +189,7 @@ class TestSparseElementWiseAPI(unittest.TestCase): sp_a = sparse.sparse_coo_tensor( indices_data, values_data, shape, stop_gradient=False ) + sp_a.retain_grads() bias_values = [1.0, 2.0] diff --git a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py index 5cd625770ba7ec0061864ab82a72b7e2f88379d0..5d6d83d6586e74037cc327d4694fed2029f9f0b8 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py @@ -17,7 +17,6 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid import paddle.fluid.core as core devices = ['cpu', 'gpu'] @@ -148,7 +147,6 @@ class TestSparseConvert(unittest.TestCase): assert np.array_equal(dense_x.grad.numpy(), out_grad.to_dense().numpy()) def test_coo_to_dense(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]] values = [1.0, 2.0, 3.0, 4.0, 5.0] indices_dtypes = ['int32', 'int64'] @@ -159,6 +157,7 @@ class TestSparseConvert(unittest.TestCase): shape=[3, 4], stop_gradient=False, ) + sparse_x.retain_grads() dense_tensor = sparse_x.to_dense() # test to_dense_grad backward out_grad = [ @@ -180,12 +179,12 @@ class TestSparseConvert(unittest.TestCase): shape=[3, 4], stop_gradient=False, ) + sparse_x_cpu.retain_grads() dense_tensor_cpu = sparse_x_cpu.to_dense() dense_tensor_cpu.backward(paddle.to_tensor(out_grad)) assert np.array_equal( correct_x_grad, sparse_x_cpu.grad.values().numpy() ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_to_sparse_csr(self): x = [[0, 1, 0, 2], [0, 0, 3, 0], [4, 5, 0, 0]] @@ -202,7 +201,6 @@ class TestSparseConvert(unittest.TestCase): assert np.array_equal(dense_tensor.numpy(), x) def test_coo_values_grad(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]] values = [1.0, 2.0, 3.0, 4.0, 5.0] sparse_x = paddle.sparse.sparse_coo_tensor( @@ -211,6 +209,7 @@ class TestSparseConvert(unittest.TestCase): shape=[3, 4], stop_gradient=False, ) + sparse_x.retain_grads() values_tensor = sparse_x.values() out_grad = [2.0, 3.0, 5.0, 8.0, 9.0] # test coo_values_grad @@ -230,6 +229,7 @@ class TestSparseConvert(unittest.TestCase): shape=[3, 4, 2], stop_gradient=False, ) + sparse_x.retain_grads() values_tensor = sparse_x.values() out_grad = [ [2.0, 2.0], @@ -241,7 +241,6 @@ class TestSparseConvert(unittest.TestCase): # test coo_values_grad values_tensor.backward(paddle.to_tensor(out_grad)) assert np.array_equal(out_grad, sparse_x.grad.values().numpy()) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_sparse_coo_tensor_grad(self): for device in devices: diff --git a/python/paddle/fluid/tests/unittests/test_switch_case.py b/python/paddle/fluid/tests/unittests/test_switch_case.py index 2ddbd0f7ff051e246d3e6e3bb54cc2bddab25e4a..3fad3bdfd0c0db108fe20764d28d59766a91755c 100644 --- a/python/paddle/fluid/tests/unittests/test_switch_case.py +++ b/python/paddle/fluid/tests/unittests/test_switch_case.py @@ -21,6 +21,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.layers as layers +from paddle.fluid.backward import append_backward from paddle.fluid.framework import Program, program_guard paddle.enable_static() @@ -93,25 
+94,25 @@ class TestAPISwitchCase(unittest.TestCase): res[1], 2, rtol=1e-05, - err_msg='result is {} but answer is {}'.format(res[0], 2), + err_msg='result is {} but answer is {}'.format(res[1], 2), ) np.testing.assert_allclose( res[2], 3, rtol=1e-05, - err_msg='result is {} but answer is {}'.format(res[0], 3), + err_msg='result is {} but answer is {}'.format(res[2], 3), ) np.testing.assert_allclose( res[3], 2, rtol=1e-05, - err_msg='result is {} but answer is {}'.format(res[0], 2), + err_msg='result is {} but answer is {}'.format(res[3], 2), ) np.testing.assert_allclose( res[4], 2, rtol=1e-05, - err_msg='result is {} but answer is {}'.format(res[0], 2), + err_msg='result is {} but answer is {}'.format(res[4], 2), ) def test_0d_tensor(self): @@ -176,30 +177,148 @@ class TestAPISwitchCase(unittest.TestCase): rtol=1e-05, err_msg='result is {} but answer is {}'.format(res[0], 1), ) + self.assertEqual(res[0].shape, ()) np.testing.assert_allclose( res[1], 2, rtol=1e-05, - err_msg='result is {} but answer is {}'.format(res[0], 2), + err_msg='result is {} but answer is {}'.format(res[1], 2), ) + self.assertEqual(res[1].shape, ()) np.testing.assert_allclose( res[2], 3, rtol=1e-05, - err_msg='result is {} but answer is {}'.format(res[0], 3), + err_msg='result is {} but answer is {}'.format(res[2], 3), ) + self.assertEqual(res[2].shape, ()) np.testing.assert_allclose( res[3], 2, rtol=1e-05, - err_msg='result is {} but answer is {}'.format(res[0], 2), + err_msg='result is {} but answer is {}'.format(res[3], 2), ) + self.assertEqual(res[3].shape, ()) np.testing.assert_allclose( res[4], 2, rtol=1e-05, - err_msg='result is {} but answer is {}'.format(res[0], 2), + err_msg='result is {} but answer is {}'.format(res[4], 2), ) + self.assertEqual(res[4].shape, ()) + + def test_0d_tensor_backward(self): + main_program = Program() + startup_program = Program() + with program_guard(main_program, startup_program): + x = paddle.full(shape=[], dtype='float32', fill_value=-2.0) + x.stop_gradient = False + pred = paddle.full(shape=[], dtype='int32', fill_value=2) + # pred is 2, so out = 2 * x + out = paddle.static.nn.switch_case( + branch_index=pred, + branch_fns=[(1, lambda: x), (2, lambda: 2 * x)], + default=lambda: -x, + ) + append_backward(out) + + place = ( + fluid.CUDAPlace(0) + if core.is_compiled_with_cuda() + else fluid.CPUPlace() + ) + exe = fluid.Executor(place) + + res = exe.run(main_program, fetch_list=[out.name, x.grad_name]) + np.testing.assert_allclose( + np.asarray(res[0]), np.array(-4.0), rtol=1e-05 + ) + self.assertEqual(res[0].shape, ()) + np.testing.assert_allclose( + np.asarray(res[1]), np.array(2.0), rtol=1e-05 + ) + self.assertEqual(res[1].shape, ()) + + def test_0d_tensor_dygraph(self): + paddle.disable_static() + + def fn_1(): + return paddle.full(shape=[], dtype='int32', fill_value=1) + + def fn_2(): + return paddle.full(shape=[], dtype='int32', fill_value=2) + + def fn_3(): + return paddle.full(shape=[], dtype='int32', fill_value=3) + + index_1 = paddle.full(shape=[], dtype='int32', fill_value=1) + index_2 = paddle.full(shape=[], dtype='int32', fill_value=2) + index_5 = paddle.full(shape=[], dtype='int32', fill_value=5) + + # call fn_1 + out_0 = paddle.static.nn.switch_case( + branch_index=index_1, branch_fns={1: fn_1, 2: fn_2, 3: fn_3} + ) + + # call fn_2 : branch_fns={0: fn_1, 1:fn_2, 2:fn_3} + out_1 = paddle.static.nn.switch_case( + branch_index=index_1, branch_fns=(fn_1, fn_2, fn_3) + ) + + # call default fn_3 + out_2 = paddle.static.nn.switch_case( + branch_index=index_5, + 
branch_fns=((1, fn_1), (2, fn_2)), + default=fn_3, + ) + + # no default, call fn_2 + out_3 = paddle.static.nn.switch_case( + branch_index=index_2, branch_fns=[(1, fn_1), (2, fn_2)] + ) + + # no default, call fn_2 but branch_index is 5 + out_4 = paddle.static.nn.switch_case( + branch_index=index_5, + branch_fns=[(1, fn_1), (3, fn_2), (2, fn_3)], + ) + np.testing.assert_allclose( + out_0, + 1, + rtol=1e-05, + err_msg='result is {} but answer is {}'.format(out_0, 1), + ) + self.assertEqual(out_0.shape, []) + np.testing.assert_allclose( + out_1, + 2, + rtol=1e-05, + err_msg='result is {} but answer is {}'.format(out_1, 2), + ) + self.assertEqual(out_1.shape, []) + np.testing.assert_allclose( + out_2, + 3, + rtol=1e-05, + err_msg='result is {} but answer is {}'.format(out_2, 3), + ) + self.assertEqual(out_2.shape, []) + np.testing.assert_allclose( + out_3, + 2, + rtol=1e-05, + err_msg='result is {} but answer is {}'.format(out_3, 2), + ) + self.assertEqual(out_3.shape, []) + np.testing.assert_allclose( + out_4, + 2, + rtol=1e-05, + err_msg='result is {} but answer is {}'.format(out_4, 2), + ) + self.assertEqual(out_4.shape, []) + + paddle.enable_static() def test_return_var_tuple(self): def fn_1(): @@ -426,18 +545,21 @@ class TestAPISwitchCase_Nested(unittest.TestCase): rtol=1e-05, err_msg='result is {} but answer is {}'.format(res[0], 1), ) + self.assertEqual(res[0].shape, ()) np.testing.assert_allclose( res[1], 2, rtol=1e-05, err_msg='result is {} but answer is {}'.format(res[1], 2), ) + self.assertEqual(res[1].shape, ()) np.testing.assert_allclose( res[2], 3, rtol=1e-05, err_msg='result is {} but answer is {}'.format(res[2], 3), ) + self.assertEqual(res[2].shape, ()) # test TypeError and ValueError of api switch_case diff --git a/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_.py b/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_.py index a83f5b8e5aa0b15731d338643db310537022305d..e4e12f4387d3264af38dd433e7f43c387a871fd8 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_.py +++ b/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_.py @@ -22,7 +22,6 @@ import paddle.fluid as fluid class TensorFillDiagonal_Test(unittest.TestCase): def test_dim2_normal(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) expected_np = np.array([[1, 2, 2], [2, 1, 2], [2, 2, 1]]).astype( 'float32' ) @@ -44,6 +43,7 @@ class TensorFillDiagonal_Test(unittest.TestCase): x = paddle.ones((3, 3), dtype=dtype) x.stop_gradient = False y = x * 2 + y.retain_grads() y.fill_diagonal_(1, offset=0, wrap=True) loss = y.sum() loss.backward() @@ -55,10 +55,8 @@ class TensorFillDiagonal_Test(unittest.TestCase): (y.grad.numpy().astype('float32') == expected_grad).all(), True, ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_offset(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) expected_np = np.array([[2, 2, 1], [2, 2, 2], [2, 2, 2]]).astype( 'float32' ) @@ -80,6 +78,7 @@ class TensorFillDiagonal_Test(unittest.TestCase): x = paddle.ones((3, 3), dtype=dtype) x.stop_gradient = False y = x * 2 + y.retain_grads() y.fill_diagonal_(1, offset=2, wrap=True) loss = y.sum() loss.backward() @@ -91,7 +90,6 @@ class TensorFillDiagonal_Test(unittest.TestCase): (y.grad.numpy().astype('float32') == expected_grad).all(), True, ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_bool(self): expected_np = np.array( @@ -116,7 +114,6 @@ class TensorFillDiagonal_Test(unittest.TestCase): 
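# In the fill_diagonal_ tests, y = x * 2 is a non-leaf tensor, so each case
# now calls y.retain_grads() before backward. A compact sketch of the pattern
# being exercised, mirroring test_dim2_normal (illustrative only):
import paddle

x = paddle.ones((3, 3))
x.stop_gradient = False
y = x * 2
y.retain_grads()                             # y.grad is inspected afterwards
y.fill_diagonal_(1.0, offset=0, wrap=True)   # in-place diagonal overwrite
y.sum().backward()
# the diagonal was overwritten with a constant, so no gradient flows through
# it: the retained y.grad is zero on the diagonal and one elsewhere, which is
# what the expected_grad arrays in these tests encode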
self.assertEqual((x.numpy() == expected_np).all(), True) def test_dim2_unnormal_wrap(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) expected_np = np.array( [ [1, 2, 2], @@ -154,6 +151,7 @@ class TensorFillDiagonal_Test(unittest.TestCase): x = paddle.ones((7, 3), dtype=dtype) x.stop_gradient = False y = x * 2 + y.retain_grads() y.fill_diagonal_(1, offset=0, wrap=True) loss = y.sum() loss.backward() @@ -165,10 +163,8 @@ class TensorFillDiagonal_Test(unittest.TestCase): (y.grad.numpy().astype('float32') == expected_grad).all(), True, ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_dim2_unnormal_unwrap(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) expected_np = np.array( [ [1, 2, 2], @@ -206,6 +202,7 @@ class TensorFillDiagonal_Test(unittest.TestCase): x = paddle.ones((7, 3), dtype=dtype) x.stop_gradient = False y = x * 2 + y.retain_grads() y.fill_diagonal_(1, offset=0, wrap=False) loss = y.sum() loss.backward() @@ -217,10 +214,8 @@ class TensorFillDiagonal_Test(unittest.TestCase): (y.grad.numpy().astype('float32') == expected_grad).all(), True, ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_dim_larger2_normal(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) expected_np = np.array( [ [[1, 2, 2], [2, 2, 2], [2, 2, 2]], @@ -250,6 +245,7 @@ class TensorFillDiagonal_Test(unittest.TestCase): x = paddle.ones((3, 3, 3), dtype=dtype) x.stop_gradient = False y = x * 2 + y.retain_grads() y.fill_diagonal_(1, offset=0, wrap=True) loss = y.sum() loss.backward() @@ -261,7 +257,6 @@ class TensorFillDiagonal_Test(unittest.TestCase): (y.grad.numpy().astype('float32') == expected_grad).all(), True, ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_tile_op.py b/python/paddle/fluid/tests/unittests/test_tile_op.py index 5e2756a8d24248ae6b78b0e52934ed6e718d0c43..419c142b6df5ff54909d62e649471741b48666bd 100644 --- a/python/paddle/fluid/tests/unittests/test_tile_op.py +++ b/python/paddle/fluid/tests/unittests/test_tile_op.py @@ -286,7 +286,6 @@ class TestTileDoubleGradCheck(unittest.TestCase): gradient_checker.double_grad_check( [data], out, x_init=[data_arr], place=place, eps=eps ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) gradient_checker.double_grad_check_for_dygraph( self.tile_wrapper, [data], out, x_init=[data_arr], place=place ) @@ -318,7 +317,6 @@ class TestTileTripleGradCheck(unittest.TestCase): gradient_checker.triple_grad_check( [data], out, x_init=[data_arr], place=place, eps=eps ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) gradient_checker.triple_grad_check_for_dygraph( self.tile_wrapper, [data], out, x_init=[data_arr], place=place ) @@ -335,24 +333,26 @@ class TestTileTripleGradCheck(unittest.TestCase): class TestTileAPI_ZeroDim(unittest.TestCase): def test_dygraph(self): paddle.disable_static() - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) x = paddle.rand([]) x.stop_gradient = False out = paddle.tile(x, []) + out.retain_grads() out.backward() self.assertEqual(out.shape, []) self.assertEqual(x.grad.shape, []) self.assertEqual(out.grad.shape, []) out = paddle.tile(x, [3]) + out.retain_grads() out.backward() self.assertEqual(out.shape, [3]) self.assertEqual(x.grad.shape, []) self.assertEqual(out.grad.shape, [3]) out = paddle.tile(x, [2, 3]) + out.retain_grads() out.backward() self.assertEqual(out.shape, [2, 3]) self.assertEqual(x.grad.shape, 
[]) diff --git a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py index c99e7e0712929eb7d14624406423ac39df62cbf9..e18c0bec99be0e08ffca8457f85611df12b2c00a 100644 --- a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py @@ -21,8 +21,6 @@ import paddle import paddle.fluid as fluid import paddle.nn.functional as F -fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) - unary_api_list = [ paddle.nn.functional.elu, paddle.nn.functional.gelu, @@ -102,7 +100,9 @@ class TestUnaryAPI(unittest.TestCase): for api in unary_api_list: x = paddle.rand([]) x.stop_gradient = False + x.retain_grads() out = api(x) + out.retain_grads() out.backward() self.assertEqual(x.shape, []) @@ -202,7 +202,9 @@ class TestReduceAPI(unittest.TestCase): else: x = paddle.rand([]) x.stop_gradient = False + x.retain_grads() out = api(x, None) + out.retain_grads() out.backward() self.assertEqual(x.shape, []) @@ -291,12 +293,16 @@ class TestBinaryAPI(unittest.TestCase): y = paddle.rand([]) x.stop_gradient = False y.stop_gradient = False + x.retain_grads() + y.retain_grads() if isinstance(api, dict): out = api['func'](x, y) out_cls = getattr(paddle.Tensor, api['cls_method'])(x, y) np.testing.assert_array_equal(out_cls.numpy(), out.numpy()) else: out = api(x, y) + + out.retain_grads() out.backward() self.assertEqual(x.shape, []) @@ -312,12 +318,16 @@ class TestBinaryAPI(unittest.TestCase): y = paddle.rand([]) x.stop_gradient = False y.stop_gradient = False + x.retain_grads() + y.retain_grads() if isinstance(api, dict): out = api['func'](x, y) out_cls = getattr(paddle.Tensor, api['cls_method'])(x, y) np.testing.assert_array_equal(out_cls.numpy(), out.numpy()) else: out = api(x, y) + + out.retain_grads() out.backward() self.assertEqual(x.shape, [2, 3, 4]) @@ -331,6 +341,8 @@ class TestBinaryAPI(unittest.TestCase): # 3) x is 0D , y is ND x = paddle.rand([]) y = paddle.rand([2, 3, 4]) + x.retain_grads() + y.retain_grads() x.stop_gradient = False y.stop_gradient = False if isinstance(api, dict): @@ -339,6 +351,8 @@ class TestBinaryAPI(unittest.TestCase): np.testing.assert_array_equal(out_cls.numpy(), out.numpy()) else: out = api(x, y) + + out.retain_grads() out.backward() self.assertEqual(x.shape, []) @@ -352,9 +366,11 @@ class TestBinaryAPI(unittest.TestCase): # 4) x is 0D , y is scalar x = paddle.rand([]) x.stop_gradient = False + x.retain_grads() y = 0.5 if isinstance(api, dict): out = getattr(paddle.Tensor, api['cls_method'])(x, y) + out.retain_grads() out.backward() self.assertEqual(x.shape, []) @@ -528,7 +544,9 @@ class TestSundryAPI(unittest.TestCase): def test_flip(self): x = paddle.rand([]) x.stop_gradient = False + x.retain_grads() out = paddle.flip(x, axis=[]) + out.retain_grads() out.backward() self.assertEqual(x.shape, []) self.assertEqual(out.shape, []) @@ -618,7 +636,9 @@ class TestSundryAPI(unittest.TestCase): def test_pow_factor(self): x = paddle.rand([]) x.stop_gradient = False + x.retain_grads() out = paddle.pow(x, 2.0) + out.retain_grads() out.backward() self.assertEqual(out.shape, []) @@ -628,7 +648,9 @@ class TestSundryAPI(unittest.TestCase): def test_cast(self): x = paddle.full([], 1.0, 'float32') x.stop_gradient = False + x.retain_grads() out = paddle.cast(x, 'int32') + out.retain_grads() out.backward() self.assertEqual(out.shape, []) @@ -638,7 +660,9 @@ class TestSundryAPI(unittest.TestCase): def test_cumprod(self): x = paddle.full([], 1.0, 'float32') 
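# test_zero_dim_tensor.py previously flipped the retain-grad flag once at
# import time; each case now opts in per tensor. The 0-D shape contract the
# suite checks, in a standalone sketch (illustrative only):
import paddle

x = paddle.rand([])          # 0-D tensor: shape is []
x.stop_gradient = False
x.retain_grads()
out = paddle.reshape(x, [1]) # reshaping 0-D to [1] adds one axis
out.retain_grads()
out.backward()
# gradients keep each tensor's own shape: x.grad is 0-D, out.grad is [1]
assert x.grad.shape == [] and out.grad.shape == [1]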
x.stop_gradient = False + x.retain_grads() out = paddle.cumprod(x, 0) + out.retain_grads() out.backward() self.assertEqual(out.shape, []) @@ -651,7 +675,9 @@ class TestSundryAPI(unittest.TestCase): def test_clip(self): x = paddle.uniform([], None, -10, 10) x.stop_gradient = False + x.retain_grads() out = paddle.clip(x, -5, 5) + out.retain_grads() out.backward() self.assertEqual(out.shape, []) @@ -661,7 +687,9 @@ class TestSundryAPI(unittest.TestCase): def test_increment(self): x = paddle.rand([]) x.stop_gradient = False + x.retain_grads() out = paddle.increment(x, 1.0) + out.retain_grads() out.backward() self.assertEqual(out.shape, []) @@ -694,8 +722,10 @@ class TestSundryAPI(unittest.TestCase): def test_gather_1D(self): x = paddle.to_tensor([1.0, 3.0, 5.0, 7.0, 9.0], stop_gradient=False) + x.retain_grads() index = paddle.full([], 2, 'int64') out = paddle.gather(x, index) + out.retain_grads() out.backward() self.assertEqual(out.shape, []) @@ -707,8 +737,10 @@ class TestSundryAPI(unittest.TestCase): x = paddle.to_tensor( [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], stop_gradient=False ) + x.retain_grads() index = paddle.full([], 1, 'int64') out = paddle.gather(x, index) + out.retain_grads() out.backward() self.assertEqual(out.shape, [3]) @@ -720,8 +752,10 @@ class TestSundryAPI(unittest.TestCase): x = paddle.to_tensor( [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], stop_gradient=False ) + x.retain_grads() index = paddle.full([], 1, 'int64') out = paddle.gather(x, index, axis=1) + out.retain_grads() out.backward() self.assertEqual(out.shape, [2]) @@ -731,9 +765,11 @@ class TestSundryAPI(unittest.TestCase): def test_scatter_1D(self): x = paddle.to_tensor([1.0, 3.0, 5.0, 7.0, 9.0], stop_gradient=False) + x.retain_grads() index = paddle.full([], 2, 'int64') updates = paddle.full([], 4.0) out = paddle.scatter(x, index, updates) + out.retain_grads() out.backward() self.assertEqual(out.shape, [5]) @@ -747,6 +783,7 @@ class TestSundryAPI(unittest.TestCase): index = paddle.full([], 1, 'int64') updates = paddle.to_tensor([1.0, 2.0, 3.0]) out = paddle.scatter(x, index, updates) + out.retain_grads() out.backward() self.assertEqual(out.shape, [2, 3]) @@ -762,10 +799,18 @@ class TestSundryAPI(unittest.TestCase): x2.stop_gradient = False x3.stop_gradient = False + x1.retain_grads() + x2.retain_grads() + x3.retain_grads() + out1 = paddle.diagflat(x1, 1) out2 = paddle.diagflat(x2, -1) out3 = paddle.diagflat(x3, 0) + out1.retain_grads() + out2.retain_grads() + out3.retain_grads() + out1.backward() out2.backward() out3.backward() @@ -800,8 +845,11 @@ class TestSundryAPI(unittest.TestCase): def test_scatter_nd(self): index = paddle.to_tensor([3], dtype="int64") updates = paddle.full([], 2, dtype='float32') + updates.retain_grads() updates.stop_gradient = False + out = paddle.scatter_nd(index, updates, [5]) + out.retain_grads() out.backward() self.assertEqual(out.shape, [5]) @@ -818,6 +866,7 @@ class TestSundryAPI(unittest.TestCase): x = paddle.randn(()) x.stop_gradient = False + x.retain_grads() out = paddle.kthvalue(x, 1) out[0].backward() @@ -838,6 +887,7 @@ class TestSundryAPI(unittest.TestCase): paddle.set_device(place) x = paddle.randn(()) + x.retain_grads() x.stop_gradient = False out = paddle.mode(x) @@ -854,21 +904,30 @@ class TestSundryAPI(unittest.TestCase): def test_flatten(self): x = paddle.rand([]) x.stop_gradient = False + x.retain_grads() start_axis = 0 stop_axis = -1 out = paddle.flatten(x, start_axis=start_axis, stop_axis=stop_axis) + out.retain_grads() out.backward() self.assertEqual(out.shape, [1]) 
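# gather with a 0-D index drops the indexed axis entirely, which is what the
# updated expectations encode. A sketch of the 1-D case (illustrative):
import paddle

x = paddle.to_tensor([1.0, 3.0, 5.0, 7.0, 9.0], stop_gradient=False)
x.retain_grads()
index = paddle.full([], 2, 'int64')   # 0-D index
out = paddle.gather(x, index)         # result is 0-D, not shape [1]
out.retain_grads()
out.backward()
assert out.shape == [] and x.grad.shape == [5]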
self.assertEqual(out.grad.shape, [1]) self.assertEqual(x.grad.shape, []) + def test_histogram(self): + x = paddle.rand([]) + out = paddle.histogram(x, bins=5, min=1, max=5) + self.assertEqual(out.shape, [5]) + def test_scale(self): x = paddle.rand([]) x.stop_gradient = False + x.retain_grads() out = paddle.scale(x, scale=2.0, bias=1.0) + out.retain_grads() out.backward() self.assertEqual(out.shape, []) @@ -900,6 +959,34 @@ class TestSundryAPI(unittest.TestCase): np.testing.assert_array_equal(out3_1.numpy(), out3_2.numpy()) np.testing.assert_array_equal(out3_2.numpy(), np.asarray(1)) + def test_cumsum(self): + x1 = paddle.rand([]) + x1.stop_gradient = False + + out1 = paddle.cumsum(x1) + out2 = paddle.cumsum(x1, axis=0) + out3 = paddle.cumsum(x1, axis=-1) + + out1.retain_grads() + out2.retain_grads() + out3.retain_grads() + + out1.backward() + out2.backward() + out3.backward() + + self.assertEqual(x1.grad.shape, []) + self.assertTrue(x1.grad.numpy() == 3) + self.assertEqual(out1.shape, [1]) + self.assertEqual(out1.grad.shape, [1]) + self.assertTrue(out1.grad.numpy() == 1) + self.assertEqual(out2.shape, []) + self.assertEqual(out2.grad.shape, []) + self.assertTrue(out2.grad.numpy() == 1) + self.assertEqual(out3.shape, []) + self.assertEqual(out3.grad.shape, []) + self.assertTrue(out3.grad.numpy() == 1) + def test_add_n(self): x1 = paddle.rand([]) x1.stop_gradient = False @@ -911,6 +998,9 @@ class TestSundryAPI(unittest.TestCase): out1 = paddle.add_n(x1) out2 = paddle.add_n([x2, x3]) + out1.retain_grads() + out2.retain_grads() + out1.backward() out2.backward() @@ -928,26 +1018,31 @@ class TestSundryAPI(unittest.TestCase): def test_reshape_list(self): x = paddle.rand([]) x.stop_gradient = False + x.retain_grads() out = paddle.reshape(x, []) + out.retain_grads() out.backward() self.assertEqual(x.grad.shape, []) self.assertEqual(out.shape, []) self.assertEqual(out.grad.shape, []) out = paddle.reshape(x, [1]) + out.retain_grads() out.backward() self.assertEqual(x.grad.shape, []) self.assertEqual(out.shape, [1]) self.assertEqual(out.grad.shape, [1]) out = paddle.reshape(x, [-1]) + out.retain_grads() out.backward() self.assertEqual(x.grad.shape, []) self.assertEqual(out.shape, [1]) self.assertEqual(out.grad.shape, [1]) out = paddle.reshape(x, [-1, 1]) + out.retain_grads() out.backward() self.assertEqual(x.grad.shape, []) self.assertEqual(out.shape, [1, 1]) @@ -955,9 +1050,11 @@ class TestSundryAPI(unittest.TestCase): def test_reshape_tensor(self): x = paddle.rand([1, 1]) + x.retain_grads() x.stop_gradient = False out = paddle.reshape(x, []) + out.retain_grads() out.backward() self.assertEqual(x.grad.shape, [1, 1]) self.assertEqual(out.shape, []) @@ -965,6 +1062,7 @@ class TestSundryAPI(unittest.TestCase): new_shape = paddle.to_tensor([1, 1, 1], "int32") out = paddle.reshape(x, new_shape) + out.retain_grads() out.backward() self.assertEqual(x.grad.shape, [1, 1]) self.assertEqual(out.shape, [1, 1, 1]) @@ -972,6 +1070,7 @@ class TestSundryAPI(unittest.TestCase): new_shape = paddle.to_tensor([-1], "int32") out = paddle.reshape(x, new_shape) + out.retain_grads() out.backward() self.assertEqual(x.grad.shape, [1, 1]) self.assertEqual(out.shape, [1]) @@ -979,6 +1078,7 @@ class TestSundryAPI(unittest.TestCase): new_shape = [paddle.full([], -1, "int32"), paddle.full([], 1, "int32")] out = paddle.reshape(x, new_shape) + out.retain_grads() out.backward() self.assertEqual(x.grad.shape, [1, 1]) self.assertEqual(out.shape, [1, 1]) @@ -1019,6 +1119,7 @@ class TestSundryAPI(unittest.TestCase): x = paddle.rand([]) 
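# test_cumsum pins down an asymmetry worth noting: with no axis the input is
# flattened first, so a 0-D input yields a 1-D result of length one, while an
# explicit axis preserves the 0-D shape. A sketch (illustrative):
import paddle

x = paddle.rand([])
x.stop_gradient = False
out1 = paddle.cumsum(x)           # flattened: shape becomes [1]
out2 = paddle.cumsum(x, axis=0)   # explicit axis: shape stays []
out1.retain_grads()
out2.retain_grads()
out1.backward()
out2.backward()
assert out1.shape == [1] and out2.shape == []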
x.stop_gradient = False out = paddle.reverse(x, axis=[]) + out.retain_grads() out.backward() self.assertEqual(x.shape, []) self.assertEqual(out.shape, []) @@ -1029,9 +1130,14 @@ class TestSundryAPI(unittest.TestCase): x2 = paddle.rand([]) x1.stop_gradient = False x2.stop_gradient = False + x1.retain_grads() + x2.retain_grads() out1 = paddle.sort(x1, axis=-1) out2 = paddle.sort(x2, axis=0) + out1.retain_grads() + out2.retain_grads() + out1.backward() out2.backward() @@ -1051,9 +1157,15 @@ class TestSundryAPI(unittest.TestCase): x2 = paddle.rand([]) x1.stop_gradient = False x2.stop_gradient = False + x1.retain_grads() + x2.retain_grads() + out1 = paddle.argsort(x1, axis=-1) out2 = paddle.argsort(x2, axis=0) + out1.retain_grads() + out2.retain_grads() + out1.backward() out2.backward() @@ -1075,6 +1187,7 @@ class TestSundryAPI(unittest.TestCase): w0 = paddle.rand([]) x0.stop_gradient = False y0.stop_gradient = False + y0.retain_grads() out0 = paddle.lerp(x0, y0, w0) out0.backward() @@ -1089,6 +1202,8 @@ class TestSundryAPI(unittest.TestCase): w1 = paddle.rand([]) x1.stop_gradient = False y1.stop_gradient = False + x1.retain_grads() + y1.retain_grads() out1 = paddle.lerp(x1, y1, w1) out1.backward() @@ -1103,6 +1218,8 @@ class TestSundryAPI(unittest.TestCase): w2 = paddle.rand([]) x2.stop_gradient = False y2.stop_gradient = False + x2.retain_grads() + y2.retain_grads() out2 = paddle.lerp(x2, y2, w2) out2.backward() @@ -1120,6 +1237,7 @@ class TestSundryAPI(unittest.TestCase): x = paddle.randn(()) x.stop_gradient = False + x.retain_grads() out = paddle.repeat_interleave(x, 2, None) out.backward() @@ -1145,6 +1263,7 @@ class TestSundryAPI(unittest.TestCase): dtype='float32', stop_gradient=False, ) + logit.retain_grads() label = paddle.to_tensor( [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], dtype='float32' ) @@ -1153,6 +1272,7 @@ class TestSundryAPI(unittest.TestCase): out0 = F.sigmoid_focal_loss(logit, label, normalizer=fg_num_0) out1 = F.sigmoid_focal_loss(logit, label, normalizer=fg_num_1) + out0.retain_grads() np.testing.assert_array_equal( out0.numpy(), @@ -1168,12 +1288,22 @@ class TestSundryAPI(unittest.TestCase): y = paddle.full([], 0.6) self.assertFalse(paddle.allclose(x, y)) + def test_equalall(self): + x = paddle.full([], 0.5) + y = paddle.full([], 0.6) + out = paddle.equal_all(x, y) + self.assertEqual(out.shape, []) + self.assertFalse(out) + def test_where(self): x1 = paddle.full([], 1) x2 = paddle.full([], 2) x1.stop_gradient = False x2.stop_gradient = False + x1.retain_grads() + x2.retain_grads() out = paddle.where(x1 > x2, x1, x2) + out.retain_grads() out.backward() self.assertEqual(out.shape, []) self.assertEqual(out.numpy(), 2) @@ -1186,9 +1316,12 @@ class TestSundryAPI(unittest.TestCase): def test_atan2(self): x1 = paddle.full([], 0) x2 = paddle.full([], 2) + x1.retain_grads() + x2.retain_grads() x1.stop_gradient = False x2.stop_gradient = False out = paddle.atan2(x1, x2) + out.retain_grads() out.backward() self.assertEqual(out.shape, []) self.assertEqual(out.numpy(), 0) @@ -1198,6 +1331,31 @@ class TestSundryAPI(unittest.TestCase): self.assertEqual(x1.grad.numpy(), 0.5) self.assertEqual(x2.grad.numpy(), 0) + def test_maseked_select(self): + x = paddle.rand([]) + x.stop_gradient = False + mask = paddle.full([], True, dtype='bool') + y = paddle.masked_select(x, mask) + + y.retain_grads() + y.backward() + self.assertEqual(y.shape, [1]) + self.assertEqual(y.numpy(), x.numpy()) + self.assertEqual(y.grad.shape, [1]) + self.assertEqual(x.grad.shape, []) + self.assertEqual(x.grad.numpy(), 1) + + 
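# masked_select always returns a 1-D tensor, even when both input and mask
# are 0-D, which is why test_maseked_select expects y.shape == [1] while
# x.grad stays 0-D. A sketch (illustrative):
import paddle

x = paddle.rand([])
x.stop_gradient = False
mask = paddle.full([], True, dtype='bool')   # 0-D boolean mask
y = paddle.masked_select(x, mask)            # selection flattens: shape [1]
y.retain_grads()
y.backward()
assert y.shape == [1] and x.grad.shape == []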
def test_t(self): + x = paddle.full([], 2.0) + x.stop_gradient = False + x.retain_grads() + out = paddle.t(x) + out.retain_grads() + out.backward() + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + self.assertEqual(x.grad.shape, []) + class TestSundryAPIStatic(unittest.TestCase): def setUp(self): @@ -1516,6 +1674,16 @@ class TestSundryAPIStatic(unittest.TestCase): self.assertEqual(res[1].shape, ()) self.assertEqual(res[2].shape, (1,)) + @prog_scope() + def test_histogram(self): + x = paddle.full([], 1, 'float32') + out = paddle.histogram(x, bins=5, min=1, max=5) + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, feed={}, fetch_list=[out]) + + self.assertEqual(res[0].shape, (5,)) + @prog_scope() def test_scale(self): x = paddle.rand([]) @@ -1560,6 +1728,45 @@ class TestSundryAPIStatic(unittest.TestCase): np.testing.assert_array_equal(out3_2, np.asarray(1)) @prog_scope() + def test_cumsum(self): + x1 = paddle.rand([]) + x1.stop_gradient = False + + out1 = paddle.cumsum(x1) + out2 = paddle.cumsum(x1, axis=0) + out3 = paddle.cumsum(x1, axis=-1) + + paddle.static.append_backward(out1.sum()) + paddle.static.append_backward(out2.sum()) + paddle.static.append_backward(out3.sum()) + + prog = paddle.static.default_main_program() + res = self.exe.run( + prog, + fetch_list=[ + out1, + out2, + out3, + x1.grad_name, + out1.grad_name, + out2.grad_name, + out3.grad_name, + ], + ) + self.assertEqual(res[0].shape, (1,)) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, ()) + self.assertEqual(res[3].shape, ()) + self.assertEqual(res[3], 1) + self.assertEqual(res[4].shape, (1,)) + self.assertEqual(res[4], 1) + self.assertEqual(res[5].shape, ()) + self.assertEqual(res[5], 1) + self.assertEqual(res[6].shape, ()) + self.assertEqual(res[6], 1) + self.assertEqual(out2.shape, ()) + self.assertEqual(out3.shape, ()) + def test_add_n(self): x1 = paddle.rand([]) x1.stop_gradient = False @@ -1868,6 +2075,37 @@ class TestSundryAPIStatic(unittest.TestCase): self.assertEqual(res[0].shape, ()) + @prog_scope() + def test_maseked_select(self): + x = paddle.rand([]) + x.stop_gradient = False + mask = paddle.full([], True, dtype='bool') + y = paddle.masked_select(x, mask) + paddle.static.append_backward(y.sum()) + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[x, y, y.grad_name, x.grad_name]) + self.assertEqual(res[1].shape, (1,)) + self.assertEqual(res[1], res[0]) + self.assertEqual(res[2].shape, (1,)) + self.assertEqual(res[3].shape, ()) + self.assertEqual(res[3], 1) + + @prog_scope() + def test_t(self): + x = paddle.full([], 2.0) + x.stop_gradient = False + out = paddle.t(x) + paddle.static.append_backward(out.sum()) + prog = paddle.static.default_main_program() + res = self.exe.run( + prog, feed={}, fetch_list=[out, out.grad_name, x.grad_name] + ) + + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, ()) + # Use to test API whose zero-dim input tensors don't have grad and not need to test backward in OpTest. 
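# The static-graph counterparts verify the same contract through the
# executor: fetched 0-D outputs come back as NumPy arrays of shape ().
# A condensed sketch of the new static test_t case, assuming a default
# executor as in the suite's setUp (illustrative only):
import paddle

paddle.enable_static()
x = paddle.full([], 2.0)
x.stop_gradient = False
out = paddle.t(x)                        # transpose of 0-D is still 0-D
paddle.static.append_backward(out.sum())
exe = paddle.static.Executor()
res = exe.run(paddle.static.default_main_program(),
              fetch_list=[out, x.grad_name])
assert res[0].shape == () and res[1].shape == ()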
class TestNoBackwardAPI(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py b/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py index afaf3b2a52fab7551d7b9b0338b4f4cdad66cb4f..95934caf52b34040619baabc287abd201b2a6ccd 100644 --- a/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py +++ b/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py @@ -228,7 +228,8 @@ def get_xpu_op_support_types(op_name, dev_id=0): op_name_type = op_name + "_" + stype if op_name_type in ops: support_types.append(stype) - + if len(support_types) == 0: + print("WARNING: support_types is EMPTY for op", op_name) return support_types diff --git a/python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py b/python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py index 6f112a76204c93506340f511ccddf061b5e5fabe..f7833199180ecc356adcd238b7d600785a0edf20 100644 --- a/python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py +++ b/python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py @@ -53,28 +53,26 @@ class TestProcessGroupFp32(unittest.TestCase): ) sys.stdout.write("rank {}: test new group api ok\n".format(pg.rank())) + # TODO(zhangxiaoci) allreduce unittest raise error # test allreduce sum # rank 0 - x = np.random.random(self.shape).astype(self.dtype) - tensor_x = paddle.to_tensor(x) + # x = np.random.random(self.shape).astype(self.dtype) + # tensor_x = paddle.to_tensor(x) # rank 1 - y = np.random.random(self.shape).astype(self.dtype) - tensor_y = paddle.to_tensor(y) + # y = np.random.random(self.shape).astype(self.dtype) + # tensor_y = paddle.to_tensor(y) - sum_result = tensor_x + tensor_y - if pg.rank() == 0: - task = dist.all_reduce(tensor_x) - assert np.array_equal(tensor_x, sum_result) - else: - task = dist.all_reduce(tensor_y) - assert np.array_equal(tensor_y, sum_result) - - sys.stdout.write( - "rank {}: test allreduce sum api ok\n".format(pg.rank()) - ) + # sum_result = tensor_x + tensor_y + # if pg.rank() == 0: + # task = dist.all_reduce(tensor_x) + # assert np.array_equal(tensor_x, sum_result) + # else: + # task = dist.all_reduce(tensor_y) + # assert np.array_equal(tensor_y, sum_result) - # TODO - # test allreduce max/min/prod + # sys.stdout.write( + # "rank {}: test allreduce sum api ok\n".format(pg.rank()) + # ) # test broadcast # rank 0 @@ -178,6 +176,52 @@ class TestProcessGroupFp32(unittest.TestCase): assert np.array_equal(tensor_y, old_tensor_y) sys.stdout.write("rank {}: test reduce sum api ok\n".format(pg.rank())) + # test send async api + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + + if pg.rank() == 0: + task = dist.send(tensor_x, 1, sync_op=False) + task.wait() + else: + task = dist.recv(tensor_y, 0, sync_op=False) + task.wait() + assert np.array_equal(tensor_y, tensor_x) + + # test send sync api + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + + if pg.rank() == 0: + task = dist.send(tensor_x, 1, sync_op=True) + else: + task = dist.recv(tensor_y, 0, sync_op=True) + assert np.array_equal(tensor_y, tensor_x) + + # test send 0-d tensor + # rank 0 + x = np.random.uniform(-1, 1, []).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.array(0.2022).astype(self.dtype) + tensor_y = 
paddle.to_tensor(y) + + if pg.rank() == 0: + task = dist.send(tensor_x, 1, sync_op=True) + else: + task = dist.recv(tensor_y, 0, sync_op=True) + assert np.array_equal(tensor_y, tensor_x) and tensor_y.shape == [] + + sys.stdout.write("rank {}: test send api ok\n".format(pg.rank())) + class TestProcessGroupFp16(TestProcessGroupFp32): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/xpu/test_logical_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_logical_op_xpu.py index 0491e7ef5f1d5bc2798c0fce1bc29f7e4fef266e..62120c0d1be8d6155ec1c282f90fb1331b8c70ce 100755 --- a/python/paddle/fluid/tests/unittests/xpu/test_logical_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_logical_op_xpu.py @@ -46,6 +46,11 @@ class XPUTestLogicalAnd(XPUOpTestWrapper): def set_case(self): self.op_type = 'logical_and' + # special range for bool dtype + if self.dtype == np.dtype(np.bool): + self.low = 0 + self.high = 2 + x = np.random.randint( self.low, self.high, self.x_shape, dtype=self.dtype ) @@ -62,7 +67,7 @@ class XPUTestLogicalAnd(XPUOpTestWrapper): self.outputs = {'Out': out} def init_case(self): - self.dtype = np.int32 + self.dtype = self.in_type self.x_shape = [2, 3, 4, 5] self.y_shape = [2, 3, 4, 5] self.low = -100 @@ -76,7 +81,7 @@ class XPUTestLogicalAnd(XPUOpTestWrapper): class XPUTestLogicalAndCase1(XPUTestLogicalAndBase): def init_case(self): - self.dtype = np.int32 + self.dtype = self.in_type self.x_shape = [4, 5] self.y_shape = [2, 3, 4, 5] self.low = -100 @@ -102,6 +107,11 @@ class XPUTestLogicalOr(XPUOpTestWrapper): def set_case(self): self.op_type = 'logical_or' + # special range for bool dtype + if self.dtype == np.dtype(np.bool): + self.low = 0 + self.high = 2 + x = np.random.randint( self.low, self.high, self.x_shape, dtype=self.dtype ) @@ -118,7 +128,7 @@ class XPUTestLogicalOr(XPUOpTestWrapper): self.outputs = {'Out': out} def init_case(self): - self.dtype = np.int32 + self.dtype = self.in_type self.x_shape = [2, 3, 4, 5] self.y_shape = [2, 3, 4, 5] self.low = -100 @@ -132,7 +142,7 @@ class XPUTestLogicalOr(XPUOpTestWrapper): class XPUTestLogicalOrCase1(XPUTestLogicalOrBase): def init_case(self): - self.dtype = np.int32 + self.dtype = self.in_type self.x_shape = [4, 5] self.y_shape = [2, 3, 4, 5] self.low = -100 @@ -158,6 +168,11 @@ class XPUTestLogicalXor(XPUOpTestWrapper): def set_case(self): self.op_type = 'logical_xor' + # special range for bool dtype + if self.dtype == np.dtype(np.bool): + self.low = 0 + self.high = 2 + x = np.random.randint( self.low, self.high, self.x_shape, dtype=self.dtype ) @@ -174,7 +189,7 @@ class XPUTestLogicalXor(XPUOpTestWrapper): self.outputs = {'Out': out} def init_case(self): - self.dtype = np.int64 + self.dtype = self.in_type self.x_shape = [2, 3, 4, 5] self.y_shape = [2, 3, 4, 5] self.low = -100 @@ -188,7 +203,7 @@ class XPUTestLogicalXor(XPUOpTestWrapper): class XPUTestLogicalXorCase1(XPUTestLogicalXorBase): def init_case(self): - self.dtype = np.int32 + self.dtype = self.in_type self.x_shape = [4, 5] self.y_shape = [2, 3, 4, 5] self.low = -100 @@ -214,6 +229,11 @@ class XPUTestLogicalNot(XPUOpTestWrapper): def set_case(self): self.op_type = 'logical_not' + # special range for bool dtype + if self.dtype == np.dtype(np.bool): + self.low = 0 + self.high = 2 + x = np.random.randint( self.low, self.high, self.x_shape, dtype=self.dtype ) @@ -224,7 +244,7 @@ class XPUTestLogicalNot(XPUOpTestWrapper): self.outputs = {'Out': out} def init_case(self): - self.dtype = np.int32 + self.dtype = self.in_type self.x_shape = [2, 3, 4, 5] 
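# The special-cased range for bool exists because np.random.randint bounds
# must fit the requested dtype: only {0, 1} are representable as bool, so
# low/high become 0 and 2 (high is exclusive). Note also that np.bool, used
# in these checks, has been a deprecated alias of the builtin bool since
# NumPy 1.20 and was removed in 1.24; np.bool_ is the stable spelling.
import numpy as np

x = np.random.randint(0, 2, size=(2, 3), dtype=np.bool_)   # valid bool draw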
self.low = -100 self.high = 100 diff --git a/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py old mode 100644 new mode 100755 index d8ca8978843cea2e74e410548f478e5fbc7239dc..a9d95fc963ce338dd06787d13ea26514dc9b4855 --- a/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py @@ -477,7 +477,7 @@ class TestSundryAPI(unittest.TestCase): self.assertEqual(x.grad.shape, [2, 3]) self.assertEqual(out.grad.shape, [2]) - def test_scatter_1D(self): + def _test_scatter_1D(self): x = paddle.to_tensor([1.0, 3.0, 5.0, 7.0, 9.0], stop_gradient=False) index = paddle.full([], 2, 'int64') updates = paddle.full([], 4.0) @@ -488,7 +488,7 @@ class TestSundryAPI(unittest.TestCase): self.assertEqual(out.numpy()[2], 4) self.assertEqual(out.grad.shape, [5]) - def test_scatter_XD(self): + def _test_scatter_XD(self): x = paddle.to_tensor( [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], stop_gradient=False ) @@ -592,6 +592,29 @@ class TestSundryAPI(unittest.TestCase): np.testing.assert_array_equal(out3_1.numpy(), out3_2.numpy()) np.testing.assert_array_equal(out3_2.numpy(), np.asarray(1)) + def test_cumsum(self): + x1 = paddle.rand([]) + x1.stop_gradient = False + + out1 = paddle.cumsum(x1) + out2 = paddle.cumsum(x1, axis=0) + out3 = paddle.cumsum(x1, axis=-1) + + out1.retain_grads() + out2.retain_grads() + out3.retain_grads() + + out1.backward() + out2.backward() + out3.backward() + + self.assertEqual(out1.shape, [1]) + self.assertEqual(out1.grad.shape, [1]) + self.assertEqual(out2.shape, []) + self.assertEqual(out2.grad.shape, []) + self.assertEqual(out3.shape, []) + self.assertEqual(out3.grad.shape, []) + def test_add_n(self): x1 = paddle.rand([]) x1.stop_gradient = False @@ -780,6 +803,27 @@ class TestSundryAPI(unittest.TestCase): y = paddle.full([], 0.6) self.assertFalse(paddle.allclose(x, y)) + def test_equalall(self): + x = paddle.full([], 0.5) + y = paddle.full([], 0.6) + out = paddle.equal_all(x, y) + self.assertEqual(out.shape, []) + self.assertFalse(out) + + def test_maseked_select(self): + x = paddle.rand([]) + x.stop_gradient = False + mask = paddle.full([], True, dtype='bool') + y = paddle.masked_select(x, mask) + + y.retain_grads() + y.backward() + self.assertEqual(y.shape, [1]) + self.assertEqual(y.numpy(), x.numpy()) + self.assertEqual(y.grad.shape, [1]) + self.assertEqual(x.grad.shape, []) + self.assertEqual(x.grad.numpy(), 1) + # Use to test API whose zero-dim input tensors don't have grad and not need to test backward in OpTest. 
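# equal_all reduces its comparison to a single scalar result, and the XPU
# zero-dim suite now pins that down as a 0-D bool tensor. A sketch
# (illustrative):
import paddle

x = paddle.full([], 0.5)
y = paddle.full([], 0.6)
out = paddle.equal_all(x, y)   # one 0-D bool, regardless of input shape
assert out.shape == [] and not bool(out)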
diff --git a/python/paddle/incubate/autograd/composite_rules.py b/python/paddle/incubate/autograd/composite_rules.py index 456ac20db2e5e1b24039403c3d515474e1ccb730..23bf8f0f7e3bff0e51eea216aac5d322099b1121 100644 --- a/python/paddle/incubate/autograd/composite_rules.py +++ b/python/paddle/incubate/autograd/composite_rules.py @@ -29,7 +29,9 @@ def _composite(op, *args): @REGISTER_COMPOSITE('softmax') def softmax_composite(x, axis): """define composite rule of op softmax""" - molecular = exp(x) - denominator = broadcast_to(sum(molecular, axis=axis, keepdim=True), x.shape) + max_temp = max(x, axis, keepdim=True) + max_temp.stop_gradient = True + molecular = exp(x - max_temp) + denominator = sum(molecular, axis=axis, keepdim=True) res = divide(molecular, denominator) return res diff --git a/python/paddle/incubate/autograd/primapi.py b/python/paddle/incubate/autograd/primapi.py index 7cfabdd9e5551627ba6ce55e81486702fffb032d..76e0802194272927c6318bba7def02e67314cdfd 100644 --- a/python/paddle/incubate/autograd/primapi.py +++ b/python/paddle/incubate/autograd/primapi.py @@ -16,7 +16,7 @@ import logging import typing import paddle -from paddle.fluid import backward, framework +from paddle.fluid import backward, core, framework from paddle.incubate.autograd import primx, utils @@ -218,13 +218,22 @@ def grad(outputs, inputs, grad_outputs=None): @framework.static_only def to_prim(blocks): """Search nonbasic ops which have be registered composite rules and replace them with primitive ops.""" + if not core.enable_prim_forward(): + return if isinstance(blocks, paddle.fluid.framework.Block): logging.info("Atomize composite op to primitive ops begin.") - primx._lower_composite(blocks) - return + main_program = blocks.program elif isinstance(blocks, typing.Sequence): for item in blocks: - to_prim(item) - return + if not isinstance(item, paddle.fluid.framework.Block): + raise TypeError( + f"Expect block or sequence of blocks, but sequence contains {type(item)}." + ) + main_program = blocks[0].program else: - raise TypeError + raise TypeError( + f"Expect block or sequence of blocks, but got {type(blocks)}." + ) + with framework.program_guard(main_program): + primx._lower_composite(blocks) + return diff --git a/python/paddle/incubate/autograd/primitives.py b/python/paddle/incubate/autograd/primitives.py index 371746bf349c926fabf831a7dcf583f911de2eb5..a9ec324c05a7a1fdb36b4a7849689d4900208c52 100644 --- a/python/paddle/incubate/autograd/primitives.py +++ b/python/paddle/incubate/autograd/primitives.py @@ -39,6 +39,8 @@ from paddle.tensor import log1p # noqa: F401 from paddle.tensor import logcumsumexp # noqa: F401 from paddle.tensor import logit # noqa: F401 from paddle.tensor import logsumexp # noqa: F401 +from paddle.tensor import max # noqa: F401 +from paddle.tensor import min # noqa: F401 from paddle.tensor import multiply # noqa: F401 from paddle.tensor import pow # noqa: F401 from paddle.tensor import prod # noqa: F401 @@ -73,6 +75,8 @@ math_op = [ 'logsumexp', 'logcumsumexp', 'logit', + 'max', + 'min', ] trigonometric_op = [ diff --git a/python/paddle/incubate/autograd/primx.py b/python/paddle/incubate/autograd/primx.py index 6f2d4d9d5213c768aad2e354c247cf5968ef6c2d..c472137ab71691e87bb1e138cf39a143055b89dc 100644 --- a/python/paddle/incubate/autograd/primx.py +++ b/python/paddle/incubate/autograd/primx.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
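# The rewritten softmax composite rule subtracts the per-axis max (with
# stop_gradient set, so autograd treats it as a constant) before
# exponentiating. Softmax is shift-invariant, and after the shift the largest
# exponent is exp(0) == 1, so the computation cannot overflow. The same
# computation in a standalone NumPy sketch, independent of the rule above:
import numpy as np

def stable_softmax(x, axis=-1):
    m = np.max(x, axis=axis, keepdims=True)  # the shift; constant for autograd
    e = np.exp(x - m)                        # bounded above by 1
    return e / np.sum(e, axis=axis, keepdims=True)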
+import logging +import typing from collections import OrderedDict import paddle @@ -575,90 +577,101 @@ def _lower_composite(block, blacklist=[]): return_list.append(x) return return_list - # Step1: Do some preparatory work for lower - lower_fn = _composite - lookup_fn = lookup_composite - - value_table = {} - to_bind = {} - to_bind_rev = {} - for var in block.desc.all_vars(): - value_table[var.name()] = block.var(var.name()) - - ops_to_remove = [] - vars_to_remove = set() - - # Step2: Process all ops in the target block - for op_idx in range(len(block.ops)): - op = block.ops[op_idx] - ops_to_remove.append(op_idx) - if lookup_fn(op.type) is not None and op.type not in blacklist: - input_args = prepare_python_api_arguments(op) - bind(input_args, to_bind, value_table) - - for orig_out, new_out in zip( - expand_nested_list(get_output_var_list(op)), - expand_nested_list(as_tensors(lower_fn(op, *input_args))), - ): - assert not (orig_out is None) ^ ( - new_out is None - ), "orig_out and new_out should match." - vars_to_remove.add(new_out.name) - value_table[new_out.name] = new_out - to_bind[orig_out.name] = new_out.name - to_bind_rev[new_out.name] = orig_out.name - else: - inputs = {} - for i in range(len(op.input_names)): - inputs[op.input_names[i]] = bind_name( - op.input(op.input_names[i]), to_bind - ) - - outputs = {} - for i in range(len(op.output_names)): - outputs[op.output_names[i]] = op.output(op.output_names[i]) - - attrs = {} - for name in sorted(op.attr_names): - attrs[name] = op.attr(name) - from paddle.fluid.dygraph.base import param_guard - - new_op_desc = block.desc.append_op() - with param_guard(inputs), param_guard(outputs): - op = Operator( - block=block, - desc=new_op_desc, - type=op.type, - inputs=inputs, - outputs=outputs, - attrs=attrs, - ) - block.ops.append(op) - - # Step3: Do some post-processing work - for op_idx in reversed(ops_to_remove): - block.desc._remove_op(op_idx, op_idx + 1) - del block.ops[op_idx] - block._sync_with_cpp() - - for op_idx in range(len(block.ops)): - op = block.ops[op_idx] - for in_name in op.input_arg_names: - if in_name in to_bind_rev: - op._rename_input(in_name, to_bind_rev[in_name]) - - for out_name in op.output_arg_names: - if out_name in to_bind_rev: - op._rename_output(out_name, to_bind_rev[out_name]) + if isinstance(block, paddle.fluid.framework.Block): + logging.info("Atomize composite op to primitive ops begin.") + + # Step1: Do some preparatory work for lower + lower_fn = _composite + lookup_fn = lookup_composite + + value_table = {} + to_bind = {} + to_bind_rev = {} + for var in block.desc.all_vars(): + value_table[var.name()] = block.var(var.name()) + + ops_to_remove = [] + vars_to_remove = set() + + # Step2: Process all ops in the target block + for op_idx in range(len(block.ops)): + op = block.ops[op_idx] + ops_to_remove.append(op_idx) + if lookup_fn(op.type) is not None and op.type not in blacklist: + input_args = prepare_python_api_arguments(op) + bind(input_args, to_bind, value_table) + + for orig_out, new_out in zip( + expand_nested_list(get_output_var_list(op)), + expand_nested_list(as_tensors(lower_fn(op, *input_args))), + ): + assert not (orig_out is None) ^ ( + new_out is None + ), "orig_out and new_out should match." 
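# to_prim and _lower_composite now share one dispatch contract: a Block is
# lowered in place, a sequence of blocks is walked element by element, and
# anything else raises TypeError naming the offending type. The idiom,
# distilled (lower_one is a hypothetical stand-in for the lowering body):
import typing

from paddle.fluid.framework import Block

def _dispatch(blocks, lower_one):
    if isinstance(blocks, Block):
        lower_one(blocks)
    elif isinstance(blocks, typing.Sequence):
        for item in blocks:
            _dispatch(item, lower_one)
    else:
        raise TypeError(
            f"Expect block or sequence of blocks, but got {type(blocks)}."
        )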
+ vars_to_remove.add(new_out.name) + value_table[new_out.name] = new_out + to_bind[orig_out.name] = new_out.name + to_bind_rev[new_out.name] = orig_out.name + else: + inputs = {} + for i in range(len(op.input_names)): + inputs[op.input_names[i]] = bind_name( + op.input(op.input_names[i]), to_bind + ) + + outputs = {} + for i in range(len(op.output_names)): + outputs[op.output_names[i]] = op.output(op.output_names[i]) + + attrs = {} + for name in sorted(op.attr_names): + attrs[name] = op.attr(name) + from paddle.fluid.dygraph.base import param_guard + + new_op_desc = block.desc.append_op() + with param_guard(inputs), param_guard(outputs): + op = Operator( + block=block, + desc=new_op_desc, + type=op.type, + inputs=inputs, + outputs=outputs, + attrs=attrs, + ) + block.ops.append(op) + + # Step3: Do some post-processing work + for op_idx in reversed(ops_to_remove): + block.desc._remove_op(op_idx, op_idx + 1) + del block.ops[op_idx] + block._sync_with_cpp() - for var_name in sorted(vars_to_remove): - assert ( - var_name in to_bind_rev - ), 'var_name "{}" is not in to_bind_rev.'.format(var_name) - if var_name != to_bind_rev[var_name]: - block.desc._remove_var(var_name.encode()) - del block.vars[var_name] - block._sync_with_cpp() + for op_idx in range(len(block.ops)): + op = block.ops[op_idx] + for in_name in op.input_arg_names: + if in_name in to_bind_rev: + op._rename_input(in_name, to_bind_rev[in_name]) + + for out_name in op.output_arg_names: + if out_name in to_bind_rev: + op._rename_output(out_name, to_bind_rev[out_name]) + + for var_name in sorted(vars_to_remove): + assert ( + var_name in to_bind_rev + ), 'var_name "{}" is not in to_bind_rev.'.format(var_name) + if var_name != to_bind_rev[var_name]: + block.desc._remove_var(var_name.encode()) + del block.vars[var_name] + block._sync_with_cpp() + return + + elif isinstance(block, typing.Sequence): + for item in block: + _lower_composite(item) + return + else: + raise TypeError @framework.static_only diff --git a/python/paddle/jit/dy2static/partial_program.py b/python/paddle/jit/dy2static/partial_program.py index 040ef36854f3fb9db4d0684cd861fd3febba26f0..293c8b40f7752bffa3819e459a929052780cff0e 100644 --- a/python/paddle/jit/dy2static/partial_program.py +++ b/python/paddle/jit/dy2static/partial_program.py @@ -32,6 +32,7 @@ from paddle.fluid.layers.utils import _hash_with_id, flatten, pack_sequence_as from . import logging_utils from .return_transformer import RETURN_NO_VALUE_MAGIC_NUM +from .utils import _out_grad_names, _param_grad_names __all__ = [] @@ -206,10 +207,6 @@ class PartialProgramLayer: else: return core.Scope() - @LazyInitialized - def __fake_vars(self): - return _create_fake_var() - @LazyInitialized def _double_grads(self): return self._get_double_grads(self._origin_main_program) @@ -379,46 +376,15 @@ class PartialProgramLayer: @LazyInitialized def _param_grad_names(self): - names = [] - # NOTE: `names` and `self._params` must be in the same order so that - # the param grad name can be set correctly in the run_program. - for param in self._params: - candidate = [ - var_name - for var_name in self._train_program.block(0).vars.keys() - if var_name.endswith(param.name + '@GRAD') - ] - if candidate: - names.append( - max(candidate, key=lambda name: name.count('grad/')) - ) - else: - names.append(param.name + '@GRAD') - return names + return _param_grad_names(self._train_program.desc, self._params) @LazyInitialized def _out_grad_names(self): - """ - Parse Out@GARD name from original train and infer program. 
- """ - names = [] - origin_infer_program = self._create_program(is_infer_mode=True) - origin_train_program = self._train_program - fwd_end_op_index = len(origin_infer_program.block(0).ops) - for i in range( - fwd_end_op_index + 1, - min( - fwd_end_op_index + 2 * len(self._outputs.var_ids), - len(origin_train_program.block(0).ops), - ), - 2, - ): - op = origin_train_program.block(0).ops[i] - if op.type == 'fill_constant': - var_name = op.output('Out')[0] - names.append(var_name) - - return names + return _out_grad_names( + self._train_program.desc, + self._create_program(is_infer_mode=True).desc.block(0).op_size(), + len(self._outputs.var_ids), + ) @property def program(self): @@ -604,8 +570,14 @@ class PartialProgramLayer: if isinstance(out, framework.Variable): targets.append(program.global_block().var(out.name)) - if targets and self._params: - backward.gradients(targets=targets, inputs=[]) + if targets: + enable_prim = self._build_strategy.build_cinn_pass + if enable_prim and core.enable_prim_backward(): + core.set_prim_enabled(True) + backward.gradients(targets=targets, inputs=[]) + core.set_prim_enabled(False) + else: + backward.gradients(targets=targets, inputs=[]) start_idx = len(main_program.block(0).ops) + 2 * len( self._outputs.tolist() @@ -647,7 +619,7 @@ class PartialProgramLayer: if "@GRAD" in name: var_desc = block.vars[name].desc var_base = None - if not framework._in_eager_mode_: + if not framework.global_var._in_eager_mode_: var_base = core.VarBase( var_desc.dtype(), var_desc.shape(), @@ -902,7 +874,7 @@ class PartialProgramLayer: for i, value in enumerate(flatten_inputs): if isinstance(value, np.ndarray): var = None - if not framework._in_eager_mode_: + if not framework.global_var._in_eager_mode_: var = core.VarBase( value=value, name=self._inputs[i].desc.name(), @@ -946,7 +918,7 @@ class PartialProgramLayer: if var_desc.name() in out_varbase_map: return out_varbase_map[var_desc.name()] - if not framework._in_eager_mode_: + if not framework.global_var._in_eager_mode_: var_base = core.VarBase( var_desc.dtype(), var_desc.shape(), @@ -977,7 +949,7 @@ class PartialProgramLayer: inner_scope = self._get_scope( program_id=program_id, use_scope_cache=use_scope_cache ) - if not framework._in_eager_mode_: + if not framework.global_var._in_eager_mode_: tmp_scope_vec = core.VarBase( core.VarDesc.VarType.FP32, [], @@ -1123,19 +1095,14 @@ class PartialProgramLayer: ) def _valid_vars(self, vars): - """ - Note: run_program_op.InferShape requires `X`/'Out' not be null. - But it's common in dy2static, fake varBase is created to handle the - problem. 
- """ - return vars if vars else self.__fake_vars + return vars if vars else None def _create_fake_var(): """ Create a fake_var (force on CPU) to handle empty input or output """ - if not framework._in_eager_mode_: + if not framework.global_var._in_eager_mode_: return [ core.VarBase( core.VarDesc.VarType.FP32, diff --git a/python/paddle/jit/dy2static/program_translator.py b/python/paddle/jit/dy2static/program_translator.py index 7fd6b0ce7fe004819e210bd01e158e56a27e574d..5b8493977e904b9fefdb4ae448b8df1abd499e13 100644 --- a/python/paddle/jit/dy2static/program_translator.py +++ b/python/paddle/jit/dy2static/program_translator.py @@ -18,7 +18,7 @@ import textwrap import threading import weakref -from paddle.fluid import _non_static_mode, framework +from paddle.fluid import _non_static_mode, core, framework from paddle.fluid.data_feeder import check_type from paddle.fluid.dygraph import layers from paddle.fluid.dygraph.base import param_guard, switch_to_static_graph @@ -930,6 +930,13 @@ class ConcreteProgram: self.function = function self.kwargs = kwargs + @switch_to_static_graph + def _to_prim(self): + # TODO(Aurelius84): Fix this cycle import problem + from paddle.incubate.autograd.primapi import to_prim + + to_prim(self.main_program.blocks) + @staticmethod @switch_to_static_graph def from_func_spec( @@ -1083,6 +1090,11 @@ class ProgramCache: self._recent_cache_key = None def _build_once(self, cache_key): + # TODO(Aurelius84): Need a gloabl FLAGS to enable/disable to_prim + enable_prim = cache_key.kwargs['build_strategy'].build_cinn_pass + if enable_prim and core.enable_prim_backward(): + core.set_prim_enabled(True) + concrete_program = ConcreteProgram.from_func_spec( func_spec=cache_key.function_spec, input_spec=cache_key.input_args_with_spec, @@ -1090,6 +1102,10 @@ class ProgramCache: class_instance=cache_key.class_instance, **cache_key.kwargs ) + + if enable_prim or core.enable_prim_forward() == "debug": + concrete_program._to_prim() + core.set_prim_enabled(False) return concrete_program, partial_program_from(concrete_program) def __getitem__(self, item): diff --git a/python/paddle/jit/dy2static/utils.py b/python/paddle/jit/dy2static/utils.py index 4d74c629a392683bd63e71ef2a5806f186354daf..4397728576ba755a618f706954cda16b45d3f6aa 100644 --- a/python/paddle/jit/dy2static/utils.py +++ b/python/paddle/jit/dy2static/utils.py @@ -1483,3 +1483,41 @@ def create_name_str(name_ids): names_str = ["'%s'" % (name.replace("'", "\\'")) for name in name_ids] return "(%s, )" % ','.join(names_str) + + +def _param_grad_names(program_desc, params): + """ + Parse PARAM@GARD name from original train and infer program. + """ + names = [] + # NOTE: `names` and `self._params` must be in the same order so that + # the param grad name can be set correctly in the run_program. + for param in params: + candidate = [ + var.name() + for var in program_desc.block(0).all_vars() + if var.name().endswith(param.name + '@GRAD') + ] + if candidate: + names.append(max(candidate, key=lambda name: name.count('grad/'))) + else: + names.append(param.name + '@GRAD') + + return names + + +def _out_grad_names(program_desc, fwd_end_op_index, out_size): + """ + Parse Out@GARD name from original train and infer program. 
+ """ + names = [] + for i in range( + fwd_end_op_index + 1, + min(fwd_end_op_index + 2 * out_size, program_desc.block(0).op_size()), + 2, + ): + op = program_desc.block(0).op(i) + if op.type() == 'fill_constant': + var_name = op.output('Out')[0] + names.append(var_name) + return names diff --git a/python/paddle/jit/translated_layer.py b/python/paddle/jit/translated_layer.py index 9cd30545af8450343981aeb020f451b6c2f235bb..c488c758f4a262fdb4537f8d9a2c26d6248e4bbe 100644 --- a/python/paddle/jit/translated_layer.py +++ b/python/paddle/jit/translated_layer.py @@ -33,6 +33,8 @@ from paddle.jit.dy2static.partial_program import ( add_build_strategy_for, ) +from .dy2static.utils import _out_grad_names, _param_grad_names + __all__ = [] INFER_MODEL_SUFFIX = ".pdmodel" @@ -887,28 +889,7 @@ def _construct_params_and_buffers( def _valid_vars(vars): - if vars: - return vars - if framework._in_eager_without_dygraph_check(): - return [ - core.eager.Tensor( - core.VarDesc.VarType.FP32, - [], - "Fake_var", - core.VarDesc.VarType.RAW, - False, - ) - ] - else: - return [ - core.VarBase( - core.VarDesc.VarType.FP32, - [], - "Fake_var", - core.VarDesc.VarType.RAW, - False, - ) - ] + return vars if vars else None def _run_dygraph(instance, input, program_holder): @@ -1041,6 +1022,15 @@ def _run_dygraph(instance, input, program_holder): 'program_id', _hash_with_id(trace_program, instance), ] + if not instance._is_test: + attrs.extend( + ( + 'param_grad_names', + _param_grad_names(trace_program, persistable_vars), + 'out_grad_names', + _out_grad_names(trace_program, end_op_index, len(output_vars)), + ) + ) use_interpretorcore = ( _is_enable_standalone_executor() diff --git a/python/paddle/static/amp/__init__.py b/python/paddle/static/amp/__init__.py index 01832fd536769bff0b5613d399d35d61131cad5c..795e49698f34462b4182a69d49e4275db94c70d7 100644 --- a/python/paddle/static/amp/__init__.py +++ b/python/paddle/static/amp/__init__.py @@ -19,9 +19,3 @@ from .fp16_lists import CustomOpLists, AutoMixedPrecisionLists from . import fp16_utils from .fp16_utils import fp16_guard, cast_model_to_fp16, cast_parameters_to_fp16 from . import bf16 -from .bf16 import bf16_guard - -__all__ = [] -__all__ += decorator.__all__ -__all__ += fp16_lists.__all__ -__all__ += fp16_utils.__all__ diff --git a/python/paddle/static/amp/amp_nn.py b/python/paddle/static/amp/amp_nn.py index c5e812c141d3696459c6b040c30fe7e39f02629c..0f936ae8f57b9e9731173aac4efa44593993765d 100644 --- a/python/paddle/static/amp/amp_nn.py +++ b/python/paddle/static/amp/amp_nn.py @@ -18,8 +18,6 @@ from paddle.fluid.data_feeder import check_type, check_variable_and_dtype from paddle.fluid.framework import Variable, in_dygraph_mode from paddle.fluid.layer_helper import LayerHelper -__all__ = ['check_finite_and_unscale', 'update_loss_scaling'] - def check_finite_and_unscale(x, scale, name=None, float_status=None): """ diff --git a/python/paddle/static/amp/bf16/__init__.py b/python/paddle/static/amp/bf16/__init__.py index 82b616b299447c2a297d2dd5e718e8ef4a09b085..fad4a654fd88b8743720a0e469d29c95ad4a1462 100644 --- a/python/paddle/static/amp/bf16/__init__.py +++ b/python/paddle/static/amp/bf16/__init__.py @@ -24,8 +24,3 @@ from .amp_utils import ( ) from . 
import decorator from .decorator import decorate_bf16 - -__all__ = [] -__all__ += decorator.__all__ -__all__ += amp_lists.__all__ -__all__ += amp_utils.__all__ diff --git a/python/paddle/static/amp/bf16/amp_lists.py b/python/paddle/static/amp/bf16/amp_lists.py index d1878a3367fbc7a72a9cc90cea7028a6893ce37d..5ea5beb708b89414f6aa468f18be8dc28e073277 100644 --- a/python/paddle/static/amp/bf16/amp_lists.py +++ b/python/paddle/static/amp/bf16/amp_lists.py @@ -20,8 +20,6 @@ from ..fp16_lists import black_list as black_list_fp16 from ..fp16_lists import gray_list as gray_list_fp16 from ..fp16_lists import white_list as white_list_fp16 -__all__ = ["AutoMixedPrecisionListsBF16"] - class AutoMixedPrecisionListsBF16: """ diff --git a/python/paddle/static/amp/bf16/amp_utils.py b/python/paddle/static/amp/bf16/amp_utils.py index cf8c82127b3b45f29559444a65bf9847124ccaf3..f9a813aa44d41ccff8a035b9832fa5705fd6e17d 100644 --- a/python/paddle/static/amp/bf16/amp_utils.py +++ b/python/paddle/static/amp/bf16/amp_utils.py @@ -31,14 +31,6 @@ from ..fp16_utils import ( ) from .amp_lists import AutoMixedPrecisionListsBF16 -__all__ = [ - "bf16_guard", - "rewrite_program_bf16", - "cast_model_to_bf16", - "cast_parameters_to_bf16", - "convert_float_to_uint16", -] - _logger = get_logger( __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s' ) diff --git a/python/paddle/static/amp/bf16/decorator.py b/python/paddle/static/amp/bf16/decorator.py index 20286d3eebca5fead08b8e0e1f291478e0bb2080..66963e25634f09a1f73aed6df7945d726c0b9e40 100644 --- a/python/paddle/static/amp/bf16/decorator.py +++ b/python/paddle/static/amp/bf16/decorator.py @@ -25,8 +25,6 @@ from .amp_utils import ( rewrite_program_bf16, ) -__all__ = ["decorate_bf16"] - class OptimizerWithMixedPrecision: """ diff --git a/python/paddle/static/amp/decorator.py b/python/paddle/static/amp/decorator.py index ba33f6b391b0b825ad87ae6ecbf1a14778cf4b2e..827a3a8b599f87eed1a4ed3b66f5157915f6fbe4 100644 --- a/python/paddle/static/amp/decorator.py +++ b/python/paddle/static/amp/decorator.py @@ -34,8 +34,6 @@ from .fp16_utils import ( update_role_var_grad, ) -__all__ = ["decorate"] - class OptimizerWithMixedPrecision: """ diff --git a/python/paddle/static/amp/fp16_lists.py b/python/paddle/static/amp/fp16_lists.py index b2acd0bb5156ddab063a02304dd09d199ee38c80..b3f9b0331a86c19577a09e13b54db9f6aeb57749 100644 --- a/python/paddle/static/amp/fp16_lists.py +++ b/python/paddle/static/amp/fp16_lists.py @@ -16,8 +16,6 @@ import copy from paddle.fluid import core -__all__ = ["CustomOpLists", "AutoMixedPrecisionLists"] - # lookup_table fp16 is slower than fp32, though fp16 is supported. 
_extra_unsupported_fp16_list = { 'lookup_table', diff --git a/python/paddle/static/amp/fp16_utils.py b/python/paddle/static/amp/fp16_utils.py index c9cee2ab8d25cf96909896453703acfb483087e7..281d3638ee261c9bf8dd53e1c7feee1c50968545 100644 --- a/python/paddle/static/amp/fp16_utils.py +++ b/python/paddle/static/amp/fp16_utils.py @@ -23,8 +23,6 @@ from paddle.fluid.wrapped_decorator import signature_safe_contextmanager from .fp16_lists import AutoMixedPrecisionLists -__all__ = ["fp16_guard", "cast_model_to_fp16", "cast_parameters_to_fp16"] - _logger = get_logger( __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s' ) diff --git a/python/paddle/static/nn/control_flow.py b/python/paddle/static/nn/control_flow.py index d21d95b097e3b6b813dcd7d05449ca97bdf1c537..d46c0c7c189b7508f2356e5ce24c56d74033e612 100644 --- a/python/paddle/static/nn/control_flow.py +++ b/python/paddle/static/nn/control_flow.py @@ -569,7 +569,7 @@ def case(pred_fn_pairs, default=None, name=None): This operator works like an if-elif-elif-else chain. Args: - pred_fn_pairs(list|tuple): A list or tuple of (pred, fn) pairs. ``pred`` is a boolean Tensor with shape [1], ``fn`` is a callable. All callables return the same structure of Tensors. + pred_fn_pairs(list|tuple): A list or tuple of (pred, fn) pairs. ``pred`` is a boolean Tensor whose numel should be 1 (shape [] or shape [1]), ``fn`` is a callable. All callables return the same structure of Tensors. default(callable, optional): Callable that returns a structure of Tensors. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -702,7 +702,7 @@ def switch_case(branch_index, branch_fns, default=None, name=None): This operator is like a C++ switch/case statement. Args: - branch_index(Tensor): A Tensor with shape [1] to specify which branch to execute. The data type is ``int32``, ``int64`` or ``uint8``. + branch_index(Tensor): A Tensor whose numel should be 1 (shape [] or shape [1]) to specify which branch to execute. The data type is ``int32``, ``int64`` or ``uint8``. branch_fns(dict|list|tuple): If it's a list or tuple, the elements in it could be pairs of (int, callable) or simple callables whose actual index will be used as the index of callable. If it's a dict, its key is a python integer and the value is a callable. All callables return the same structure of Tensors. default(callable, optional): Callable that returns a structure of Tensors. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -910,9 +910,9 @@ def cond(pred, true_fn=None, false_fn=None, name=None, return_names=None): branch will be executed during runtime. Args: - pred(Tensor): A boolean tensor whose numel should be 1. The boolean - value determines whether to return the result of ``true_fn`` or - ``false_fn`` . + pred(Tensor): A boolean tensor whose numel should be 1 (shape [] + or shape [1]). The boolean value determines whether to return the + result of ``true_fn`` or ``false_fn`` . true_fn(callable, optional): A callable to be performed if ``pred`` is true. The default value is ``None`` . 
false_fn(callable, optional): A callable to be performed if ``pred`` is @@ -969,7 +969,7 @@ def cond(pred, true_fn=None, false_fn=None, name=None, return_names=None): if _non_static_mode(): assert isinstance(pred, Variable), "The pred in cond must be Variable" assert pred.size == 1, "condition input's numel should be 1" - pred = pred.numpy()[0] + pred = pred.numpy().item() if pred: if true_fn is not None: if not callable(true_fn): diff --git a/python/paddle/static/quantization/post_training_quantization.py b/python/paddle/static/quantization/post_training_quantization.py index 024c227bcae7799add5e42752f2f57106f92bdd0..6176200128c6941f24bddb7b1600f7a7aea8a1d7 100644 --- a/python/paddle/static/quantization/post_training_quantization.py +++ b/python/paddle/static/quantization/post_training_quantization.py @@ -789,7 +789,7 @@ class PostTrainingQuantization: _logger.info("MSE searching stage ...") for var_name in self._quantized_act_var_name: var_tensor = utils.load_variable_data(self._scope, var_name) - if not var_tensor.any(): + if var_tensor.size == 0: self._zero_size_var_names.add(var_name) continue var_tensor = var_tensor.flatten() @@ -843,7 +843,7 @@ class PostTrainingQuantization: _logger.info("EMD searching stage ...") for var_name in self._quantized_act_var_name: var_tensor = utils.load_variable_data(self._scope, var_name) - if not var_tensor.any(): + if var_tensor.size == 0: self._zero_size_var_names.add(var_name) continue var_tensor = var_tensor.flatten() @@ -899,7 +899,7 @@ class PostTrainingQuantization: for var_name in self._quantized_act_var_name: var_tensor = utils.load_variable_data(self._scope, var_name) - if not var_tensor.any(): + if var_tensor.size == 0: self._zero_size_var_names.add(var_name) continue abs_max_value = float(np.max(np.abs(var_tensor))) @@ -940,7 +940,7 @@ class PostTrainingQuantization: for var_name in self._quantized_act_var_name: var_tensor = utils.load_variable_data(self._scope, var_name) - if not var_tensor.any(): + if var_tensor.size == 0: self._zero_size_var_names.add(var_name) continue abs_max_value = float(np.max(np.abs(var_tensor))) @@ -975,7 +975,7 @@ class PostTrainingQuantization: for var_name in self._quantized_act_var_name: var_tensor = utils.load_variable_data(self._scope, var_name) - if not var_tensor.any(): + if var_tensor.size == 0: self._zero_size_var_names.add(var_name) continue min_value = float(np.min(var_tensor)) @@ -992,7 +992,7 @@ class PostTrainingQuantization: def _sample_histogram(self): for var_name in self._quantized_act_var_name: var_tensor = utils.load_variable_data(self._scope, var_name) - if (not var_tensor.any()) or ( + if (var_tensor.size == 0) or ( var_name not in self._sampling_act_histogram ): self._zero_size_var_names.add(var_name) @@ -1031,7 +1031,7 @@ class PostTrainingQuantization: for var_name in self._quantized_act_var_name: var_tensor = utils.load_variable_data(self._scope, var_name) - if not var_tensor.any(): + if var_tensor.size == 0: self._zero_size_var_names.add(var_name) continue abs_max_value = float(np.max(np.abs(var_tensor))) @@ -1094,7 +1094,7 @@ class PostTrainingQuantization: ''' for var_name in self._quantized_act_var_name: var_tensor = utils.load_variable_data(self._scope, var_name) - if not var_tensor.any(): + if var_tensor.size == 0: self._zero_size_var_names.add(var_name) continue var_tensor = np.abs(var_tensor) diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index a5492d508103736ef44c8bb4586254ca2910838f..4cce1b01968a196250d9346ec4a4e173e21f6892 100644 --- 
a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -1296,7 +1296,7 @@ def t(input, name=None): "tensor.transpose() instead." % len(input.shape) ) if in_dygraph_mode(): - if len(input.shape) == 1: + if len(input.shape) <= 1: return input # 2-D tensor perm = [1, 0] @@ -1313,7 +1313,7 @@ def t(input, name=None): helper = LayerHelper('t', **locals()) out = helper.create_variable_for_type_inference(input.dtype) input_shape = helper.create_variable_for_type_inference(input.dtype) - if len(input.shape) == 1: + if len(input.shape) <= 1: out = input else: helper.append_op( diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py index 36e3de3f53d449a7afb3859e17260753112b66ff..375f3614e5e30c827a15f25b21d9897f73805002 100755 --- a/python/paddle/tensor/logic.py +++ b/python/paddle/tensor/logic.py @@ -17,11 +17,11 @@ import paddle from ..fluid.data_feeder import check_type, check_variable_and_dtype -from ..fluid.framework import _in_eager_mode_ +from ..fluid.framework import global_var from ..static import Variable from .layer_function_generator import templatedoc -if _in_eager_mode_: +if global_var._in_eager_mode_: Tensor = paddle.fluid.framework.core.eager.Tensor else: from ..framework import VarBase as Tensor diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index 0a898caa3fb2f38791ca2e73215674d7d95ceefa..f5c57a312d84583bcf367ccf85666ce80363a2ec 100755 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -623,6 +623,7 @@ HIGH_PARALLEL_JOB_NEW = [ 'test_dataset_consistency_inspection', 'test_cuda_empty_cache', 'test_cuda_graph', + 'test_cuda_graph_static_mode', 'test_disable_signal_handler', 'test_eig_op', 'test_eigh_op', @@ -2509,6 +2510,7 @@ TETRAD_PARALLEL_JOB = [ 'test_dlpack', 'test_complex_variable', 'test_cuda_graph', + 'test_cuda_graph_static_mode', 'test_custom_grad_input', 'test_accuracy_op', 'test_pool1d_api',
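A note on the `_out_grad_names` helper moved into python/paddle/jit/dy2static/utils.py above: after the backward section is appended, each forward output's initial gradient is materialized by a `fill_constant` op, and the helper picks those ops up by scanning every second op past the forward section. The sketch below is a minimal, Paddle-free mock of that scan; `FakeOp` and the op layout are assumptions for illustration, not Paddle's API.

    # Illustrative-only mock of the scan in `_out_grad_names`; `FakeOp` and
    # the op layout below are assumptions, not Paddle's API.
    class FakeOp:
        def __init__(self, op_type, out_name=None):
            self._type = op_type
            self._out = out_name

        def type(self):
            return self._type

        def output(self, slot):
            return [self._out]

    # Forward ops first, then the appended backward section, where each
    # forward output's initial gradient comes from a `fill_constant` op.
    ops = [
        FakeOp('matmul_v2'),                    # 0: forward
        FakeOp('relu'),                         # 1: forward
        FakeOp('scale'),                        # 2: fwd_end_op_index
        FakeOp('fill_constant', 'out_0@GRAD'),  # 3: init grad of output 0
        FakeOp('relu_grad'),                    # 4
        FakeOp('fill_constant', 'out_1@GRAD'),  # 5: init grad of output 1
        FakeOp('matmul_v2_grad'),               # 6
    ]
    fwd_end_op_index, out_size = 2, 2

    names = []
    for i in range(
        fwd_end_op_index + 1,
        min(fwd_end_op_index + 2 * out_size, len(ops)),
        2,
    ):
        op = ops[i]
        if op.type() == 'fill_constant':
            names.append(op.output('Out')[0])

    print(names)  # ['out_0@GRAD', 'out_1@GRAD']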
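Similarly, `_param_grad_names` has to choose among several scoped names when a parameter's gradient appears more than once in the program; its `max(..., key=...)` picks the candidate with the most 'grad/' prefixes, i.e. the most deeply nested backward scope. A minimal sketch with hypothetical variable names:

    # Hypothetical gradient names for one parameter; the deepest-scoped
    # name (most 'grad/' prefixes) is the one `_param_grad_names` keeps.
    candidate = [
        'linear_0.w_0@GRAD',
        'grad/linear_0.w_0@GRAD',
        'grad/grad/linear_0.w_0@GRAD',
    ]
    print(max(candidate, key=lambda name: name.count('grad/')))
    # -> grad/grad/linear_0.w_0@GRAD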
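The `pred.numpy()[0]` to `pred.numpy().item()` change in `cond` (python/paddle/static/nn/control_flow.py) is what makes the relaxed "shape [] or shape [1]" contract in the docstrings workable: indexing a 0-D array with `[0]` raises `IndexError`, while `.item()` extracts the scalar from either layout. A numpy-only demonstration:

    import numpy as np

    pred_0d = np.array(True)    # shape (), numel 1 -- the new 0-D case
    pred_1d = np.array([True])  # shape (1,), numel 1 -- the old case

    print(pred_0d.item(), pred_1d.item())  # True True -- works for both
    print(pred_1d[0])                      # True -- 1-D indexing still works
    try:
        pred_0d[0]                         # 0-D arrays cannot be indexed
    except IndexError as e:
        print("IndexError:", e)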
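The repeated `not var_tensor.any()` to `var_tensor.size == 0` change in post_training_quantization.py fixes a real bug, not just style: `any()` is also False for a non-empty, all-zero activation, so such tensors were wrongly treated as zero-size and skipped during sampling. A numpy-only demonstration:

    import numpy as np

    all_zero = np.zeros((2, 3), dtype=np.float32)  # non-empty, all zeros
    empty = np.empty((0, 3), dtype=np.float32)     # genuinely empty (size 0)

    # Old guard: an all-zero activation looks the same as an empty one.
    print(not all_zero.any())  # True  -> all-zero tensors were skipped
    print(not empty.any())     # True

    # New guard: only tensors with zero elements are skipped.
    print(all_zero.size == 0)  # False -> all-zero tensors are now sampled
    print(empty.size == 0)     # True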
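Finally, relaxing `len(input.shape) == 1` to `<= 1` in `paddle.t` lets 0-D tensors pass through unchanged, matching the existing 1-D behavior. A usage sketch, assuming a build with this patch applied and 0-D tensor support enabled; the values are illustrative:

    import paddle

    x0 = paddle.full([], 3.14)         # 0-D tensor, shape []
    x1 = paddle.to_tensor([1.0, 2.0])  # 1-D tensor, shape [2]
    x2 = paddle.ones([2, 3])           # 2-D tensor

    print(paddle.t(x0).shape)  # []      -- now returned unchanged
    print(paddle.t(x1).shape)  # [2]     -- unchanged, as before
    print(paddle.t(x2).shape)  # [3, 2]  -- transposed, as before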