diff --git a/.gitignore b/.gitignore index 5018bf56c1633237b98d29a66eb86aed41fa6891..ce0cd3bc27b6225a8e6e24a8331022e6224603ac 100644 --- a/.gitignore +++ b/.gitignore @@ -51,3 +51,5 @@ paddle/infrt/dialect/pd_ops_info.h .lit_test_times.txt paddle/infrt/tests/dialect/Output paddle/infrt/tests/lit.cfg.py +paddle/fluid/pybind/eager_final_state_op_function_impl.h +paddle/fluid/pybind/tmp_eager_final_state_op_function_impl.h diff --git a/CMakeLists.txt b/CMakeLists.txt index e8321010d389ee2493ef35d74d5d75d3ea73bfe9..a4c1b9c8098e9e632a4a05c491e07b1ce051c945 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -243,6 +243,7 @@ option(NEW_RELEASE_JIT "PaddlePaddle next-level release strategy for backup ji option(WITH_ASCEND_INT64 "Compile with int64 kernel for ascend NPU" OFF) option(WITH_POCKETFFT "Compile with pocketfft support" ON) option(WITH_RECORD_BUILDTIME "Compile PaddlePaddle with record all targets build time" OFF) +option(WITH_CUSTOM_DEVICE "Compile with custom device support" OFF) if(WITH_RECORD_BUILDTIME) set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh") @@ -265,6 +266,10 @@ if(SANITIZER_TYPE AND NOT "${SANITIZER_TYPE}" MATCHES "^(Address|Leak|Memory|Thr return() endif() +if (LINUX AND NOT WITH_CUSTOM_DEVICE AND NOT ON_INFER) +set(WITH_CUSTOM_DEVICE ON) +endif() + if(WIN32) if(WITH_DISTRIBUTE) MESSAGE(WARNING diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 9ebde06bd01ab9968b9cc53a3e38a2b2e1684fc4..20a35c91bdde1d606cef2b46ad8aabb5952bd7d8 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -219,3 +219,7 @@ endif(ON_INFER) if(WITH_CRYPTO) add_definitions(-DPADDLE_WITH_CRYPTO) endif(WITH_CRYPTO) + +if(WITH_CUSTOM_DEVICE AND NOT WIN32) + add_definitions(-DPADDLE_WITH_CUSTOM_DEVICE) +endif() diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index e9180c4fc9bb43cd2070e5bc93c74c7a9ee6510a..b099831738599ef4aaedd444d0a5d3721bd1aba8 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -55,6 +55,7 @@ IF(NOT WIN32) INSTALL_COMMAND make install NO_SHARED=1 NO_LAPACK=1 PREFIX= UPDATE_COMMAND "" CONFIGURE_COMMAND "" + BUILD_BYPRODUCTS ${CBLAS_LIBRARIES} ) ELSE(NOT WIN32) SET(CBLAS_LIBRARIES @@ -83,6 +84,8 @@ ELSE(NOT WIN32) CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${CBLAS_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + # ninja need to know where openblas.lib comes from + BUILD_BYPRODUCTS ${CBLAS_LIBRARIES} ) SET(OPENBLAS_SHARED_LIB ${CBLAS_INSTALL_DIR}/bin/openblas${CMAKE_SHARED_LIBRARY_SUFFIX}) ENDIF(NOT WIN32) diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.cc b/paddle/fluid/distributed/fleet_executor/dist_model.cc index 40b0a8b55e17a2eca26bb2c4d94221054724c530..941d470f87935f95abe5d599c9b7fa7a2730228b 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.cc +++ b/paddle/fluid/distributed/fleet_executor/dist_model.cc @@ -53,7 +53,6 @@ bool LoadDataFromDistModelTensor(const DistModelTensor &input_data, } else if (input_data.dtype == DistModelDataType::INT32) { input_tensor_ptr = input_tensor->mutable_data(dims, place); } else { - // Q(fleet exe dev): for input/output, should we support fp16 LOG(ERROR) << "unsupported feed type " << input_data.dtype; return false; } @@ -113,14 +112,6 @@ std::string DistModelDTypeToString(DistModelDataType dtype) { return "NOT SUPPORT DTYPE"; } -bool IsPPFirstStage(const DistModelConfig &config) { - return config.local_rank - config.mp_degree 
< 0; -} - -bool IsPPLastStage(const DistModelConfig &config) { - return config.local_rank + config.mp_degree >= config.nranks; -} - class DistModelTimer { public: void tic() { tic_time = std::chrono::high_resolution_clock::now(); } @@ -197,65 +188,34 @@ bool DistModel::PreparePlace() { } bool DistModel::CommInit() { - // NOTE (Yuang Liu): The peer endpoints will be obtained with the assumption - // that mp part is always on inner side and pp part is always on outer side. - // TODO(fleet exe dev): The peer endpoints could be configured by users. - PADDLE_ENFORCE_EQ( - config_.pp_degree * config_.mp_degree, config_.nranks, - platform::errors::InvalidArgument( - "The mp_degree multiplies pp_degree is not equal with nranks")); std::unique_ptr comm_init_program( new framework::ProgramDesc()); framework::BlockDesc *comm_init_block = comm_init_program->MutableBlock(0); - if (config_.mp_degree > 1) { - PADDLE_ENFORCE_GE( - config_.mp_ring_id, 0, - platform::errors::InvalidArgument( - "mp ring id must be provided for inference under mp.")); - VLOG(3) << "Init comm group for mp."; + std::vector &ring_ids = + config_.rank_to_ring_ids_[config_.local_rank]; + int64_t order = 0; + std::string var_name_base = "comm_init_"; + for (int64_t ring_id : ring_ids) { + VLOG(3) << "Init comm for ring id: " << ring_id; + int64_t ranks_in_group = config_.ring_id_to_ranks_[ring_id].size(); + int64_t rank_in_group = 0; + std::vector &ranks = config_.ring_id_to_ranks_[ring_id]; + for (int64_t rank : ranks) { + if (config_.local_rank == rank) { + break; + } + rank_in_group += 1; + } std::vector peer_endpoints; - for (int64_t - idx = (config_.local_rank / config_.mp_degree) * config_.mp_degree, - i = 0; - i < config_.mp_degree; ++idx, ++i) { - if (config_.trainer_endpoints[idx] == config_.current_endpoint) { + for (int64_t rank : ranks) { + if (config_.local_rank == rank) { continue; } - peer_endpoints.emplace_back(config_.trainer_endpoints[idx]); - } - // get nranks in a mp group and inner group rank for local rank - int64_t mp_group_nranks = config_.nranks / config_.pp_degree; - int64_t mp_group_rank = config_.local_rank % config_.mp_degree; - InsertCommOp("mp_comm_id", mp_group_nranks, mp_group_rank, peer_endpoints, - comm_init_block, config_.mp_ring_id); - } - if (config_.pp_degree > 1) { - VLOG(3) << "Init comm group for pp."; - if (!IsPPFirstStage(config_)) { - PADDLE_ENFORCE_EQ(config_.pp_upstream_ring_id >= 0, true, - platform::errors::InvalidArgument( - "pp upstream ring id must be provided for " - "non-first pp stage if inference under pp.")); - // not the first pp stage, has upstream - std::vector upstream_peer_endpoints; - upstream_peer_endpoints.emplace_back( - config_.trainer_endpoints[config_.local_rank - config_.mp_degree]); - InsertCommOp("pp_upstream_comm_id", 2, 1, upstream_peer_endpoints, - comm_init_block, config_.pp_upstream_ring_id); - } - - if (!IsPPLastStage(config_)) { - PADDLE_ENFORCE_EQ(config_.pp_downstream_ring_id >= 0, true, - platform::errors::InvalidArgument( - "pp downstream ring id must be provided for " - "non-last pp stage if inference under pp.")); - // not the last pp stage, has downstream - std::vector downstream_peer_endpoints; - downstream_peer_endpoints.emplace_back( - config_.trainer_endpoints[config_.local_rank + config_.mp_degree]); - InsertCommOp("pp_downstream_comm_id", 2, 0, downstream_peer_endpoints, - comm_init_block, config_.pp_downstream_ring_id); + peer_endpoints.emplace_back(config_.trainer_endpoints[rank]); } + InsertCommOp(var_name_base + std::to_string(order), 
ranks_in_group, + rank_in_group, peer_endpoints, comm_init_block, ring_id); + order += 1; } framework::NaiveExecutor e(place_); e.CreateVariables(*comm_init_program, 0, true, scope_.get()); @@ -409,12 +369,7 @@ bool DistModel::LoadParameters() { bool DistModel::PrepareFleetExe() { task_node_.reset(new TaskNode(program_.get(), config_.local_rank)); - if (config_.local_rank - config_.mp_degree >= 0) { - task_node_->AddUpstreamTask(config_.local_rank - config_.mp_degree); - } - if (config_.local_rank + config_.mp_degree < config_.nranks) { - task_node_->AddDownstreamTask(config_.local_rank + config_.mp_degree); - } + // With auto cut, there is no concept of pp, no need to add dependency. task_node_->SetType("Compute"); task_node_->Init(); executor_desc_ = FleetExecutorDesc(); @@ -473,40 +428,13 @@ bool DistModel::PrepareFeedAndFetch() { } } - if (config_.pp_degree == 1) { - if (feeds_.size() == 0) { - LOG(ERROR) << "No feed ops in the inf program, please check the program."; - return false; - } - if (fetches_.size() == 0) { - LOG(ERROR) << "No fetch op in the inf program, please check the program."; - return false; - } - } else { - if (IsPPFirstStage(config_)) { - if (feeds_.size() == 0) { - LOG(ERROR) << "Feed ops are needed for the first pp stage."; - return false; - } - } else { - if (feeds_.size() > 0) { - LOG(WARNING) << "Feed op is found in the non-first stage of pp."; - } else { - LOG(INFO) << "No feed ops in non-first pp stage."; - } - } - if (IsPPLastStage(config_)) { - if (fetches_.size() == 0) { - LOG(WARNING) << "No fetch op was found in the last pp stage. Make sure " - "the result has been sent to frist pp stage."; - } - } else { - if (fetches_.size() > 0) { - LOG(WARNING) << "Fetch op is found in the non-last stage of pp."; - } else { - LOG(INFO) << "No fetch op in non-last pp stage."; - } - } + if (feeds_.size() == 0) { + LOG(ERROR) << "No feed ops in the inf program, please check the program."; + return false; + } + if (fetches_.size() == 0) { + LOG(ERROR) << "No fetch op in the inf program, please check the program."; + return false; } return true; } @@ -606,7 +534,6 @@ bool DistModel::FetchResult(const framework::LoDTensor &fetch, bool DistModel::Run(const std::vector &input_data, std::vector *output_data) { - // TODO(fleet exe dev): support pipeline inf mode VLOG(3) << "DistModel run for once."; DistModelTimer timer; diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.h b/paddle/fluid/distributed/fleet_executor/dist_model.h index c980178b67c5244e751a8e89b945f353110a7456..d0203c131357c749b7df20a345982d2ddd025783 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.h +++ b/paddle/fluid/distributed/fleet_executor/dist_model.h @@ -13,6 +13,7 @@ // limitations under the License. 
#pragma once +#include #include #include #include @@ -47,12 +48,9 @@ struct DistModelConfig { std::string current_endpoint{}; int64_t nranks{1}; int64_t local_rank{0}; - int64_t mp_degree{1}; - int64_t pp_degree{1}; - int64_t mp_ring_id{-1}; - int64_t pp_upstream_ring_id{-1}; - int64_t pp_downstream_ring_id{-1}; bool enable_timer{false}; + std::map> ring_id_to_ranks_{}; + std::map> rank_to_ring_ids_{}; }; class DistModel { diff --git a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc index 07d2a0f6b727aa56ef804e5ca9dee8e7a86e2cdb..643ef52e87bdaff0d531a68922077a8877830a9f 100644 --- a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc @@ -25,7 +25,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/program_desc.h" -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); USE_OP(fill_constant); namespace paddle { diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index ab3b33d411c0e09f37885491e93144a2577d5c40..5dc8709679e25a48f2aa047b0404092ac8c1dc66 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -1227,11 +1227,11 @@ static std::pair GenerateForwardFunctionContents( // Forward Function Body // According to fwd_inputs_name_pos_map - std::map>> + std::map>> ins = { {"X" , TrySyncToVars(X)}, { "Y" , TrySyncToVars(Y)} }; - std::map>> + std::map>> outs = { {"Out0" , CreateVars(Out0Num)}, {"Out1" @@ -1316,7 +1316,7 @@ static std::pair GenerateForwardFunctionContents( const char* FWD_INS_MAP_TEMPLATE = " std::map>> ins = { " + "std::vector>> ins = { " "%s };\n"; std::string ins_map_str = paddle::string::Sprintf(FWD_INS_MAP_TEMPLATE, ins_contents_str); @@ -1353,8 +1353,9 @@ static std::pair GenerateForwardFunctionContents( if (op_passing_outs_map[op_type].count(output_name)) { const std::string output_var_name = output_name + "Var"; - // Pass Output from function argument(EagerTensor*/vector&), - // in form of shared_ptr/vector> + // Pass Output from function + // argument(EagerVariable*/vector&), + // in form of shared_ptr/vector> if (output.duplicable()) { const char* FWD_NUM_ARG_TEMPLATE = ", std::vector& %s"; @@ -1395,7 +1396,7 @@ static std::pair GenerateForwardFunctionContents( } else { const char* FWD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", " - "{std::make_shared(egr::Controller::Instance()." + "{std::make_shared(egr::Controller::Instance()." 
"GenerateUniqueName())}},"; outs_contents_str += paddle::string::Sprintf(FWD_OUTS_CONTENT_TEMPLATE, output_name); @@ -1407,7 +1408,7 @@ static std::pair GenerateForwardFunctionContents( const char* FWD_OUTS_MAP_TEMPLATE = " std::map>> outs = { " + "std::vector>> outs = { " "%s };\n"; std::string outs_map_str = paddle::string::Sprintf(FWD_OUTS_MAP_TEMPLATE, outs_contents_str); @@ -1482,7 +1483,7 @@ static std::pair GenerateForwardFunctionContents( generated_function_body += out_tensor_str; } generated_function_body += "\n"; - VLOG(6) << "Converted Output VarBase to EagerTensor(s)"; + VLOG(6) << "Converted Output VarBase to EagerVariable(s)"; // [Generation] Handle core_ops_returns_info core_ops_returns_info[op_type] = return_contents; @@ -1627,7 +1628,7 @@ static std::string GenerateSingleOpBase( const char* BWD_INS_MAP_TEMPLATE = " std::map>> %s = { " + "std::vector>> %s = { " "%s };\n"; std::string ins_map_str = paddle::string::Sprintf(BWD_INS_MAP_TEMPLATE, ins_name, ins_contents_str); @@ -1704,7 +1705,7 @@ static std::string GenerateSingleOpBase( } else { const char* GRAD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", " - "{std::make_shared(egr::Controller::Instance(" + "{std::make_shared(egr::Controller::Instance(" ")." "GenerateUniqueName())}},"; outs_contents_str += paddle::string::Sprintf( @@ -1723,7 +1724,7 @@ static std::string GenerateSingleOpBase( const char* BWD_OUTS_MAP_TEMPLATE = " std::map>> %s = { " + "std::vector>> %s = { " "%s };\n"; std::string outs_map_str = paddle::string::Sprintf( BWD_OUTS_MAP_TEMPLATE, outs_name, outs_contents_str); diff --git a/paddle/fluid/eager/eager_tensor.h b/paddle/fluid/eager/eager_tensor.h index 2326ab012e3caef34b6b70950dcc1088111ab9e5..19ce457df60cba5e1a1a044f0c7f43a7cbda06d9 100644 --- a/paddle/fluid/eager/eager_tensor.h +++ b/paddle/fluid/eager/eager_tensor.h @@ -40,36 +40,28 @@ * **/ namespace egr { -class EagerTensor final { +class EagerVariable final { public: /* Default constructor and name constructor should only be used for contruct * output and in fluid*/ - EagerTensor() = default; + EagerVariable() = default; - explicit EagerTensor(const std::string& name) : name_(name) {} + explicit EagerVariable(const std::string& name) : name_(name) {} - explicit EagerTensor(const paddle::experimental::Tensor& tensor) + explicit EagerVariable(const paddle::experimental::Tensor& tensor) : name_(tensor.name()) { if (tensor.defined()) { if (tensor.is_dense_tensor()) { - auto* framework_tensor = - var_.GetMutable(); - // Contruct framework::Tensor from egr::EagerTensor - auto tensor_dense = - std::dynamic_pointer_cast(tensor.impl()); - PADDLE_ENFORCE_EQ((tensor_dense.get() && tensor_dense), true, - paddle::platform::errors::Fatal( - "Failed to Trans Tensor to EagerVariable since " - "we got Tensor with type DenseTensor, and we got " - "EagerVariable with another type.")); - *framework_tensor = *tensor_dense; + ConstructVariableFromTensor(tensor); + } else if (tensor.is_selected_rows()) { + ConstructVariableFromSelectedRows(tensor); } else { PADDLE_THROW(paddle::platform::errors::Fatal( "Unrecognized egr::EagerVariable type, only " - "DenseTensor and SelectedRows is supported for now.")); + "DenseTensor and SelectedRows are supported for now.")); } } else { - VLOG(6) << "Build Empty EagerTensor with name " << name_; + VLOG(6) << "Build Empty EagerVariable with name " << name_; } } @@ -77,21 +69,20 @@ class EagerTensor final { std::shared_ptr GetTensorBase() { // Construct allocation only once. 
if (var_.IsInitialized()) { - if (var_.IsType()) { - return SetImplWithLegacyTensor(); - } else if (var_.IsType()) { - return SetImplWithLegacyTensor(); + if (var_.IsType() || + var_.IsType()) { + return SetImplWithLegacyTensor(); } else if (var_.IsType()) { - return SetImplWithSelectedRows(); + return SetImplWithLegacySelectedRows(); } else { PADDLE_THROW(paddle::platform::errors::Fatal( "Unable to fetch underlying tensor " - "from EagerTensor, only LoDTensor and " + "from EagerVariable, only LoDTensor and " "Tensor are supported for now")); } } else { PADDLE_THROW(paddle::platform::errors::Fatal( - "Can not Sync EagerTensor %s whose paddle::framework::Variable is " + "Can not Sync EagerVariable %s whose paddle::framework::Variable is " "not initialized!", name())); } @@ -107,23 +98,52 @@ class EagerTensor final { void set_name(const std::string& name) { name_ = name; } private: - template std::shared_ptr SetImplWithLegacyTensor() { - const auto& framework_tensor = var_.Get(); + const auto& framework_tensor = var_.Get(); VLOG(8) << "Sync Var to tensor for: " << name(); - return std::make_shared(std::move(framework_tensor)); + return std::make_shared(framework_tensor); } - std::shared_ptr SetImplWithSelectedRows() { - auto* selected_rows = var_.GetMutable(); - auto res = std::make_shared(selected_rows->rows_, - selected_rows->height_); - res->value_.reset(selected_rows->value_.release()); - res->id_to_index_ = std::move(selected_rows->id_to_index_); - res->rwlock_.reset(selected_rows->rwlock_.release()); + std::shared_ptr SetImplWithLegacySelectedRows() { + auto* framework_tensor = var_.GetMutable(); + VLOG(8) << "Sync SelectedRows to tensor for: " << name(); + auto res = + std::make_shared(std::move(*framework_tensor)); + var_.Clear(); return res; } + void ConstructVariableFromTensor(const paddle::experimental::Tensor& tensor) { + auto* framework_tensor = var_.GetMutable(); + // Contruct framework::Tensor from egr::EagerVariable + auto tensor_dense = + std::dynamic_pointer_cast(tensor.impl()); + PADDLE_ENFORCE_EQ( + (tensor_dense.get() && tensor_dense), true, + paddle::platform::errors::Fatal( + "Tensor %s does not hold pten::SelectedRows or pten::DenseTensor. " + "Or it holds empty impl, this should not happend since we should " + "treat all kinds of tensor as what they are.", + tensor.name())); + *framework_tensor = *tensor_dense; + } + + void ConstructVariableFromSelectedRows( + const paddle::experimental::Tensor& tensor) { + auto* framework_tensor = var_.GetMutable(); + // Contruct framework::Tensor from egr::EagerVariable + auto tensor_dense = + std::dynamic_pointer_cast(tensor.impl()); + PADDLE_ENFORCE_EQ( + (tensor_dense.get() && tensor_dense), true, + paddle::platform::errors::Fatal( + "Tensor %s does not hold pten::SelectedRows or pten::DenseTensor. 
" + "Or it holds empty impl, this should not happend since we should " + "treat all kinds of tensor as what they are.", + tensor.name())); + *framework_tensor = std::move(*tensor_dense); + } + private: std::string name_{""}; paddle::framework::Variable var_; diff --git a/paddle/fluid/eager/grad_tensor_holder.cc b/paddle/fluid/eager/grad_tensor_holder.cc index 56fdf542bbb93ec28c0dc21bacf38eedb3968bd0..6a8720c1cc27de41a91b40c29ae9d08b99ccb09e 100644 --- a/paddle/fluid/eager/grad_tensor_holder.cc +++ b/paddle/fluid/eager/grad_tensor_holder.cc @@ -78,9 +78,9 @@ void GradTensorHolder::add(size_t slot_id, size_t rank, if (buffer_tensor.is_dense_tensor()) { paddle::imperative::SelectedRowsAddToTensor(t, &buffer_tensor); } else { - PADDLE_THROW(paddle::platform::errors::Fatal( - "We don't support Selected Rows merge for now, support it later " - "and make all kinds of grads can be merged.")); + buffer_tensor = + std::move(*paddle::imperative::SelectedRowsMerge< + paddle::experimental::Tensor>(t, buffer_tensor)); } } } diff --git a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc index c27d1871e398164ad976c73919499ceed3938057..e3bb53106776604d1c2fee0a53fc6d87a9d83755 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc @@ -115,7 +115,7 @@ TEST(Tensor, MemberFunction) { CHECK_EQ(tmp_autograd_meta_test->val_, 2); } -TEST(EagerTensor, Constructor) { +TEST(EagerVariable, Constructor) { paddle::experimental::Tensor t3; pten::DenseTensorMeta meta = pten::DenseTensorMeta( pten::DataType::FLOAT32, paddle::framework::make_ddim({1, 2})); @@ -134,7 +134,7 @@ TEST(EagerTensor, Constructor) { CHECK_EQ(t3.defined(), false); t3.set_impl(dt); - egr::EagerTensor et3 = egr::EagerTensor(t3); + egr::EagerVariable et3 = egr::EagerVariable(t3); VLOG(6) << "SyncToVar"; CHECK_EQ(et3.Var().Get().data()[0], 5.0f); diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc index b771ff28d8ee2d762f5bca717942d4a57c155984..734a611d07b57b6e8e31933cf2683e60efff487a 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc @@ -20,6 +20,7 @@ #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/grad_tensor_holder.h" #include "paddle/pten/api/lib/utils/allocator.h" +#include "paddle/pten/core/selected_rows.h" #include "paddle/pten/core/kernel_registry.h" @@ -102,3 +103,69 @@ TEST(GradTensorHolder, Interfaces) { CHECK_EQ(holder_et0_ptr[0], 1.0f); CHECK_EQ(holder_et1_ptr[0], 30.0f); } + +TEST(GradTensorHolder, SelectedRowsMergeAdd) { + pten::CPUPlace cpu; + + std::vector rows{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + int64_t table_size = 10; + int64_t embedding_width = 10; + + auto sr1 = std::make_shared(rows, table_size); + auto sr2 = std::make_shared(rows, table_size); + + // initialize a sparse table 1 + sr1->mutable_value()->Resize( + pten::framework::make_ddim({table_size, embedding_width})); + auto* data_sr1 = sr1->mutable_value()->mutable_data(cpu); + for (int64_t i = 0; i < table_size; ++i) { + for (int64_t j = 0; j < embedding_width; ++j) { + data_sr1[i * embedding_width + j] = static_cast(i); + } + } + + // initialize a sparse table 2 + sr2->mutable_value()->Resize( + pten::framework::make_ddim({table_size, 
embedding_width})); + auto* data_sr2 = sr2->mutable_value()->mutable_data(cpu); + for (int64_t i = 0; i < table_size; ++i) { + for (int64_t j = 0; j < embedding_width; ++j) { + data_sr2[i * embedding_width + j] = static_cast(i); + } + } + // new 2 pten::Tensor + paddle::experimental::Tensor t1(sr1); + paddle::experimental::Tensor t2(sr2); + + // Constructor empty GradTensorHolder + GradSlotMeta slot_meta; + slot_meta.Init(1); + GradTensorHolder grad_tensor_holder = + GradTensorHolder({slot_meta, slot_meta}); + + // accumulation + grad_tensor_holder.add(0, 0, t1, false); + grad_tensor_holder.add(0, 0, t2, false); + + // Buffers() + const auto& buffers = grad_tensor_holder.Buffers(); + CHECK_EQ(static_cast(buffers.size()), 2); + CHECK_EQ(static_cast(buffers[0].size()), 1); + CHECK_EQ(static_cast(buffers[1].size()), 1); + + // operator[] + const auto& holder_et0 = grad_tensor_holder[0][0]; + + auto* tmp_buffer_tensor = + static_cast(holder_et0.impl().get()); + auto* tmp_buffer_data_sr = + tmp_buffer_tensor->mutable_value()->mutable_data(cpu); + + // verify the MergeAdd result (accumulation result) + for (int64_t i = 0; i < table_size; ++i) { + for (int64_t j = 0; j < embedding_width; ++j) { + EXPECT_EQ(tmp_buffer_data_sr[i * embedding_width + j], + (static_cast(i) + static_cast(i))); + } + } +} diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc index 176a02d896384f90226eb196436a9a41670852a7..8aa6b7b8460749911a9f7187564aa1195006b537 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc @@ -176,6 +176,6 @@ TEST(Benchmark, EagerIntermediateMLPCPU) { } USE_OP_ITSELF(scale); -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); USE_OP(matmul_v2); USE_OP(reduce_sum); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc index d2bc05f41b532238c688960087dba6ce1281331f..53d97b2919a5bf6b1a7b0c99b3ed46b5f70b27ef 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc @@ -189,6 +189,6 @@ USE_OP_ITSELF(scale); USE_OP(matmul_v2); USE_OP(reduce_sum); USE_OP(reduce_sum_grad); -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); #endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc index c2f0479460064e05fc917ec432a7384e43e73cf3..0b2585905d3eda09b2565812f918949ed7f2ffba 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc @@ -212,6 +212,6 @@ TEST(Benchmark, FluidMLPCPU) { } // namespace paddle USE_OP_ITSELF(scale); -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); USE_OP(matmul_v2); USE_OP(reduce_sum); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc index 250005e31150c3c9d83d3d094ccb4e00b2de7429..9cebb73a34a7ff6541a499bdd4f36997034f4bf1 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc @@ -249,6 +249,6 @@ USE_OP_ITSELF(scale); USE_OP(matmul_v2); USE_OP(reduce_sum); 
USE_OP(reduce_sum_grad); -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); #endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP diff --git a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc index c11bd94ee9369f983684be38fbb811d87968791a..db3d2cf519c6ddc892e0502dfcee6914d3e594a8 100644 --- a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc +++ b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc @@ -16,6 +16,7 @@ #include "gtest/gtest.h" +#include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h" @@ -167,7 +168,7 @@ TEST(EagerUtils, PassStopGradient) { TEST(EagerUtils, TrySyncToVar) { paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 4, 4, 4}); auto tensor = CreateTestCPUTensor(5.0f, ddim); - std::vector<std::shared_ptr<egr::EagerTensor>> var_bases = { + std::vector<std::shared_ptr<egr::EagerVariable>> var_bases = { egr::EagerUtils::TrySyncToVar(tensor)}; paddle::framework::Variable* var = var_bases[0]->MutableVar(); @@ -187,7 +188,7 @@ TEST(EagerUtils, TrySyncToVars) { std::vector<paddle::experimental::Tensor> tensors = { CreateTestCPUTensor(1.0f, ddim), CreateTestCPUTensor(2.0f, ddim)}; - std::vector<std::shared_ptr<egr::EagerTensor>> var_bases = + std::vector<std::shared_ptr<egr::EagerVariable>> var_bases = egr::EagerUtils::TrySyncToVars(tensors); { @@ -218,10 +219,32 @@ TEST(EagerUtils, TrySyncToVars) { TEST(EagerUtils, CreateVars) { VLOG(6) << "Check CreateVars"; - std::vector<std::shared_ptr<egr::EagerTensor>> outs = + std::vector<std::shared_ptr<egr::EagerVariable>> outs = egr::EagerUtils::CreateVars(2); CHECK_EQ(outs.size(), size_t(2)); CHECK(outs[0]->Var().IsInitialized() == false); } +TEST(EagerUtils, GetGradAccumulationNode) { + VLOG(6) << "Check GetGradAccumulationNode"; + paddle::experimental::Tensor t0("test_tensor"); + ASSERT_EQ(egr::EagerUtils::GetGradAccumulationNode(t0), nullptr); + auto autograd_ptr0 = egr::EagerUtils::autograd_meta(&t0); + autograd_ptr0->SetStopGradient(true); + ASSERT_EQ(egr::EagerUtils::GetGradAccumulationNode(t0), nullptr); + autograd_ptr0->SetStopGradient(false); + auto res = std::dynamic_pointer_cast<egr::GradNodeAccumulation>( + egr::EagerUtils::GetGradAccumulationNode(t0)); + ASSERT_TRUE(res != nullptr); + auto res2 = egr::EagerUtils::GetGradAccumulationNode(t0); + ASSERT_EQ(res2.get(), res.get()); + autograd_ptr0->SetStopGradient(true); + auto res3 = egr::EagerUtils::GetGradAccumulationNode(t0); + ASSERT_EQ(res3, nullptr); + autograd_ptr0->SetStopGradient(false); + autograd_ptr0->SetGradNode( + std::make_shared(1, 2.0, 3)); + ASSERT_ANY_THROW(egr::EagerUtils::GetGradAccumulationNode(t0)); +} + } // namespace egr diff --git a/paddle/fluid/eager/tests/task_tests/generated_test.cc b/paddle/fluid/eager/tests/task_tests/generated_test.cc index 5b95b43edea82b8beac9c46fe81651784f608274..e3bdba05e97365fb177e6130d5ceaab9f7838529 100644 --- a/paddle/fluid/eager/tests/task_tests/generated_test.cc +++ b/paddle/fluid/eager/tests/task_tests/generated_test.cc @@ -123,5 +123,5 @@ TEST(Generated, ElementwiseAdd) { } // namespace egr USE_OP(sigmoid); -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); USE_OP(matmul_v2); diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 7be70ff957565b2246e0e0fd8636816633f7e5c8..a8c27e86b877ae7483e3c52c87d19308b9a48907 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -21,6 +21,7 @@ #include "paddle/pten/common/layout.h" #include "paddle/pten/core/tensor_meta.h" +#include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/pten_utils.h" #include "paddle/fluid/framework/variable.h" @@ -131,17 +132,17 @@ void EagerUtils::SetOutRankWithSlot(AutogradMeta* target, size_t slot_id) { target->SetSingleOutRankWithSlot(slot_id, 0); } -std::shared_ptr EagerUtils::TrySyncToVar( +std::shared_ptr EagerUtils::TrySyncToVar( const paddle::experimental::Tensor& tensor) { - return std::make_shared(tensor); + return std::make_shared(tensor); } -std::vector> EagerUtils::TrySyncToVars( +std::vector> EagerUtils::TrySyncToVars( const paddle::experimental::Tensor& tensor) { return {TrySyncToVar(tensor)}; } -std::vector> EagerUtils::TrySyncToVars( +std::vector> EagerUtils::TrySyncToVars( paddle::experimental::Tensor* tensor) { PADDLE_ENFORCE_NOT_NULL( tensor, @@ -151,9 +152,9 @@ std::vector> EagerUtils::TrySyncToVars( return {TrySyncToVar(*tensor)}; } -std::vector> EagerUtils::TrySyncToVars( +std::vector> EagerUtils::TrySyncToVars( const std::vector& tensors) { - std::vector> res; + std::vector> res; size_t num = tensors.size(); res.reserve(num); for (size_t i = 0; i < num; i++) { @@ -169,9 +170,9 @@ std::vector> EagerUtils::TrySyncToVars( return res; } -std::vector> EagerUtils::TrySyncToVars( +std::vector> EagerUtils::TrySyncToVars( const std::vector& tensors) { - std::vector> res; + std::vector> res; size_t num = tensors.size(); res.reserve(num); for (size_t i = 0; i < num; i++) { @@ -180,19 +181,19 @@ std::vector> EagerUtils::TrySyncToVars( return res; } -std::vector> EagerUtils::CreateVars( +std::vector> EagerUtils::CreateVars( const size_t num) { - std::vector> res; + std::vector> res; res.reserve(num); for (size_t i = 0; i < num; i++) { res.emplace_back( - new EagerTensor(egr::Controller::Instance().GenerateUniqueName())); + new EagerVariable(egr::Controller::Instance().GenerateUniqueName())); } return res; } std::vector EagerUtils::GetOutputs( - const std::vector>& outs) { + const std::vector>& outs) { std::vector res; res.reserve(outs.size()); for (const auto& out : outs) { @@ -209,7 +210,7 @@ std::vector EagerUtils::GetOutputs( } paddle::experimental::Tensor EagerUtils::GetOutput( - const std::shared_ptr& out) { + const std::shared_ptr& out) { PADDLE_ENFORCE_NOT_NULL( out.get(), paddle::platform::errors::Fatal( "Eager Tensor %s is null and cannot be copied. 
We " @@ -219,7 +220,7 @@ paddle::experimental::Tensor EagerUtils::GetOutput( return paddle::experimental::Tensor(out->GetTensorBase(), out->name()); } -void EagerUtils::OverwriteOutputs(const std::shared_ptr& out, +void EagerUtils::OverwriteOutputs(const std::shared_ptr& out, paddle::experimental::Tensor* tensor) { PADDLE_ENFORCE_NOT_NULL( tensor, paddle::platform::errors::Fatal( @@ -231,7 +232,7 @@ void EagerUtils::OverwriteOutputs(const std::shared_ptr& out, } void EagerUtils::OverwriteOutputs( - const std::vector>& outs, + const std::vector>& outs, const std::vector& tensors) { PADDLE_ENFORCE_EQ( outs.size(), tensors.size(), @@ -303,4 +304,41 @@ void EagerUtils::CheckAndRetainGrad( } } +std::shared_ptr EagerUtils::GetGradAccumulationNode( + const paddle::experimental::Tensor& tensor) { + auto* autograd_ptr = nullable_autograd_meta(tensor); + if (!autograd_ptr) { + return nullptr; + } + auto node_ptr = autograd_ptr->GetMutableGradNode(); + if (node_ptr && node_ptr.get()) { + if (!autograd_ptr->StopGradient()) { + auto accumulation_ptr = + std::dynamic_pointer_cast(node_ptr); + if (accumulation_ptr) { + return accumulation_ptr; + } else { + // Current GradNode is not a egr::GradNodeAccumulation + PADDLE_THROW(paddle::platform::errors::Fatal( + "GetGradAccumulationNode should only be called on leaf tensor, but " + "target tensor: %s has GradNode which is not a " + "GradNodeAccumulation, and this should not happend unless target " + "tensor is modified by some ops and calling set history for it.", + tensor.name())); + } + } else { + // Current Tensor does not have grad since it's stop_gradient is true; + return nullptr; + } + } else { + if (!autograd_ptr->StopGradient()) { + VLOG(6) << "Add GradNodeAccumulation for tensor: " << tensor.name(); + autograd_ptr->SetGradNode(std::make_shared()); + return autograd_ptr->GetMutableGradNode(); + } else { + return nullptr; + } + } +} + } // namespace egr diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h index b0549488efc8f2e85d5550251bfffc9dac3a1af7..11c728e4c6c9bdd3e3ee60fb474200ff5ae20afc 100644 --- a/paddle/fluid/eager/utils.h +++ b/paddle/fluid/eager/utils.h @@ -88,7 +88,7 @@ class EagerUtils { /** * We have to use autograd_meta and multi_autograd_meta to initialize * autograd_meta for tensor, since we can't init it in - * egr::EagerTensor's + * egr::EagerVariable's * constructor (it's abstract class there) * * **/ @@ -151,34 +151,35 @@ class EagerUtils { // Intermidate needed remove this once we don't need legacy // Inner Method - static std::shared_ptr TrySyncToVar( + static std::shared_ptr TrySyncToVar( const paddle::experimental::Tensor& tensor); // Basic Input - static std::vector> TrySyncToVars( + static std::vector> TrySyncToVars( const paddle::experimental::Tensor& tensor); // Basic Output - static std::vector> TrySyncToVars( + static std::vector> TrySyncToVars( paddle::experimental::Tensor* tensor); // Multi Output - static std::vector> TrySyncToVars( + static std::vector> TrySyncToVars( const std::vector& tensors); // Multi Input - static std::vector> TrySyncToVars( + static std::vector> TrySyncToVars( const std::vector& tensors); // Construct empty output - static std::vector> CreateVars(const size_t num); + static std::vector> CreateVars( + const size_t num); // Construct Tensor From var static std::vector GetOutputs( - const std::vector>& outs); + const std::vector>& outs); static paddle::experimental::Tensor GetOutput( - const std::shared_ptr& out); + const std::shared_ptr& out); // Sync Back to origin output 
Tensor - static void OverwriteOutputs(const std::shared_ptr<EagerTensor>& out, + static void OverwriteOutputs(const std::shared_ptr<EagerVariable>& out, paddle::experimental::Tensor* tensor); static void OverwriteOutputs(const paddle::experimental::Tensor& out, paddle::experimental::Tensor* tensor); static void OverwriteOutputs( - const std::vector<std::shared_ptr<EagerTensor>>& outs, + const std::vector<std::shared_ptr<EagerVariable>>& outs, const std::vector& tensors); static void OverwriteOutputs( const std::vector& outs, @@ -188,6 +189,8 @@ class EagerUtils { static void CheckAndRetainGrad(const paddle::experimental::Tensor& tensor); static void CheckAndRetainGrad( const std::vector<paddle::experimental::Tensor>& tensors); + static std::shared_ptr<GradNodeBase> GetGradAccumulationNode( + const paddle::experimental::Tensor& tensor); }; } // namespace egr diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index a3f0ed392646c370e731f2d2f573f3dde348a5c9..78f5bb077aaf189ff0d21aba853d62aebe46f53e 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -413,7 +413,7 @@ cc_test(save_load_util_test SRCS save_load_util_test.cc DEPS save_load_util tens cc_library(generator SRCS generator.cc DEPS enforce place) cc_library(infershape_utils SRCS infershape_utils.cc DEPS lod_tensor selected_rows_utils attribute place pten var_type_traits pten pten_api_utils op_info shape_inference) - +cc_test(infershape_utils_test SRCS infershape_utils_test.cc DEPS infershape_utils infermeta_utils meta_tensor) # Get the current working branch execute_process( @@ -458,4 +458,5 @@ if(WITH_GPU OR WITH_ROCM) else() cc_library(fluid_convert_utils SRCS convert_utils.cc DEPS data_type place) endif() +cc_test(convert_utils_test SRCS convert_utils_test.cc DEPS fluid_convert_utils) cc_test(custom_kernel_test SRCS custom_kernel_test.cc DEPS custom_kernel pten_tensor) diff --git a/paddle/pten/tests/core/test_convert_utils.cc b/paddle/fluid/framework/convert_utils_test.cc similarity index 100% rename from paddle/pten/tests/core/test_convert_utils.cc rename to paddle/fluid/framework/convert_utils_test.cc index 977e49aafb9bd4e84e6626e1f3bbe16a30ef4c52..d547070e6d1f092f5a65ccfef6d743de6e6331e2 100644 --- a/paddle/pten/tests/core/test_convert_utils.cc +++ b/paddle/fluid/framework/convert_utils_test.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/ -#include "gtest/gtest.h" #include "paddle/fluid/framework/convert_utils.h" +#include "gtest/gtest.h" namespace pten { namespace tests { diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index 24f1591ff33c965b9b787c05ff5db67ad4362ea4..20d08ef18aeb3e4d8a9f5cfd0b38954daf27020d 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -100,6 +100,11 @@ struct DLDeviceVisitor : public boost::static_visitor<::DLDevice> { platform::errors::Unimplemented("platform::MLUPlace is not supported")); } + inline ::DLDevice operator()(const platform::CustomPlace &place) const { + PADDLE_THROW(platform::errors::Unimplemented( + "platform::CustomPlace is not supported")); + } + inline ::DLDevice operator()(const platform::CUDAPlace &place) const { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ::DLDevice device; diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 5596aba52131b74785741e16f9dc6ef71e6a91cb..4e6a4d5360860e8971c6dc9c2842defabcffd0dd 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -494,6 +494,20 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, #else PADDLE_THROW( platform::errors::Unimplemented("No MLU gc found in CPU/MLU paddle")); +#endif + } else if (platform::is_custom_place(place_)) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (IsFastEagerDeletionModeEnabled()) { + VLOG(4) << "Use unsafe fast gc for " << place_ << "."; + gc.reset(new CustomDeviceUnsafeFastGarbageCollector(place_, + max_memory_size)); + } else { + VLOG(4) << "Use default stream gc for " << place_ << "."; + gc.reset( + new CustomDefaultStreamGarbageCollector(place_, max_memory_size)); + } +#else + PADDLE_THROW(platform::errors::Unimplemented("No CustomDevice gc found")); #endif } } diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index 22f77be85055578f0d4e8288e90001fb59e9628d..9f2bdeffecf62764f5cbe5bea9cb50d4830be43b 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -18,6 +18,7 @@ #endif #include "gflags/gflags.h" #include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/platform/device/device_wrapper.h" DECLARE_double(eager_delete_tensor_gb); DECLARE_double(memory_fraction_of_eager_deletion); @@ -202,6 +203,58 @@ void MLUStreamGarbageCollector::ClearCallback( } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE +CustomDefaultStreamGarbageCollector::CustomDefaultStreamGarbageCollector( + const platform::CustomPlace &place, size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + +void CustomDefaultStreamGarbageCollector::Wait() const { + static_cast(this->dev_ctx_) + ->WaitStreamCallback(); +} + +void CustomDefaultStreamGarbageCollector::ClearCallback( + const std::function &callback) { + static_cast(this->dev_ctx_) + ->AddStreamCallback(callback); +} + +CustomDeviceUnsafeFastGarbageCollector::CustomDeviceUnsafeFastGarbageCollector( + const platform::CustomPlace &place, size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + +void CustomDeviceUnsafeFastGarbageCollector::ClearCallback( + const std::function &callback) { + callback(); +} + +CustomStreamGarbageCollector::CustomStreamGarbageCollector( + const platform::CustomPlace &place, size_t max_memory_size) + : GarbageCollector(place, max_memory_size) { + platform::DeviceGuard guard(place); + stream_.reset(new 
platform::stream::Stream); + stream_->Init(place); + callback_manager_.reset(new platform::CallbackManager(stream_.get())); +} + +CustomStreamGarbageCollector::~CustomStreamGarbageCollector() { + platform::DeviceGuard guard(this->dev_ctx_->GetPlace()); + stream_->Synchronize(); + stream_->Destroy(); +} + +platform::stream::Stream *CustomStreamGarbageCollector::stream() const { + return stream_.get(); +} + +void CustomStreamGarbageCollector::Wait() const { callback_manager_->Wait(); } + +void CustomStreamGarbageCollector::ClearCallback( + const std::function &callback) { + callback_manager_->AddCallback(callback); +} +#endif + int64_t GetEagerDeletionThreshold() { return FLAGS_eager_delete_tensor_gb < 0 ? -1 diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index f5d79d864b5659ed2b16cdded7e471eca457e3c5..a67860c6087e0f173e09d2a7c131703260c562fd 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -200,6 +200,47 @@ class MLUStreamGarbageCollector : public GarbageCollector { }; #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE +class CustomDefaultStreamGarbageCollector : public GarbageCollector { + public: + CustomDefaultStreamGarbageCollector(const platform::CustomPlace &place, + size_t max_memory_size); + + void Wait() const override; + + protected: + void ClearCallback(const std::function &callback) override; +}; + +class CustomDeviceUnsafeFastGarbageCollector : public GarbageCollector { + public: + CustomDeviceUnsafeFastGarbageCollector(const platform::CustomPlace &place, + size_t max_memory_size); + + protected: + void ClearCallback(const std::function &callback) override; +}; + +class CustomStreamGarbageCollector : public GarbageCollector { + public: + CustomStreamGarbageCollector(const platform::CustomPlace &place, + size_t max_memory_size); + + ~CustomStreamGarbageCollector(); + + void Wait() const override; + + platform::stream::Stream *stream() const; + + protected: + void ClearCallback(const std::function &callback) override; + + private: + std::unique_ptr stream_; + std::unique_ptr callback_manager_; +}; +#endif + template void GarbageCollector::Add(Container &&objs) { Add(std::forward(objs), []() {}); diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index 9e1958973d2d97a351ef5ced57339fb698b70281..bc0344d405cf795bc96fd3fb2d5376bbde89bd2b 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -14,6 +14,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/infershape_utils.h" +#include + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/pten_utils.h" @@ -303,13 +305,45 @@ pten::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, auto& attr = attr_reader.GetAttr(attr_name); if (std::type_index(attr.type()) == std::type_index(typeid(bool))) { infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(bool, attr)); + } else if (std::type_index(attr.type()) == std::type_index(typeid(int))) { + infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(int, attr)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(int64_t))) { + infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(int64_t, attr)); } else if (std::type_index(attr.type()) == std::type_index(typeid(float))) { infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(float, attr)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::string))) { + infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(std::string, attr)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + infer_meta_context.EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + infer_meta_context.EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + infer_meta_context.EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + infer_meta_context.EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + infer_meta_context.EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + infer_meta_context.EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); } else { - // do nothing, skip useless attrs now - // TODO(chenweihang): support other attr type later and throw error - // if attr is cannot parsed + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported attribute type is received when call " + "InferShapeFunctor.")); } } else { // do nothing diff --git a/paddle/fluid/framework/infershape_utils_test.cc b/paddle/fluid/framework/infershape_utils_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..755ca3f5ce90b7bcc85e904089262fd7f7e401cb --- /dev/null +++ b/paddle/fluid/framework/infershape_utils_test.cc @@ -0,0 +1,163 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/pten/core/compat/op_utils.h" +#include "paddle/pten/core/infermeta_utils.h" + +namespace paddle { +namespace framework { + +void TestInferMeta(bool bool_attr, int int_attr, int64_t int64_attr, + float float_attr, const std::string& str_attr, + const std::vector& vec_bool_attr, + const std::vector& vec_int_attr, + const std::vector& vec_int64_attr, + const std::vector& vec_float_attr, + const std::vector& vec_double_attr, + const std::vector& vec_str_attr) { + ASSERT_EQ(bool_attr, true); + ASSERT_EQ(int_attr, 10); + ASSERT_EQ(int64_attr, 100); + ASSERT_NEAR(float_attr, 3.14, 1e-6); + ASSERT_EQ(str_attr, "test"); + ASSERT_EQ(vec_bool_attr.at(0), true); + ASSERT_EQ(vec_bool_attr.at(1), true); + ASSERT_EQ(vec_int_attr.at(0), 10); + ASSERT_EQ(vec_int_attr.at(1), 10); + ASSERT_EQ(vec_int64_attr.at(0), 100L); + ASSERT_EQ(vec_int64_attr.at(1), 100L); + ASSERT_NEAR(vec_float_attr.at(0), 3.14, 1e-6); + ASSERT_NEAR(vec_float_attr.at(1), 3.14, 1e-6); + ASSERT_NEAR(vec_double_attr.at(0), 3.1415, 1e-6); + ASSERT_NEAR(vec_double_attr.at(1), 3.1415, 1e-6); + ASSERT_EQ(vec_str_attr.at(0), "test_vec"); + ASSERT_EQ(vec_str_attr.at(1), "test_vec"); +} + +class InferShapeUtilsTestOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddAttr("bool", "bool attr of test op"); + AddAttr("int", "int attr of test op"); + AddAttr("int64", "int64 attr of test op"); + AddAttr("float", "float attr of test op"); + AddAttr("string", "string attr of test op"); + AddAttr>("vec_bool", "vec_bool attr of test op"); + AddAttr>("vec_int", "vec_int attr of test op"); + AddAttr>("vec_int64", "vec_int attr of test op"); + AddAttr>("vec_float", "vec_int attr of test op"); + AddAttr>("vec_double", "vec_int attr of test op"); + AddAttr>("vec_str", "vec_int attr of test op"); + AddComment("This is test op"); + } +}; + +class InferShapeUtilsTestOp : public OperatorWithKernel { + public: + using OperatorWithKernel::OperatorWithKernel; + + OpKernelType GetExpectedKernelType( + const ExecutionContext& ctx) const override { + return OpKernelType(proto::VarType::FP32, ctx.GetPlace()); + } +}; + +pten::KernelSignature InferShapeUtilsTestOpArgumentMapping( + const pten::ArgumentMappingContext& ctx) { + return pten::KernelSignature( + "infer_shape_utils_test", {}, + {"bool", "int", "int64", "float", "string", "vec_bool", "vec_int", + "vec_int64", "vec_float", "vec_double", "vec_str"}, + {}); +} + +} // namespace framework +} // namespace paddle + +DELCARE_INFER_SHAPE_FUNCTOR(infer_shape_utils_test, + InferShapeUtilsTestInferShapeFunctor, + PT_INFER_META(paddle::framework::TestInferMeta)); +REGISTER_OPERATOR(infer_shape_utils_test, + paddle::framework::InferShapeUtilsTestOp, + paddle::framework::InferShapeUtilsTestOpMaker, + InferShapeUtilsTestInferShapeFunctor); + +TEST(InferShapeUtilsTest, ALL) { + paddle::framework::ProgramDesc prog; + paddle::framework::proto::BlockDesc proto_block; + paddle::framework::BlockDesc block_desc(&prog, &proto_block); + + auto* op = block_desc.AppendOp(); + op->SetType("infer_shape_utils_test"); + + paddle::framework::Attribute bool_attr(true); + op->SetAttr("bool", bool_attr); + + paddle::framework::Attribute int_attr(10); + 
op->SetAttr("int", int_attr); + + int64_t int64_val = 100; + paddle::framework::Attribute int64_attr(int64_val); + op->SetAttr("int64", int64_attr); + + float float_value = 3.14; + paddle::framework::Attribute float_attr(float_value); + op->SetAttr("float", float_attr); + + std::string str_value("test"); + paddle::framework::Attribute str_attr(str_value); + op->SetAttr("string", str_attr); + + std::vector vec_bool(2, true); + paddle::framework::Attribute vec_bool_attr = vec_bool; + op->SetAttr("vec_bool", vec_bool_attr); + + std::vector vec_int(2, 10); + paddle::framework::Attribute vec_int_attr = vec_int; + op->SetAttr("vec_int", vec_int_attr); + + std::vector vec_int64(2, 100); + paddle::framework::Attribute vec_int64_attr = vec_int64; + op->SetAttr("vec_int64", vec_int64_attr); + std::cout << "after set vec_int64" << std::endl; + + std::vector vec_float(2, 3.14); + paddle::framework::Attribute vec_float_attr = vec_float; + op->SetAttr("vec_float", vec_float_attr); + + std::vector vec_double(2, 3.1415); + paddle::framework::Attribute vec_double_attr = vec_double; + op->SetAttr("vec_double", vec_double_attr); + + std::vector vec_str(2, "test_vec"); + paddle::framework::Attribute vec_str_attr = vec_str; + op->SetAttr("vec_str", vec_str_attr); + + pten::OpUtilsMap::Instance().InsertArgumentMappingFn( + "infer_shape_utils_test", + paddle::framework::InferShapeUtilsTestOpArgumentMapping); + + op->InferShape(block_desc); +} diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 829f43effb6d2878b63694d25b23ff7396ff61c2..0e1e572a51f7fcbc84415bab3808dfaed97dfd08 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -103,6 +103,8 @@ target_link_libraries(generate_pass pass_desc_proto) if(WITH_TENSORRT) pass_library(trt_map_matmul_to_mul_pass inference) + pass_library(preln_embedding_eltwise_layernorm_fuse_pass inference) + pass_library(preln_skip_layernorm_fuse_pass inference) endif() if(WITH_GPU OR WITH_ROCM) diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc index 619976d45fb0d9675e09046f2fad8fc3bbf5d90a..b56c9cb13ccdc2dd1c7a1dfcd1aad6da27590cae 100644 --- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/pten/core/kernel_factory.h" namespace paddle { namespace framework { @@ -271,25 +272,41 @@ bool FuseOptimizerOpPass::OpWithKernelSupportCPUAndGPU( if (op_type == "c_sync_calc_stream" || op_type == "c_sync_comm_stream") { return true; } - auto &all_kernels = OperatorWithKernel::AllOpKernels(); - auto it = all_kernels.find(op_type); - // skip op not has kernel - if (it != all_kernels.end()) { - bool support_cpu = false; - bool support_gpu = false; - for (auto &kernel_pair : it->second) { - if (platform::is_cpu_place(kernel_pair.first.place_)) { - support_cpu = true; - } - if (platform::is_gpu_place(kernel_pair.first.place_)) { - support_gpu = true; + bool support_cpu = false; + bool support_gpu = false; + auto &kernel_factory = pten::KernelFactory::Instance(); + auto kernel_key_map = + kernel_factory.SelectKernelMap(pten::TransToPtenKernelName(op_type)); + bool 
has_op_kernel = kernel_key_map.size() > 0 ? true : false; + for (auto &kernel : kernel_key_map) { + if (platform::is_gpu_place( + pten::TransToPtenPlace(kernel.first.backend()))) { + support_gpu = true; + } else if (platform::is_cpu_place( + pten::TransToPtenPlace(kernel.first.backend()))) { + support_cpu = true; + } + } + + if (!support_cpu || !support_gpu) { + auto &all_kernels = OperatorWithKernel::AllOpKernels(); + auto it = all_kernels.find(op_type); + // skip op not has kernel + if (it != all_kernels.end()) { + has_op_kernel = true; + for (auto &kernel_pair : it->second) { + if (platform::is_cpu_place(kernel_pair.first.place_)) { + support_cpu = true; + } else if (platform::is_gpu_place(kernel_pair.first.place_)) { + support_gpu = true; + } } } - VLOG(6) << "Op check: " << op_type << ", support CPU: " << support_cpu - << ", support GPU: " << support_gpu; - return support_cpu && support_gpu; } - return true; + + VLOG(6) << "Op check: " << op_type << ", support CPU: " << support_cpu + << ", support GPU: " << support_gpu; + return has_op_kernel ? (support_cpu && support_gpu) : true; } bool FuseOptimizerOpPass::GradGeneratedOpKernelCheck( diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc index abed6a5bd4bc48e01d9bcf20abf1bed236ed847a..ed9f6230720f83100e641068c8664d643b6db260 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc @@ -26,7 +26,7 @@ USE_OP(mul); USE_OP(cinn_launch); -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); namespace paddle::framework { using Name2VarInfoMap = diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc index 746d90cef917cdb8c4740adf7dff3438c2ca1249..d33dc7f49feb0f4c9e585d13186d65b6c2d618c0 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc @@ -23,8 +23,8 @@ USE_OP_ITSELF(scale); USE_OP(elementwise_mul); -USE_OP(elementwise_add); -USE_OP(elementwise_add_grad); +USE_OP_ITSELF(elementwise_add); +USE_OP_ITSELF(elementwise_add_grad); DECLARE_double(eager_delete_tensor_gb); diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc index 5f819ddbfaf8b88732b35119014c34644a1c402b..96aa95bde337436dd6eb584b3eea5395b5301a34 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc @@ -29,7 +29,7 @@ USE_OP(batch_norm); USE_OP_DEVICE_KERNEL(batch_norm, MKLDNN); USE_OP(conv2d_transpose); USE_OP_DEVICE_KERNEL(conv2d_transpose, MKLDNN); -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP(gelu); USE_OP_DEVICE_KERNEL(gelu, MKLDNN); diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc index 90dc7801131074868073e1307ae7bfc51f2c3631..ea335e9bd63c624310df2f092b13e30a9458bb93 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc +++ 
b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc @@ -22,7 +22,7 @@ USE_OP(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP(leaky_relu); USE_OP_DEVICE_KERNEL(leaky_relu, MKLDNN); diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index 016d0fd4a663ecfcc8d2b23ddb2a3af7b610b6cd..acfe8d53cea13cb5ac9797ea7d43311d01b9041b 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -47,6 +47,8 @@ constexpr char kPassRecorder[] = "pass_recorder"; constexpr char kEmbEltwiseLayernormPass[] = "embedding_eltwise_layernorm_fuse_pass_flag"; constexpr char kMultiheadMatmulPass[] = "multihead_matmul_fuse_pass_flag"; +constexpr char kPrelnEmbEltwiseLayernormPass[] = + "preln_embedding_eltwise_layernorm_fuse_pass_flag"; class Pass { public: diff --git a/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..ca42a613411ba6078b00522d2c178856993fa462 --- /dev/null +++ b/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.cc @@ -0,0 +1,450 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.h" + +#include + +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { +class Node; +} // namespace ir +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +static PDNode* create_emb_vars(PDPattern* pattern, const std::string& name, + const std::string& arg, + bool is_persist = false) { + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; + PDNode* node = + pattern->NewNode(name)->assert_is_ops_input(embedding_ops, arg); + if (is_persist) return node->assert_is_persistable_var(); + return node; +} +static PDNode* create_emb_out_vars(PDPattern* pattern, const std::string& name, + const std::string& arg) { + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; + PDNode* node = pattern->NewNode(name) + ->assert_is_only_output_of_ops(embedding_ops) + ->assert_is_op_input("elementwise_add", arg) + ->AsIntermediate(); + return node; +} +void PrelnEmbedding2Eltwise1Pattern::operator()() { + auto* lookup_table1_x = + create_emb_vars(pattern, lookup_table1_x_repr(), "Ids"); + auto* lookup_table2_x = + create_emb_vars(pattern, lookup_table2_x_repr(), "Ids"); + auto* lookup_table1_w = + create_emb_vars(pattern, lookup_table1_w_repr(), "W", true); + auto* lookup_table2_w = + create_emb_vars(pattern, lookup_table2_w_repr(), "W", true); + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; + auto* lookup_table1 = + pattern->NewNode(lookup_table1_repr())->assert_is_ops(embedding_ops); + auto* lookup_table2 = + pattern->NewNode(lookup_table2_repr())->assert_is_ops(embedding_ops); + auto* lookup_table1_out = + create_emb_out_vars(pattern, lookup_table1_out_repr(), "X"); + auto* lookup_table2_out = + create_emb_out_vars(pattern, lookup_table2_out_repr(), "Y"); + auto* eltwise_add = + pattern->NewNode(eltwise_add_repr())->assert_is_op("elementwise_add"); + auto* eltwise_add_out = pattern->NewNode(eltwise_add_out_repr()) + ->assert_is_op_output("elementwise_add"); + lookup_table1->LinksFrom({lookup_table1_x, lookup_table1_w}) + .LinksTo({lookup_table1_out}); + lookup_table2->LinksFrom({lookup_table2_x, lookup_table2_w}) + .LinksTo({lookup_table2_out}); + eltwise_add->LinksFrom({lookup_table1_out, lookup_table2_out}) + .LinksTo({eltwise_add_out}); +} +void PrelnEmbedding1Eltwise1Pattern::operator()() { + auto* lookup_table1_x = + create_emb_vars(pattern, lookup_table1_x_repr(), "Ids"); + auto* lookup_table1_w = + create_emb_vars(pattern, lookup_table1_w_repr(), "W", true); + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; + auto* lookup_table1 = + pattern->NewNode(lookup_table1_repr())->assert_is_ops(embedding_ops); + auto* lookup_table1_out = + create_emb_out_vars(pattern, lookup_table1_out_repr(), "Y"); + auto* eltwise_add = + pattern->NewNode(eltwise_add_repr())->assert_is_op("elementwise_add"); + auto* eltwise_add_in = pattern->NewNode(eltwise_add_in_repr()) + ->assert_is_op_input("elementwise_add", "X") + ->assert_is_op_output("elementwise_add"); + auto* eltwise_add_out = pattern->NewNode(eltwise_add_out_repr()) + ->assert_is_op_output("elementwise_add"); + lookup_table1->LinksFrom({lookup_table1_x, lookup_table1_w}) + .LinksTo({lookup_table1_out}); + eltwise_add->LinksFrom({lookup_table1_out, eltwise_add_in}) + .LinksTo({eltwise_add_out}); +} +void PrelnSkipLayerNorm::operator()() { + auto* eltwise_add 
= + pattern->NewNode(eltwise_add_repr())->assert_is_op("elementwise_add"); + auto* eltwise_add_out = pattern->NewNode(eltwise_add_out_repr()) + ->assert_is_op_output("elementwise_add") + ->assert_is_op_input("layer_norm", "X") + ->assert_is_op_input("elementwise_add", "Y"); + auto* layer_norm = + pattern->NewNode(layer_norm_repr())->assert_is_op("layer_norm"); + auto* layer_norm_out = pattern->NewNode(layer_norm_out_repr()) + ->assert_is_op_output("layer_norm", "Y") + ->AsOutput(); + auto* layer_norm_bias_var = pattern->NewNode(layer_norm_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Bias"); + auto* layer_norm_scale_var = pattern->NewNode(layer_norm_scale_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Scale"); + auto* layer_norm_mean_var = pattern->NewNode(layer_norm_mean_repr()) + ->AsOutput() + ->assert_is_op_output("layer_norm", "Mean"); + auto* layer_norm_variance_var = + pattern->NewNode(layer_norm_variance_repr()) + ->AsOutput() + ->assert_is_op_output("layer_norm", "Variance"); + eltwise_add->LinksTo({eltwise_add_out}); + layer_norm + ->LinksFrom({eltwise_add_out, layer_norm_bias_var, layer_norm_scale_var}) + .LinksTo({layer_norm_out, layer_norm_mean_var, layer_norm_variance_var}); +} + +} // namespace patterns + +int PrelnEmbeddingEltwiseLayerNormFusePass::BuildFusion( + Graph* graph, const std::string& name_scope + /*const Scope* scope*/) const { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + + std::vector<std::vector<std::pair<Node*, Node*>>> start_pattern_in_nodes; + std::vector<Node*> start_pattern_out_node; + std::vector<std::unordered_set<const Node*>> start_pattern_remove_nodes; + + // Create pattern. + patterns::PrelnEmbedding2Eltwise1Pattern start_pattern(pattern, + name_scope + "/start"); + start_pattern(); + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(lookup_table1_x, lookup_table1_x, start_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table2_x, lookup_table2_x, start_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table1_w, lookup_table1_w, start_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table2_w, lookup_table2_w, start_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table1, lookup_table1, start_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table2, lookup_table2, start_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table1_out, lookup_table1_out, + start_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table2_out, lookup_table2_out, + start_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltwise_add, eltwise_add, start_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltwise_add_out, eltwise_add_out, start_pattern); + if (!IsCompat(subgraph, graph)) { + LOG(WARNING) + << "Pass(PrelnEmbedding2Eltwise1Pattern) in op compat failed."; + return; + } + std::vector<std::pair<Node*, Node*>> ins; + ins.push_back(std::make_pair(lookup_table1_x, lookup_table1_w)); + ins.push_back(std::make_pair(lookup_table2_x, lookup_table2_w)); + start_pattern_in_nodes.push_back(ins); + start_pattern_out_node.push_back(eltwise_add_out); + + std::unordered_set<const Node*> rm_nodes; + rm_nodes.insert({lookup_table1, lookup_table2, lookup_table1_out, + lookup_table2_out, eltwise_add, eltwise_add_out}); + start_pattern_remove_nodes.push_back(rm_nodes); + }; + gpd(graph, handler); + + std::vector<std::pair<Node*, Node*>> inner_pattern_ins; + std::vector<Node*> inner_pattern_tmp_in; + std::vector<Node*> inner_pattern_out; + std::vector<std::unordered_set<const Node*>> inner_pattern_remove_nodes; + + GraphPatternDetector gpd2; + auto* pattern2 = gpd2.mutable_pattern(); + patterns::PrelnEmbedding1Eltwise1Pattern second_pattern( +
pattern2, name_scope + "/second"); + second_pattern(); + auto handler2 = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(lookup_table1_x, lookup_table1_x, second_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table1_w, lookup_table1_w, second_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table1, lookup_table1, second_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table1_out, lookup_table1_out, + second_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltwise_add_in, eltwise_add_in, second_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltwise_add, eltwise_add, second_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltwise_add_out, eltwise_add_out, second_pattern); + if (!IsCompat(subgraph, graph)) { + LOG(WARNING) + << "Pass(PrelnEmbedding1Eltwise1Pattern) in op compat failed."; + return; + } + auto in = std::make_pair(lookup_table1_x, lookup_table1_w); + inner_pattern_ins.push_back(in); + inner_pattern_tmp_in.push_back(eltwise_add_in); + inner_pattern_out.push_back(eltwise_add_out); + + std::unordered_set rm_nodes; + rm_nodes.insert({lookup_table1, lookup_table1_out, eltwise_add}); + inner_pattern_remove_nodes.push_back(rm_nodes); + }; + gpd2(graph, handler2); + + std::vector end_pattern_elt_out; + std::vector end_pattern_scales; + std::vector end_pattern_biases; + std::vector end_pattern_out; + std::vector end_patter_layernorms; + std::vector end_patter_elementwise; + std::vector> end_pattern_remove_nodes; + GraphPatternDetector gpd3; + auto* pattern3 = gpd3.mutable_pattern(); + patterns::PrelnSkipLayerNorm skip_layernorm_pattern(pattern3, + name_scope + "/third"); + skip_layernorm_pattern(); + auto handler3 = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(eltwise_add, eltwise_add, skip_layernorm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltwise_add_out, eltwise_add_out, + skip_layernorm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm, layer_norm, skip_layernorm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_out, layer_norm_out, + skip_layernorm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_bias, layer_norm_bias, + skip_layernorm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_scale, layer_norm_scale, + skip_layernorm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_mean, layer_norm_mean, + skip_layernorm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_variance, layer_norm_variance, + skip_layernorm_pattern); + if (!IsCompat(subgraph, graph)) { + LOG(WARNING) << "Pass(PrelnSkipLayerNorm) in op compat failed."; + return; + } + end_pattern_elt_out.push_back(eltwise_add_out); + std::unordered_set rm_nodes; + rm_nodes.insert({layer_norm, layer_norm_mean, layer_norm_variance}); + end_pattern_remove_nodes.push_back(rm_nodes); + end_pattern_biases.push_back(layer_norm_bias); + end_pattern_scales.push_back(layer_norm_scale); + end_pattern_out.push_back(layer_norm_out); + end_patter_layernorms.push_back(layer_norm); + end_patter_elementwise.push_back(eltwise_add); + }; + gpd3(graph, handler3); + + if (start_pattern_in_nodes.empty() || end_pattern_elt_out.empty()) { + return 0; + } + // only reserve the subgraphs that in connected domains. + int fusion_count = 0; + // fusion_id for (i, k, js) + std::vector>>> + fusion_ids; + for (size_t i = 0; i < start_pattern_in_nodes.size(); ++i) { + Node* tmp = start_pattern_out_node[i]; + Node* old_tmp = nullptr; + // get correct inner pattern node order. 
+ std::vector js; + while (tmp != old_tmp) { + old_tmp = tmp; + for (size_t j = 0; j < inner_pattern_tmp_in.size(); ++j) { + if (inner_pattern_tmp_in[j] == tmp) { + tmp = inner_pattern_out[j]; + js.push_back(j); + break; + } + } + } + + for (size_t k = 0; k < end_pattern_elt_out.size(); ++k) { + if (tmp == end_pattern_elt_out[k]) { + fusion_ids.push_back(std::make_pair(i, std::make_pair(k, js))); + break; + } + } + } + + for (size_t num = 0; num < fusion_ids.size(); ++num) { + int i = fusion_ids[num].first; + int k = fusion_ids[num].second.first; + std::vector js = fusion_ids[num].second.second; + + std::vector ids; + std::vector embs; + for (size_t iter = 0; iter < start_pattern_in_nodes[i].size(); ++iter) { + ids.push_back(start_pattern_in_nodes[i][iter].first->Name()); + embs.push_back(start_pattern_in_nodes[i][iter].second->Name()); + } + for (size_t iter = 0; iter < js.size(); ++iter) { + ids.push_back(inner_pattern_ins[js[iter]].first->Name()); + embs.push_back(inner_pattern_ins[js[iter]].second->Name()); + } + + OpDesc new_op_desc; + new_op_desc.SetType("fused_preln_embedding_eltwise_layernorm"); + new_op_desc.SetInput("Ids", ids); + new_op_desc.SetInput("Embs", embs); + new_op_desc.SetInput("WordId", {ids[0]}); + new_op_desc.SetInput("PosId", {ids[1]}); + if (ids.size() > 2) { + new_op_desc.SetInput("SentId", {ids[2]}); + } + + new_op_desc.SetInput("WordEmbedding", {embs[0]}); + new_op_desc.SetInput("PosEmbedding", {embs[1]}); + if (embs.size() > 2) { + new_op_desc.SetInput("SentEmbedding", {embs[2]}); + } + + new_op_desc.SetInput("Bias", {end_pattern_biases[k]->Name()}); + new_op_desc.SetInput("Scale", {end_pattern_scales[k]->Name()}); + new_op_desc.SetOutput("Out_0", {end_pattern_out[k]->Name()}); + new_op_desc.SetOutput("Out_1", {inner_pattern_out[k]->Name()}); + new_op_desc.SetAttr("epsilon", + end_patter_layernorms[k]->Op()->GetAttr("epsilon")); + + if (end_patter_layernorms[k]->Op()->HasAttr("out_threshold") && + end_patter_elementwise[k]->Op()->HasAttr("out_threshold")) { + new_op_desc.SetAttr("enable_int8", true); + new_op_desc.SetAttr( + "out_0_threshold", + end_patter_layernorms[k]->Op()->GetAttr("out_threshold")); + new_op_desc.SetAttr( + "out_1_threshold", + end_patter_elementwise[k]->Op()->GetAttr("out_threshold")); + } + + auto* preln_embedding_eltwise_layernorm = graph->CreateOpNode(&new_op_desc); + + for (size_t iter = 0; iter < start_pattern_in_nodes[i].size(); ++iter) { + IR_NODE_LINK_TO(start_pattern_in_nodes[i][iter].first, + preln_embedding_eltwise_layernorm); + IR_NODE_LINK_TO(start_pattern_in_nodes[i][iter].second, + preln_embedding_eltwise_layernorm); + } + for (size_t iter = 0; iter < js.size(); ++iter) { + IR_NODE_LINK_TO(inner_pattern_ins[js[iter]].first, + preln_embedding_eltwise_layernorm); + IR_NODE_LINK_TO(inner_pattern_ins[js[iter]].second, + preln_embedding_eltwise_layernorm); + } + IR_NODE_LINK_TO(end_pattern_biases[k], preln_embedding_eltwise_layernorm); + IR_NODE_LINK_TO(end_pattern_scales[k], preln_embedding_eltwise_layernorm); + IR_NODE_LINK_TO(preln_embedding_eltwise_layernorm, end_pattern_out[k]); + IR_NODE_LINK_TO(preln_embedding_eltwise_layernorm, inner_pattern_out[k]); + + // Remove unneeded nodes. 
+ std::unordered_set marked_nodes; + marked_nodes.insert(start_pattern_remove_nodes[i].begin(), + start_pattern_remove_nodes[i].end()); + marked_nodes.insert(end_pattern_remove_nodes[k].begin(), + end_pattern_remove_nodes[k].end()); + for (size_t iter = 0; iter < js.size(); ++iter) { + marked_nodes.insert(inner_pattern_remove_nodes[js[iter]].begin(), + inner_pattern_remove_nodes[js[iter]].end()); + } + GraphSafeRemoveNodes(graph, marked_nodes); + ++fusion_count; + } + + return fusion_count; +} + +PrelnEmbeddingEltwiseLayerNormFusePass:: + PrelnEmbeddingEltwiseLayerNormFusePass() { + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .End(); + + AddOpCompat(OpCompat("layer_norm")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Scale") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .End() + .AddOutput("Y") + .IsTensor() + .End() + .AddOutput("Mean") + .IsTensor() + .End() + .AddOutput("Variance") + .IsTensor() + .End() + .AddAttr("epsilon") + .IsNumGE(0.0f) + .IsNumLE(0.001f) + .End() + .AddAttr("begin_norm_axis") + .IsNumGT(0) + .End(); +} + +void PrelnEmbeddingEltwiseLayerNormFusePass::ApplyImpl(Graph* graph) const { + FusePassBase::Init(name_scope_, graph); + int fusion_count = + PrelnEmbeddingEltwiseLayerNormFusePass::BuildFusion(graph, name_scope_); + if (fusion_count > 0) { + graph->Set(kPrelnEmbEltwiseLayernormPass, new bool(true)); + } + AddStatis(fusion_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(preln_embedding_eltwise_layernorm_fuse_pass, + paddle::framework::ir::PrelnEmbeddingEltwiseLayerNormFusePass); +REGISTER_PASS_CAPABILITY(preln_embedding_eltwise_layernorm_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("lookup_table", 1) + .LE("lookup_table_v2", 1) + .LE("elementweise_add", 1)); diff --git a/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.h b/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..1ccc6c85d4860540dfa7a74911c6633180850344 --- /dev/null +++ b/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.h @@ -0,0 +1,166 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { +class Graph; +} // namespace ir +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +// detect start pattern. 
+// +// in_var emb in_var emb +// | | | | +// lookup_table lookup_table +// | | +// lkt_var lkt_var +// \ / +// elementwise_add +// | +// elt_out_var +// +struct PrelnEmbedding2Eltwise1Pattern : public PatternBase { + PrelnEmbedding2Eltwise1Pattern(PDPattern* pattern, + const std::string& name_scope) + : PatternBase(pattern, name_scope, "Prelnembedding2_eltwise1") {} + + void operator()(); + + PATTERN_DECL_NODE(lookup_table1_x); + PATTERN_DECL_NODE(lookup_table2_x); + PATTERN_DECL_NODE(lookup_table1_w); + PATTERN_DECL_NODE(lookup_table2_w); + PATTERN_DECL_NODE(lookup_table1); + PATTERN_DECL_NODE(lookup_table2); + PATTERN_DECL_NODE(lookup_table1_out); + PATTERN_DECL_NODE(lookup_table2_out); + PATTERN_DECL_NODE(eltwise_add); + PATTERN_DECL_NODE(eltwise_add_out); +}; + +// detect repeats inner pattern +// +// elt_out_var in_var emb +// \ | | +// \ lookup_table +// \ | +// \ lkt_var +// \ / +// elementwise_add +// | | +// elementwise_add elt_out_var +// +struct PrelnEmbedding1Eltwise1Pattern : public PatternBase { + PrelnEmbedding1Eltwise1Pattern(PDPattern* pattern, + const std::string& name_scope) + : PatternBase(pattern, name_scope, "Prelnembedding1_eltwise1") {} + void operator()(); + PATTERN_DECL_NODE(lookup_table1_x); + PATTERN_DECL_NODE(lookup_table1_w); + PATTERN_DECL_NODE(lookup_table1); + PATTERN_DECL_NODE(lookup_table1_out); + PATTERN_DECL_NODE(eltwise_add_in); + PATTERN_DECL_NODE(eltwise_add); + PATTERN_DECL_NODE(eltwise_add_out); +}; + +// detect end pattern +// +// elementwise_add +// | | +// | elt_out_var +// | scale | bias +// | \ | / +// elementwise_add layer_norm +// +struct PrelnSkipLayerNorm : public PatternBase { + PrelnSkipLayerNorm(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "Prelnskip_layernorm") {} + void operator()(); + PATTERN_DECL_NODE(eltwise_add); + PATTERN_DECL_NODE(eltwise_add_out); + PATTERN_DECL_NODE(layer_norm); + PATTERN_DECL_NODE(layer_norm_bias); + PATTERN_DECL_NODE(layer_norm_scale); + PATTERN_DECL_NODE(layer_norm_out); + // Delete the mean and var nodes in the graph. + PATTERN_DECL_NODE(layer_norm_mean); + PATTERN_DECL_NODE(layer_norm_variance); +}; +} // namespace patterns + +// The PrelnEmbeddingEltwiseLayerNormFusePass detect the following pattern: +// +// inputs operator output +// -------------------------------------------------------------------- +// (word, weights_0) lookup_table -> word_emb +// (pos, weights_1) lookup_table -> pos_emb +// (sent, weights_2) lookup_table -> sent_emb +// (word_emb, pos_emb) elementweise_add -> elementwise_out_0 +// (elemtwise_out_0, sent_emb) elementweise_add -> elementwise_out_1 +// (elementwise_out_1, scale, bias) layer_norm -> layer_norm_out +// +// and then convert the corresponding subgraph to: +// +// (word, pos, sent, weights_0, weights_1, weights_2, +// scale, baias) Prelnembedding_eltwise_layernorm -> layer_norm_out + +// elementwise_add_out +// +// +// in_var emb_var in_var emb_var in_var emb_var in_var emb_var +// | | | | | | | | +// lookup_table lookup_table lookup_table ... lookup_table +// | | | | +// lkt_var lkt_var lkt_var lkt_var +// \ / | ... 
| +// elementwise_add | | +// \ / | +// elementwise_add | +// | | +// elt_var / +// \ / +// elementwise_add +// | | +// elementwise_add layer_norm + +class PrelnEmbeddingEltwiseLayerNormFusePass : public FusePassBase { + public: + PrelnEmbeddingEltwiseLayerNormFusePass(); + virtual ~PrelnEmbeddingEltwiseLayerNormFusePass() {} + + protected: + void ApplyImpl(Graph* graph) const; + int BuildFusion(Graph* graph, const std::string& name_scope + /*const Scope* scope*/) const; + const std::string name_scope_{"preln_embedding_eltwise_layernorm_fuse"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..1b7b82cbca9e86587467fa0888eca6c6fdc2e162 --- /dev/null +++ b/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc @@ -0,0 +1,210 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.h" + +#include + +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { +class Node; +} // namespace ir +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +struct PrelnSkipLayerNorm : public PatternBase { + PrelnSkipLayerNorm(PDPattern *pattern, const std::string &name_scope) + : PatternBase(pattern, name_scope, "preln_skip_layernorm") {} + + void operator()(PDNode *x, PDNode *y); + + // declare operator node's name + PATTERN_DECL_NODE(fused_skipe_layernorm); + PATTERN_DECL_NODE(elementwise); + PATTERN_DECL_NODE(layer_norm); + // declare variable node's name + PATTERN_DECL_NODE( + elementwise_out); // (elementwise_input_x,elementwise_input_y) -> + // elementwise_out + PATTERN_DECL_NODE(layer_norm_bias); + PATTERN_DECL_NODE(layer_norm_scale); + PATTERN_DECL_NODE(layer_norm_out); + PATTERN_DECL_NODE(layer_norm_mean); + PATTERN_DECL_NODE(layer_norm_variance); +}; + +void PrelnSkipLayerNorm::operator()(PDNode *x, PDNode *y) { + // Create nodes for elementwise add op. + x->assert_is_op_input("elementwise_add", "X"); + y->assert_is_op_input("elementwise_add", "Y"); + auto *elementwise = + pattern->NewNode(elementwise_repr())->assert_is_op("elementwise_add"); + auto *elementwise_out_var = pattern->NewNode(elementwise_out_repr()) + ->assert_is_op_output("elementwise_add") + ->assert_is_op_input("layer_norm", "X") + ->assert_is_op_input("elementwise_add", "Y"); + + // Add links for elementwise_add op. + elementwise->LinksFrom({x, y}).LinksTo({elementwise_out_var}); + + // Create nodes for layer_norm op. 
+ auto *layer_norm = + pattern->NewNode(layer_norm_repr())->assert_is_op("layer_norm"); + auto *layer_norm_bias_var = pattern->NewNode(layer_norm_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Bias"); + auto *layer_norm_scale_var = pattern->NewNode(layer_norm_scale_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Scale"); + + auto *layer_norm_out_var = pattern->NewNode(layer_norm_out_repr()) + ->AsOutput() + ->assert_is_op_output("layer_norm", "Y"); + auto *layer_norm_mean_var = pattern->NewNode(layer_norm_mean_repr()) + ->AsOutput() + ->assert_is_op_output("layer_norm", "Mean"); + auto *layer_norm_variance_var = + pattern->NewNode(layer_norm_variance_repr()) + ->AsOutput() + ->assert_is_op_output("layer_norm", "Variance"); + + // Add links for layer_norm op. + layer_norm + ->LinksFrom( + {elementwise_out_var, layer_norm_bias_var, layer_norm_scale_var}) + .LinksTo( + {layer_norm_out_var, layer_norm_mean_var, layer_norm_variance_var}); +} + +} // namespace patterns + +void PrelnSkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::PreconditionNotMet("graph should not be null.")); + FusePassBase::Init("preln_skip_layernorm_fuse", graph); + int found_subgraph_count = 0; + + GraphPatternDetector gpd; + auto *x = gpd.mutable_pattern() + ->NewNode("preln_skip_layernorm_fuse/x") + ->AsInput() + ->assert_is_op_input("elementwise_add", "X") + ->assert_var_not_persistable(); + auto *y = gpd.mutable_pattern() + ->NewNode("preln_skip_layernorm_fuse/y") + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y") + ->assert_var_not_persistable(); + patterns::PrelnSkipLayerNorm fused_pattern(gpd.mutable_pattern(), + "preln_skip_layernorm_fuse"); + fused_pattern(x, y); + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *graph) { + if (subgraph.count(x) <= 0 || subgraph.count(y) <= 0) { + LOG(WARNING) << "The subgraph is empty."; + return; + } + + if (!IsCompat(subgraph, graph)) { + LOG(WARNING) << "preln_skip_layernorm pass in op compat failed."; + return; + } + + VLOG(4) << "handle PrelnSkipLayerNorm fuse"; + GET_IR_NODE_FROM_SUBGRAPH(elementwise, elementwise, fused_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, fused_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm, layer_norm, fused_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_bias, layer_norm_bias, fused_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_scale, layer_norm_scale, + fused_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_out, layer_norm_out, fused_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_mean, layer_norm_mean, fused_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_variance, layer_norm_variance, + fused_pattern); + + std::unordered_set del_node_set; + + // Create an PrelnSkipLayerNorm op node + OpDesc new_desc; + new_desc.SetType("preln_skip_layernorm"); + + // inputs + new_desc.SetInput("X", {subgraph.at(x)->Name()}); + new_desc.SetInput("Y", {subgraph.at(y)->Name()}); + new_desc.SetInput("Scale", {layer_norm_scale->Name()}); + new_desc.SetInput("Bias", {layer_norm_bias->Name()}); + + if (elementwise->Op()->HasAttr("out_threshold") && + layer_norm->Op()->HasAttr("out_threshold")) { + new_desc.SetAttr("enable_int8", true); + new_desc.SetAttr("out_0_threshold", + layer_norm->Op()->GetAttr("out_threshold")); + new_desc.SetAttr("out_1_threshold", + elementwise->Op()->GetAttr("out_threshold")); + } + + // outputs + 
new_desc.SetOutput("Out_0", {layer_norm_out->Name()}); + new_desc.SetOutput("Out_1", {elementwise_out->Name()}); + + // attrs + new_desc.SetAttr("epsilon", layer_norm->Op()->GetAttr("epsilon")); + new_desc.SetAttr("begin_norm_axis", + layer_norm->Op()->GetAttr("begin_norm_axis")); + + auto fused_node = graph->CreateOpNode(&new_desc); // OpDesc will be copied. + + del_node_set.insert(elementwise); + del_node_set.insert(layer_norm); + del_node_set.insert(layer_norm_mean); + del_node_set.insert(layer_norm_variance); + GraphSafeRemoveNodes(graph, del_node_set); + + IR_NODE_LINK_TO(subgraph.at(x), fused_node); + IR_NODE_LINK_TO(subgraph.at(y), fused_node); + IR_NODE_LINK_TO(layer_norm_scale, fused_node); + IR_NODE_LINK_TO(layer_norm_bias, fused_node); + IR_NODE_LINK_TO(fused_node, layer_norm_out); + IR_NODE_LINK_TO(fused_node, elementwise_out); + + found_subgraph_count++; + }; + + gpd(graph, handler); + AddStatis(found_subgraph_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(preln_skip_layernorm_fuse_pass, + paddle::framework::ir::PrelnSkipLayerNormFusePass); +REGISTER_PASS_CAPABILITY(preln_skip_layernorm_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("elementwise_add", 1) + .EQ("layer_norm", 0)); diff --git a/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.h b/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..52447bfd8d3f1b8cb56080d8fd753a559477c783 --- /dev/null +++ b/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.h @@ -0,0 +1,86 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +// | | | | +// other_op1 other_op2 other_op1 other_op2 +// | | fuse \ / +// |------elementwise_add -> skip_layernorm +// | | | | +// other_op4 layer_norm other_op4 other_op3 +// | +// other_op3 +class Graph; + +class PrelnSkipLayerNormFusePass : public FusePassBase { + public: + PrelnSkipLayerNormFusePass() { + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsIntIn({0, -1}) + .End(); + + AddOpCompat(OpCompat("layer_norm")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Scale") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .End() + .AddOutput("Y") + .IsTensor() + .End() + .AddOutput("Mean") + .IsTensor() + .End() + .AddOutput("Variance") + .IsTensor() + .End() + .AddAttr("epsilon") + .IsNumGE(0.0f) + .IsNumLE(0.001f) + .End() + .AddAttr("begin_norm_axis") + .IsNumGT(0) + .End(); + } + + virtual ~PrelnSkipLayerNormFusePass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/naive_executor_test.cc b/paddle/fluid/framework/naive_executor_test.cc index c917630666b082ab7148550707f9f1f720aa25d3..2f3c3f3d06e327bc583c817bdfcc78345d8adff5 100644 --- a/paddle/fluid/framework/naive_executor_test.cc +++ b/paddle/fluid/framework/naive_executor_test.cc @@ -67,4 +67,4 @@ TEST(NaiveExecutor, Basic) { } // namespace framework } // namespace paddle -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index b42f2da2a4d78b2913aedd01172771ce51926a2a..a0708f28e37ee2088d82f1b73b79f1452dc0f262 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -25,12 +25,12 @@ USE_OP(fill_constant); USE_OP(uniform_random); USE_OP(lookup_table); USE_OP(transpose2); -USE_OP(reshape2); +USE_OP_ITSELF(reshape2); USE_OP(split); USE_OP(slice); USE_OP(concat); USE_OP(matmul); -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); USE_OP(sigmoid); USE_OP(tanh); USE_OP(elementwise_mul); @@ -39,9 +39,9 @@ USE_OP(reduce_mean); USE_OP(reduce_sum); USE_OP(reduce_sum_grad); USE_OP(reduce_mean_grad); -USE_OP(reshape2_grad); +USE_OP_ITSELF(reshape2_grad); USE_OP(softmax_with_cross_entropy_grad); -USE_OP(elementwise_add_grad); +USE_OP_ITSELF(elementwise_add_grad); USE_OP(matmul_grad); USE_OP(square); USE_OP(transpose2_grad); diff --git a/paddle/fluid/framework/op_kernel_type.cc b/paddle/fluid/framework/op_kernel_type.cc index 7dac6a092d245fab3781c0af0bb6d4162b5be47c..9d1f09869988df96205cad5cc29aba8ea7edd945 100644 --- a/paddle/fluid/framework/op_kernel_type.cc +++ b/paddle/fluid/framework/op_kernel_type.cc @@ -47,10 +47,20 @@ size_t OpKernelType::Hash::operator()(const OpKernelType& key) const { "Too many OpKernel attribute values, expected maximum " "value is 64, received value is %d.", cur_loc)); - +#ifdef PADDLE_WITH_CUSTOM_DEVICE + std::hash hasher; + size_t seed = + hasher(place + data_type + data_layout + library_type + customized_value); + if (platform::is_custom_place(key.place_)) { + seed ^= std::hash{}(key.place_.GetDeviceType()) + 0x9e3779b9 + + (seed << 6) + (seed >> 2) 
+ 4; + } + return seed; +#else std::hash hasher; return hasher(place + data_type + data_layout + library_type + customized_value); +#endif } bool OpKernelType::operator==(const OpKernelType& o) const { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 4670f043102d917f770b6fa5ca661a860941df33..7ab4e2acecfccd913343fc453338a26ddd9c92dd 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -29,6 +29,7 @@ limitations under the License. */ #include "paddle/fluid/framework/transfer_scope_cache.h" #include "paddle/fluid/framework/unused_var_check.h" #include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/pten/common/scalar.h" @@ -244,6 +245,15 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { #else auto dev_id = place.device; platform::SetMLUDeviceId(dev_id); +#endif + } else if (platform::is_custom_place(place)) { +#ifndef PADDLE_WITH_CUSTOM_DEVICE + PADDLE_THROW(platform::errors::Unavailable( + "Cannot run operator on place %s, please recompile paddle or " + "reinstall Paddle with CustomDevice support.", + place)); +#else + platform::DeviceManager::SetDevice(place); #endif } @@ -1326,8 +1336,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope, OpKernelType OperatorWithKernel::InnerGetExpectedKernelType( const ExecutionContext& ctx) const { - auto& dev_ctx = ctx.device_context(); - auto expected_kernel_key = this->GetExpectedKernelType(ctx); if (HasAttr("op_device")) { if (Attr("op_device") == "cpu") { @@ -1344,12 +1352,20 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType( } // when the Op that only has CPUKernel is assigned to GPU, the CPUKernel // will be executed and a warning will be given at the same time. + expected_kernel_key.place_ = platform::CPUPlace(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (SupportGPU()) { + auto& dev_ctx = ctx.device_context(); expected_kernel_key.place_ = dev_ctx.GetPlace(); - } else if (SupportNPU()) { + } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + if (SupportNPU()) { + auto& dev_ctx = ctx.device_context(); expected_kernel_key.place_ = dev_ctx.GetPlace(); - } else { - expected_kernel_key.place_ = platform::CPUPlace(); + } +#endif + if (platform::is_cpu_place(expected_kernel_key.place_)) { LOG_FIRST_N(WARNING, 1) << "Op(" << type_ << ") has no CUDA implementation. 
It will be assigned to CPUPlace."; @@ -1924,12 +1940,10 @@ Scope* OperatorWithKernel::PreparePtenData( for (size_t i = 0; i < input_defs.size(); ++i) { auto& in_def = input_defs.at(i); - auto it = ctx->inputs.find(input_names[i]); - if (it == ctx->inputs.end()) { + if (ctx->inputs.find(input_names[i]) == ctx->inputs.end()) { continue; } - - auto& ins_vector = it->second; + auto& ins_vector = ctx->inputs.at(input_names[i]); auto& name_vec = name_map.at(input_names[i]); bool should_skip_input = no_buffer_ins && no_buffer_ins->count(input_names[i]) > 0; @@ -1940,7 +1954,6 @@ Scope* OperatorWithKernel::PreparePtenData( if (var == nullptr || !VarIsTensor(*var)) { continue; } - auto* tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var); // When no_buffer_ins then checking of Tensor::holder_ is @@ -2165,6 +2178,8 @@ void OperatorWithKernel::BuildPtenKernelContext( pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(float, attr)); } else if (attr_defs[i].type_index == std::type_index(typeid(bool))) { pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(bool, attr)); + } else if (attr_defs[i].type_index == std::type_index(typeid(int64_t))) { + pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(int64_t, attr)); } else if (attr_defs[i].type_index == std::type_index(typeid(std::string))) { pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(std::string, attr)); diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc index bca6a0a4cb8e0d61574f2b7be00e1f67b70ec035..79e6da987ef09db5ed43dfb8168dd13fa0cf885e 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc @@ -661,6 +661,6 @@ TEST(BuildCinnPassTest, NoNeedBufferInput) { USE_PASS(build_cinn_pass); USE_OP(mul); USE_OP(relu); -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); USE_OP(relu_grad); -USE_OP(elementwise_add_grad); +USE_OP_ITSELF(elementwise_add_grad); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 535c9ab58e295fae2048bb162adfb0384745d0ae..c62ece7f0dccc2612b6b53371805d29375416772 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -88,7 +88,7 @@ const CinnCompiledObject& CinnCompiler::Compile( if (cache_by_struct_.count(cur_key_by_struct) != 0) { exist = true; cache_by_address_[cur_key_by_address] = - cache_by_struct_.at(cur_key_by_struct).get(); + cache_by_struct_.at(cur_key_by_struct); } } } @@ -98,12 +98,13 @@ const CinnCompiledObject& CinnCompiler::Compile( CompileGraph(graph, input_tensors, target, compiled_num, stream); pten::AutoWRLock w_guard{&rwlock_}; if (!cache_by_struct_.count(cur_key_by_struct)) { - cache_by_address_[cur_key_by_address] = compiled_res.get(); - cache_by_struct_[cur_key_by_struct] = std::move(compiled_res); + cache_by_address_[cur_key_by_address] = compiled_num; + cache_by_struct_[cur_key_by_struct] = compiled_num; + index2cache_.emplace(compiled_num, std::move(compiled_res)); } } pten::AutoRDLock guard{&rwlock_}; - const auto& cached_boj = *cache_by_address_[cur_key_by_address]; + const auto& cached_boj = *index2cache_[cache_by_address_[cur_key_by_address]]; return cached_boj; } @@ -115,6 +116,15 @@ const CinnCompiledObject& CinnCompiler::Compile( return Compile(graph, input_tensors, target, stream); } +const CinnCompiledObject& CinnCompiler::GetCompiledObject( + int64_t cached_index) const { + auto 
res = index2cache_.find(cached_index); + PADDLE_ENFORCE_NE(res, index2cache_.end(), + platform::errors::InvalidArgument( + "Index(%ld) not found in cache", cached_index)); + return *res->second; +} + std::string CinnCompiler::AddGraph(std::unique_ptr graph) { std::string graph_key; ProgramDesc program; @@ -202,6 +212,7 @@ void CinnCompiler::Clear() { graphs_.clear(); cache_by_address_.clear(); cache_by_struct_.clear(); + index2cache_.clear(); } real_compiled_num_.store(0); } @@ -240,6 +251,7 @@ std::unique_ptr CinnCompiler::CompileGraph( compiled_obj->launch_context = std::make_unique( compiled_obj->paddle2cinn_varmap, compiled_obj->scope); + compiled_obj->cached_index = compiled_num; return compiled_obj; } diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h index 91a7b4e5a11f0054112df9645c4f8b8f3c22501b..d7ae743111ea73fe9d931a79e89cb08a406b60ce 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h @@ -53,6 +53,7 @@ struct CinnCompiledObject { std::shared_ptr<::cinn::hlir::framework::Scope> scope; std::unordered_map paddle2cinn_varmap; std::unique_ptr launch_context; + std::int64_t cached_index; }; // Entrance to use CINN. @@ -76,6 +77,8 @@ class CinnCompiler { const std::map& input_tensors, const ::cinn::common::Target& target, void* stream = nullptr); + const CinnCompiledObject& GetCompiledObject(int64_t cached_index) const; + std::string AddGraph(std::unique_ptr graph); const ir::Graph& FindGraph(const std::string& graph_key) const; @@ -101,12 +104,12 @@ class CinnCompiler { void* stream = nullptr) const; std::unordered_map> graphs_; - std::unordered_map + std::unordered_map cache_by_address_; - std::unordered_map, CinnCacheKey::Hash> + std::unordered_map cache_by_struct_; + std::unordered_map> + index2cache_; std::atomic_int64_t real_compiled_num_{0}; mutable pten::RWLock rwlock_; diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc index 6769413d99bafd7a26a3486da6928d06ad920ace..05cd9e8a2e8a0d9fb533d9b92b7e1c9d7742629b 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc @@ -270,13 +270,20 @@ TEST(CinnCompilerTest, Compile) { auto compile_fn = [&](const Target& target) { const auto& compiled_obj = cinn_compiler->Compile(compiling_graph, input_tensors, target); + ASSERT_NE(compiled_obj.compiler, nullptr); ASSERT_NE(compiled_obj.runtime_program, nullptr); ASSERT_NE(compiled_obj.scope, nullptr); ASSERT_FALSE(compiled_obj.paddle2cinn_varmap.empty()); + ASSERT_NE(compiled_obj.launch_context, nullptr); const auto& cached_obj = cinn_compiler->Compile(compilation_key, input_tensors, target); ASSERT_EQ(reinterpret_cast(&compiled_obj), reinterpret_cast(&cached_obj)); + ASSERT_EQ(cached_obj.cached_index + 1, cinn_compiler->real_compiled_num()); + const auto& ret_obj = + cinn_compiler->GetCompiledObject(cached_obj.cached_index); + ASSERT_EQ(reinterpret_cast(&compiled_obj), + reinterpret_cast(&ret_obj)); }; // GPU Compilation @@ -295,4 +302,4 @@ USE_PASS(build_cinn_pass); USE_PASS(graph_viz_pass); USE_OP(mul); USE_OP(relu); -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index aed5e2c7405ac0782ef3d9438b4958432584525a..1a826f6bdd5e7344d9983c026fc2d4cc8812d15a 100644 --- 
a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -532,6 +532,21 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't use XPU device since it's not compiled with XPU," "Please recompile or reinstall Paddle with XPU support.")); +#endif + } else if (platform::is_custom_place(place)) { +#if defined(PADDLE_WITH_CUSTOM_DEVICE) + if (IsFastEagerDeletionModeEnabled()) { + gc.reset( + new CustomDeviceUnsafeFastGarbageCollector(place, max_memory_size)); + } else { + gc.reset(new CustomStreamGarbageCollector(place, max_memory_size)); + } + VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't use custom device since it's not compiled with " + "CustomDevice," + "Please recompile or reinstall Paddle with CustomDevice support.")); #endif } else if (platform::is_cpu_place(place)) { gc.reset(new CPUGarbageCollector(place, max_memory_size)); diff --git a/paddle/fluid/framework/pten_utils.cc b/paddle/fluid/framework/pten_utils.cc index ea2e62d89f63da2bfe7e49c34e8aecad4e6138e0..2d2cc30497e288046256af5564620d40913cf3bf 100644 --- a/paddle/fluid/framework/pten_utils.cc +++ b/paddle/fluid/framework/pten_utils.cc @@ -186,8 +186,9 @@ KernelArgsNameMakerByOpProto::GetAttrsArgsNames() { } KernelSignature KernelArgsNameMakerByOpProto::GetKernelSignature() { - return KernelSignature(op_proto_->type(), GetInputArgsNames(), - GetAttrsArgsNames(), GetOutputArgsNames()); + return KernelSignature(pten::TransToPtenKernelName(op_proto_->type()), + GetInputArgsNames(), GetAttrsArgsNames(), + GetOutputArgsNames()); } std::once_flag kernel_sig_map_init_flag; diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 844b5d82695009415815eaba819cf6a8bf5a89e3..e510257c6106b8d3540e927f0e6fd76a9e73ea09 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -91,7 +91,29 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } #endif - +#ifdef PADDLE_WITH_CUSTOM_DEVICE + else if (platform::is_custom_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream); + } else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_custom_place(dst_place)) { + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream); + } else if (platform::is_custom_place(src_place) && // NOLINT + platform::is_custom_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data async from " << src_place << " to " + << dst_place; + return; + } + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream); + } +#endif #ifdef PADDLE_WITH_XPU else if (platform::is_xpu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { @@ -376,7 +398,8 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); const platform::DeviceContext* dev_ctx; if (platform::is_gpu_place(dst_place) || platform::is_npu_place(dst_place) || - platform::is_mlu_place(dst_place)) { + platform::is_mlu_place(dst_place) || + 
platform::is_custom_place(dst_place)) { dev_ctx = pool.Get(dst_place); } else { dev_ctx = pool.Get(src.place()); @@ -436,6 +459,26 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + else if (platform::is_custom_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { /* custom_device -> cpu*/ + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); + } + else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_custom_place(dst_place)) { /* cpu -> custom_device*/ + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); + } + else if (platform::is_custom_place(src_place) && // NOLINT + platform::is_custom_place( + dst_place)) { /* custom_device -> custom_device*/ + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data sync from " << src_place << " to " + << dst_place; + return; + } + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); + } +#endif #ifdef PADDLE_WITH_XPU else if (platform::is_xpu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { @@ -664,6 +707,13 @@ class AnyVisitor : public boost::static_visitor { const platform::CUDAPinnedPlace& cpu) const { return *out.data(); } + + bool GetResult(const framework::Tensor& out, + const platform::CustomPlace& custom_dev) const { + PADDLE_THROW(platform::errors::Unimplemented("Not supported on place (%s) ", + custom_dev)); + return false; + } }; template @@ -903,6 +953,11 @@ struct BothFalseVisitor : public boost::static_visitor<> { out_ptr[i] = lhs && rhs; } } + + void VisitorImpl(const platform::CustomPlace& custom_dev) const { + PADDLE_THROW( + platform::errors::Unimplemented("CustomPlace is not supported")); + } }; void TensorIsfinite(const framework::Tensor& tensor, framework::Tensor* out) { @@ -1036,6 +1091,29 @@ void TensorToStream(std::ostream& os, const Tensor& tensor, #else PADDLE_THROW(platform::errors::Unimplemented( "NPUPlace is not supported when not compiled with NPU")); +#endif + } else if (platform::is_custom_place(tensor.place())) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB + std::unique_ptr buf(new char[kBufSize]); + auto& custom_device_context = + static_cast(dev_ctx); + platform::CPUPlace cpu; + uintptr_t data = reinterpret_cast(data_ptr); + while (size != 0) { + size_t size_to_write = std::min(kBufSize, static_cast(size)); + memory::Copy(cpu, buf.get(), tensor.place(), + reinterpret_cast(data), size_to_write, + custom_device_context.stream()); + custom_device_context.Wait(); + os.write(buf.get(), size_to_write); + data += size_to_write; + size -= size_to_write; + } +#else + PADDLE_THROW(platform::errors::Unimplemented( + "CustomPlace is not supported when not compiled with " + "CustomDevice")); #endif } else { os.write(static_cast(data_ptr), @@ -1093,10 +1171,11 @@ void TensorFromStream(std::istream& is, Tensor* tensor, if (platform::is_gpu_place(dev_ctx.GetPlace()) || platform::is_xpu_place(dev_ctx.GetPlace()) || platform::is_mlu_place(dev_ctx.GetPlace()) || - platform::is_npu_place(dev_ctx.GetPlace())) { + platform::is_npu_place(dev_ctx.GetPlace()) || + platform::is_custom_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \ - defined(PADDLE_WITH_ASCEND_CL) + defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CUSTOM_DEVICE) Tensor cpu_tensor; 
cpu_tensor.Resize(framework::make_ddim(shape)); framework::VisitDataType( @@ -1105,7 +1184,8 @@ void TensorFromStream(std::istream& is, Tensor* tensor, is.read(static_cast(buf), size); auto dst_place = dev_ctx.GetPlace(); framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor); - if (platform::is_npu_place(dev_ctx.GetPlace())) { + if (platform::is_npu_place(dev_ctx.GetPlace()) || + platform::is_custom_place(dev_ctx.GetPlace())) { dev_ctx.Wait(); } #else @@ -1163,10 +1243,11 @@ void TensorFromStream(std::istream& is, Tensor* tensor, if (platform::is_gpu_place(dev_ctx.GetPlace()) || platform::is_xpu_place(dev_ctx.GetPlace()) || platform::is_mlu_place(dev_ctx.GetPlace()) || - platform::is_npu_place(dev_ctx.GetPlace())) { + platform::is_npu_place(dev_ctx.GetPlace()) || + platform::is_custom_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \ - defined(PADDLE_WITH_ASCEND_CL) + defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CUSTOM_DEVICE) Tensor cpu_tensor; cpu_tensor.Resize(framework::make_ddim(dims)); framework::VisitDataType( @@ -1175,7 +1256,8 @@ void TensorFromStream(std::istream& is, Tensor* tensor, is.read(static_cast(buf), size); auto dst_place = dev_ctx.GetPlace(); framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor); - if (platform::is_npu_place(dev_ctx.GetPlace())) { + if (platform::is_npu_place(dev_ctx.GetPlace()) || + platform::is_custom_place(dev_ctx.GetPlace())) { dev_ctx.Wait(); } #else @@ -1188,9 +1270,12 @@ } else if (platform::is_mlu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( "MLUPlace is not supported when not compiled with MLU")); - } else { + } else if (platform::is_npu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( "NPUPlace is not supported when not compiled with NPU")); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "CustomPlace is not supported when not compiled with CustomDevice")); } #endif } else { diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index bcaf3c719cb720d76c78a2b15475652eda793cad..1c1a86f1d32d3c3553e2201432453e5e2fdaa1e3 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -180,6 +180,17 @@ void TensorFromArray(const T* src, const size_t& array_size, reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + else if (platform::is_custom_place(dst_place)) { // NOLINT + memory::Copy( + dst_place, dst_ptr, src_place, src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "TensorFromArray on %s is not supported.", dst_place)); + } } template @@ -241,6 +252,17 @@ void TensorFromVector(const std::vector<T>& src, reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + else if (platform::is_custom_place(dst_place)) { // NOLINT + memory::Copy( + dst_place, dst_ptr, src_place, src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "TensorFromVector on %s is not supported.", dst_place)); + } } // The fully specialized function should be inline to avoid @@ -300,6 +322,17 @@ inline void TensorFromVector(const std::vector<bool>& src, reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + else if (platform::is_custom_place(dst_place)) { //
NOLINT + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream); + } +#endif + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "TensorFromVector on %s is not supported.", dst_place)); + } delete[] array; } @@ -369,6 +402,15 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx, reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + else if (platform::is_custom_place(src.place())) { // NOLINT + memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr); + } +#endif + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "TensorToVector on %s is not supported.", src.place())); + } } template <> @@ -410,6 +452,11 @@ inline void TensorToVector(const Tensor& src, dst_place, dst_ptr, src.place(), src_ptr, size, reinterpret_cast(ctx).stream()); } +#endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + else if (platform::is_custom_place(src.place())) { // NOLINT + memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr); + } #endif for (unsigned int i = 0; i < src.numel(); i++) { (*dst)[i] = static_cast(array[i]); diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 89d9324039c15cecd8ba1518aae3645e2f540f9d..90cf0e76e000736f730121a6fcce841aa38a59ae 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -44,9 +44,9 @@ if(WITH_GLOO) endif() if(NOT WITH_ASCEND_CL) -cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function) +cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function pten_tensor) else() -cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function npu_op_runner) +cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function npu_op_runner pten_tensor) endif() add_subdirectory(tests) diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index 0913d54c8359aa48a1fd5213b87ddf632dc595d9..547fa02326bec36858717c8f66a268551423dbaa 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -340,8 +340,8 @@ NameVarMap AutoCastInputs(const std::string& op_type, } template NameVarMap AutoCastInputs( const std::string& op_type, const NameVarMap& ins); -template NameVarMap AutoCastInputs( - const std::string& op_type, const NameVarMap& ins); +template NameVarMap AutoCastInputs( + const std::string& op_type, const NameVarMap& ins); template NameVarMap CastPureFp16Inputs(const std::string& op_type, const NameVarMap& ins) { @@ -384,7 +384,7 @@ NameVarMap CastPureFp16Inputs(const std::string& op_type, } template NameVarMap CastPureFp16Inputs( const std::string& op_type, const NameVarMap& ins); -template NameVarMap CastPureFp16Inputs( - const std::string& op_type, const NameVarMap& ins); +template NameVarMap CastPureFp16Inputs( + const std::string& op_type, const NameVarMap& ins); } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 
d57cb696526b490fac9d2610320ede8eef665d4f..17ab1f1f7c53fe69e07e04df4f98baaaf10d615f 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -35,6 +35,9 @@ #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #endif +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#endif namespace paddle { namespace imperative { @@ -180,6 +183,12 @@ class TensorAddFunctor : public boost::static_visitor<> { "is not supported in imperative mode", place)); } + void operator()(const platform::CustomPlace& place) const { + PADDLE_THROW(platform::errors::PermissionDenied( + "Gradient accumulation on place (%s) " + "is not supported in imperative mode", + place)); + } private: int64_t numel_; @@ -243,6 +252,23 @@ TType& GetInnerTensor(const paddle::experimental::Tensor& src) { return *src_tensor; } +template +TType* GetEmptyInnerTensor(paddle::experimental::Tensor* dst) { + PADDLE_ENFORCE_EQ( + dst->defined(), false, + platform::errors::Fatal( + "The underlying Tensor implementation should be nullptr")); + dst->set_impl(std::make_shared()); + auto* dst_tensor = static_cast(dst->impl().get()); + return dst_tensor; +} + +template +TType* GetEmptyInnerTensor(paddle::imperative::VariableWrapper* dst) { + auto* dst_tensor = dst->MutableVar()->GetMutable(); + return dst_tensor; +} + template void TensorAdd(const VarType& src, VarType* dst) { pten::DenseTensor* dst_tensor = GetInnerMutableTensor(dst); @@ -314,7 +340,14 @@ void TensorAdd(const VarType& src, VarType* dst) { return; } #endif - +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (platform::is_custom_place(place)) { + PADDLE_THROW(platform::errors::Unimplemented( + "Gradient accumulation of data type (%s) on place (%s) is not " + "supported in imperative mode", + framework::DataTypeToString(data_type), place)); + } +#endif #ifdef PADDLE_WITH_XPU if (platform::is_xpu_place(place)) { if (data_type == framework::DataTypeTrait::DataType()) { @@ -332,6 +365,35 @@ void TensorAdd(const VarType& src, VarType* dst) { } #endif +#ifdef PADDLE_WITH_MLU + if (platform::is_mlu_place(place)) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + platform::DeviceContext* ctx = pool.Get(place); + auto dev_ctx = dynamic_cast(ctx); + if (data_type == framework::DataTypeTrait::DataType()) { + dst_tensor->mutable_data(place); + } else if (data_type == + framework::DataTypeTrait::DataType()) { + dst_tensor->mutable_data(place); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Gradient accumulation of data type (%s) on place (%s) is not " + "supported in imperative mode", + framework::DataTypeToString(data_type), place)); + } + static const float alpha = 1.f; + static const float beta = 1.f; + operators::MLUCnnlTensorDesc src_tensor_desc(src_tensor); + operators::MLUCnnlTensorDesc dst_tensor_desc(*dst_tensor); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlAssignAdd( + dev_ctx->cnnl_handle(), static_cast(&alpha), + src_tensor_desc.get(), operators::GetBasePtr(&src_tensor), nullptr, 0, + static_cast(&beta), dst_tensor_desc.get(), + operators::GetBasePtr(dst_tensor))); + return; + } +#endif + PADDLE_TENSOR_ADD(float); #ifndef PADDLE_WITH_XPU @@ -473,13 +535,14 @@ template void SelectedRowsAddTensor( // Note(chenweihang): when two selected rows need to be added, // adding one to another is not equal to merging two selected rows // to one then add it to a empty selected rows, the after is correct -// Note(chenweihang): when two selected rows need to 
be added, -// adding one to another is not equal to merging two selected rows -// to one then add it to a empty selected rows, the after is correct -std::shared_ptr SelectedRowsMerge( - const framework::Variable& src1, const framework::Variable& src2) { - auto& src_selected_rows1 = src1.Get(); - auto& src_selected_rows2 = src2.Get(); +template +std::shared_ptr SelectedRowsMerge(const VarType& src1, + const VarType& src2) { + const pten::SelectedRows& src_selected_rows1 = + GetInnerTensor(src1); + const pten::SelectedRows& src_selected_rows2 = + GetInnerTensor(src2); + auto place = src_selected_rows1.value().place(); auto data_type = framework::TransToProtoVarType(src_selected_rows1.value().dtype()); @@ -488,9 +551,10 @@ std::shared_ptr SelectedRowsMerge( std::vector src_selected_rows; src_selected_rows.emplace_back(&src_selected_rows1); src_selected_rows.emplace_back(&src_selected_rows2); - auto dst_var = std::make_shared("Temp"); - auto* dst_selected_rows = - dst_var->MutableVar()->GetMutable(); + + auto dst_var = std::make_shared("Temp"); + pten::SelectedRows* dst_selected_rows = + GetEmptyInnerTensor(dst_var.get()); #define PADDLE_SELECTED_ROWS_ADD(dev_ctx_type, cpp_type) \ if (data_type == framework::DataTypeTrait::DataType()) { \ @@ -515,12 +579,17 @@ std::shared_ptr SelectedRowsMerge( #endif #undef PADDLE_SELECTED_ROWS_ADD - PADDLE_THROW(platform::errors::InvalidArgument( "Not supported data type %s for SelectedRowsMerge", framework::DataTypeToString(data_type))); } +template std::shared_ptr SelectedRowsMerge( + const paddle::experimental::Tensor& src1, + const paddle::experimental::Tensor& src2); +template std::shared_ptr SelectedRowsMerge( + const framework::Variable& src1, const framework::Variable& src2); + void VariableWrapperAdd(std::shared_ptr var, VariableWrapper* dst_var, bool unchange_input) { auto& src = var->Var(); @@ -547,7 +616,7 @@ void VariableWrapperAdd(std::shared_ptr var, *dst = std::move(*(var->MutableVar())); } } else if (src.IsType()) { - auto temp = SelectedRowsMerge(src, *dst); + auto temp = SelectedRowsMerge(src, *dst); *dst = std::move(*(temp->MutableVar())); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -603,7 +672,7 @@ void GradientAccumulator::AccumulateGrad() { SelectedRowsAddToTensor(*dst, src); *dst = std::move(*src); } else if (src->IsType()) { - auto temp = SelectedRowsMerge(*src, *dst); + auto temp = SelectedRowsMerge(*src, *dst); *dst = std::move(*(temp->MutableVar())); } } else { diff --git a/paddle/fluid/imperative/gradient_accumulator.h b/paddle/fluid/imperative/gradient_accumulator.h index 6371f64fe61044d6cc9ea8a10e5dbcacd3d187e4..ee2df582e81ee5cefe1faf9f3700b91c6adae434 100644 --- a/paddle/fluid/imperative/gradient_accumulator.h +++ b/paddle/fluid/imperative/gradient_accumulator.h @@ -17,10 +17,10 @@ #include #include #include - #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/imperative/hooks.h" #include "paddle/fluid/imperative/layer.h" +#include "paddle/pten/api/include/tensor.h" namespace paddle { namespace imperative { @@ -164,6 +164,10 @@ class SortedGradientAccumulator : public GradientAccumulator { std::vector tmp_grad_vars_; }; +template +std::shared_ptr SelectedRowsMerge(const VarType& src1, + const VarType& src2); + template void SelectedRowsAddToTensor(const VarType& src, VarType* dst); diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index b8c423f77bd235693f8bbf90a00630a8c855e00f..ed455b7fd0314e6d1e5cd38107568d5f8e89f84d 100644 --- a/paddle/fluid/imperative/layer.cc 
+++ b/paddle/fluid/imperative/layer.cc @@ -177,9 +177,9 @@ std::string LayerDebugString(const std::string& op_type, } std::string LayerDebugString(const std::string& op_type, - const NameVarMap& ins, - const NameVarMap& outs) { - return LayerDebugStringImpl(op_type, ins, outs); + const NameVarMap& ins, + const NameVarMap& outs) { + return LayerDebugStringImpl(op_type, ins, outs); } template @@ -194,11 +194,16 @@ static void SetForwardDataTypeOfGradVars(const NameVarMap& outs) { } } template <> -void SetForwardDataTypeOfGradVars( - const NameVarMap& outs) { +void SetForwardDataTypeOfGradVars( + const NameVarMap& outs) { // In eager mode we don't need this. } +void TestSetForwardDataTypeOfGradVarsEager( + const NameVarMap& outs) { + SetForwardDataTypeOfGradVars(outs); +} + VarBase::VarBase(const std::shared_ptr& var) : var_(var), grad_node_(var->GetGradNode()) { if (auto grad_var = var_->GetGradVar()) { @@ -528,12 +533,12 @@ void OpBase::Run(const framework::OperatorBase& op, } void OpBase::Run(const framework::OperatorBase& op, - const NameVarMap& ins, - const NameVarMap& outs, + const NameVarMap& ins, + const NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs, const platform::Place& place) { - OpBaseRunImpl(op, ins, outs, attrs, default_attrs, place); + OpBaseRunImpl(op, ins, outs, attrs, default_attrs, place); } void ClearNoNeedBufferInputs(OpBase* op) { diff --git a/paddle/fluid/imperative/op_base.h b/paddle/fluid/imperative/op_base.h index 58c77d0f4b6b7b7328b5d877f5a97410728ce39e..21167605d46029d2eb9d1ea3241f8d868a6a8344 100644 --- a/paddle/fluid/imperative/op_base.h +++ b/paddle/fluid/imperative/op_base.h @@ -185,8 +185,8 @@ class OpBase { const framework::AttributeMap& default_attrs, const platform::Place& place); static void Run(const framework::OperatorBase& op, - const NameVarMap& ins, - const NameVarMap& outs, + const NameVarMap& ins, + const NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs, const platform::Place& place); diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index c8ff561f7af3ad85d74eb7723b092a2a9aeaae64..c56f82d0bc08429afa288bf24cd59d264af3e2ce 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -89,11 +89,16 @@ void HandleComplexGradToRealGrad(const NameVarMap& outs) { } template <> -void HandleComplexGradToRealGrad( - const NameVarMap& outs) { +void HandleComplexGradToRealGrad( + const NameVarMap& outs) { // TODO(jiabin): Support Complex here. 
} +void TestHandleComplexGradToRealGradEager( + const NameVarMap& outs) { + HandleComplexGradToRealGrad(outs); +} + PreparedOp::PreparedOp(const framework::OperatorBase& op, const framework::RuntimeContext& ctx, const framework::OpKernelType& kernel_type, @@ -278,6 +283,16 @@ PreparedOp PrepareImpl(const NameVarMap& ins, expected_kernel_key.place_ = platform::CPUPlace(); kernel_iter = kernels.find(expected_kernel_key); } +#endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (kernel_iter == kernels.end() && + paddle::platform::is_custom_place(expected_kernel_key.place_)) { + VLOG(3) << "missing " << place.GetDeviceType() << " kernel: " << op.Type() + << ", expected_kernel_key:" << expected_kernel_key + << ", falling back to CPU one!"; + expected_kernel_key.place_ = platform::CPUPlace(); + kernel_iter = kernels.find(expected_kernel_key); + } #endif // TODO(jiabin): Add operator.cc's line 1000 part back when we need that // case @@ -312,14 +327,14 @@ PreparedOp PreparedOp::Prepare(const NameVarMap& ins, default_attrs); } -PreparedOp PreparedOp::Prepare(const NameVarMap& ins, - const NameVarMap& outs, +PreparedOp PreparedOp::Prepare(const NameVarMap& ins, + const NameVarMap& outs, const framework::OperatorWithKernel& op, const platform::Place& place, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { - return PrepareImpl(ins, outs, op, place, attrs, - default_attrs); + return PrepareImpl(ins, outs, op, place, attrs, + default_attrs); } template static void PreparedOpRunImpl( @@ -451,18 +466,18 @@ void PreparedOp::Run(const NameVarMap& ins, } } -void PreparedOp::Run(const NameVarMap& ins, - const NameVarMap& outs, +void PreparedOp::Run(const NameVarMap& ins, + const NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { if (run_pten_kernel_) { - PreparedOpRunPtImpl( + PreparedOpRunPtImpl( op_, kernel_type_, pt_kernel_signature_, pt_kernel_, dev_ctx_, ins, outs, attrs, default_attrs); } else { - PreparedOpRunImpl(op_, ctx_, kernel_type_, func_, - dev_ctx_, ins, outs, attrs, - default_attrs); + PreparedOpRunImpl(op_, ctx_, kernel_type_, func_, + dev_ctx_, ins, outs, attrs, + default_attrs); } } diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index d5dc53196dd7f1abe854785e0e5c1ccd363d1c3f..a6b80e0d4e1927a8012ff90d54ef71857d504fc6 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -63,8 +63,8 @@ void SetForwardDataTypeOfGradVar(const std::shared_ptr& var) { } template <> -void SetForwardDataTypeOfGradVar( - const std::shared_ptr& var) { +void SetForwardDataTypeOfGradVar( + const std::shared_ptr& var) { VLOG(10) << "Var in Eager does not support SetForwardDataTypeOfGradVar: " << var->name(); // TODO(jiabin): SetForwardDataType of Grad var is not supported yet in @@ -171,8 +171,8 @@ class PreparedOp { const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs); - static PreparedOp Prepare(const NameVarMap& ins, - const NameVarMap& outs, + static PreparedOp Prepare(const NameVarMap& ins, + const NameVarMap& outs, const framework::OperatorWithKernel& op, const platform::Place& place, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs); @@ -187,8 +187,8 @@ class PreparedOp { const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs); - void Run(const NameVarMap& ins, - const NameVarMap& outs, + void Run(const NameVarMap& ins, + const NameVarMap& outs, const framework::AttributeMap&
attrs, const framework::AttributeMap& default_attrs); @@ -270,26 +270,26 @@ void BuildDygraphPtenKernelContext( kernel_ctx->EmplaceBackInputWithoutSetRange(nullptr); auto end_idx = start_idx + 1; kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); - } else { - auto ins_vector = it->second; - size_t end_idx = start_idx + ins_vector.size(); - - for (size_t offset = 0; offset < ins_vector.size(); ++offset) { - const pten::TensorBase* tensor_in = nullptr; - auto& var = ins_vector[offset]->Var(); - if (var.template IsType()) { - tensor_in = &(var.template Get()); - } else if (var.template IsType()) { - tensor_in = &(var.template Get()); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported input `%s` type when call pt kernel.", - framework::ToTypeName(var.Type()))); - } - kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in); + continue; + } + auto ins_vector = it->second; + size_t end_idx = start_idx + ins_vector.size(); + + for (size_t offset = 0; offset < ins_vector.size(); ++offset) { + const pten::TensorBase* tensor_in = nullptr; + auto& var = ins_vector[offset]->Var(); + if (var.template IsType()) { + tensor_in = &(var.template Get()); + } else if (var.template IsType()) { + tensor_in = &(var.template Get()); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported input `%s` type when call pt kernel.", + framework::ToTypeName(var.Type()))); } - kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); + kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in); } + kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); } for (size_t i = 0; i < output_names.size(); ++i) { @@ -421,6 +421,8 @@ void BuildDygraphPtenKernelContext( kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(float, attr)); } else if (attr_defs[i].type_index == std::type_index(typeid(bool))) { kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(bool, attr)); + } else if (attr_defs[i].type_index == std::type_index(typeid(int64_t))) { + kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(int64_t, attr)); } else if (attr_defs[i].type_index == std::type_index(typeid(std::string))) { kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(std::string, attr)); @@ -466,8 +468,7 @@ void PreparePtenData(const pten::Kernel& pt_kernel, for (size_t i = 0; i < input_names.size(); ++i) { auto& in_def = input_defs.at(i); - auto it = ins.find(input_names[i]); - if (it == ins.end()) { + if (ins.find(input_names[i]) == ins.end()) { continue; } auto& ins_vector = ins.at(input_names[i]); diff --git a/paddle/fluid/imperative/tests/CMakeLists.txt b/paddle/fluid/imperative/tests/CMakeLists.txt index 56eb47a2ef1719d3aad9eb10a47a46d06d0866d5..774bb9653e2cba5c27f9037ee905e70175375339 100644 --- a/paddle/fluid/imperative/tests/CMakeLists.txt +++ b/paddle/fluid/imperative/tests/CMakeLists.txt @@ -12,7 +12,7 @@ else() endif(WIN32) -cc_test(test_gradient_accmulator SRCS test_gradient_accmulator.cc DEPS memcpy selected_rows_utils selected_rows_functor gradient_accumulator math_function) +cc_test(test_gradient_accmulator SRCS test_gradient_accmulator.cc DEPS memcpy selected_rows_utils selected_rows_functor gradient_accumulator math_function pten_tensor pten_api pten_api_utils) cc_test(test_layer SRCS test_layer.cc DEPS layer proto_desc operator op_registry variable_helper mul_op memcpy) cc_test(test_prepare_op SRCS test_prepare_op.cc DEPS prepared_operator op_info split_op layer concat_and_split activation_op place) cc_test(test_tracer SRCS test_tracer.cc DEPS tracer layer proto_desc operator op_registry 
variable_helper mul_op reduce_sum_op elementwise_add_op memcpy) diff --git a/paddle/fluid/imperative/tests/test_eager.cc b/paddle/fluid/imperative/tests/test_eager.cc index d34cb924d566322a4d37555a64281688ae8a116d..57a2149b23c1bef678bc262d1bb009ed6cfeb572 100644 --- a/paddle/fluid/imperative/tests/test_eager.cc +++ b/paddle/fluid/imperative/tests/test_eager.cc @@ -31,8 +31,8 @@ namespace paddle { namespace imperative { extern std::string LayerDebugString(const std::string& op_type, - const NameVarMap& ins, - const NameVarMap& outs); + const NameVarMap& ins, + const NameVarMap& outs); extern std::shared_ptr CreateGradOpNode( const framework::OperatorBase& op, const NameTensorMap& ins, @@ -41,20 +41,21 @@ extern std::shared_ptr CreateGradOpNode( const std::map& inplace_map); TEST(test_eager, eager_debug) { - std::shared_ptr x_in(new egr::EagerTensor("x_in")); - std::shared_ptr y_in(new egr::EagerTensor("y_in")); - std::shared_ptr vout(new egr::EagerTensor("vout")); - imperative::NameVarMap ins = {{"X", {x_in}}, {"Y", {y_in}}}; - imperative::NameVarMap outs = {{"Out", {vout}}}; + std::shared_ptr x_in(new egr::EagerVariable("x_in")); + std::shared_ptr y_in(new egr::EagerVariable("y_in")); + std::shared_ptr vout(new egr::EagerVariable("vout")); + imperative::NameVarMap ins = {{"X", {x_in}}, + {"Y", {y_in}}}; + imperative::NameVarMap outs = {{"Out", {vout}}}; LayerDebugString("mul", ins, outs); } TEST(test_create_node, eager_node) { auto op = framework::OpRegistry::CreateOp("mul", {}, {}, {}, false); framework::Scope scope; auto ctx = framework::RuntimeContext({}, {}); - imperative::NameVarMap ins = {{"X", {nullptr}}, - {"Y", {nullptr}}}; - imperative::NameVarMap outs = {{"Out", {nullptr}}}; + imperative::NameVarMap ins = {{"X", {nullptr}}, + {"Y", {nullptr}}}; + imperative::NameVarMap outs = {{"Out", {nullptr}}}; CreateGradOpNode((*op.get()), ins, outs, framework::AttributeMap{}, framework::AttributeMap{}, platform::CPUPlace(), {}); } @@ -72,26 +73,26 @@ TEST(test_var_helper, eager_var_helper) { ASSERT_ANY_THROW( InitializeVariable(&var8, paddle::framework::proto::VarType::FP64)); - auto egr_tensor = std::make_shared(); - auto egr_tensor2 = std::make_shared(); + auto egr_tensor = std::make_shared(); + auto egr_tensor2 = std::make_shared(); egr_tensor->MutableVar() ->GetMutable() ->mutable_value() ->mutable_data(platform::CPUPlace()); egr_tensor2->MutableVar()->GetMutable(); VLOG(6) << "egr_tensor create with "; - ASSERT_TRUE(platform::is_cpu_place(GetPlace(egr_tensor))); - ASSERT_TRUE(GetDataType(egr_tensor) == + ASSERT_TRUE(platform::is_cpu_place(GetPlace(egr_tensor))); + ASSERT_TRUE(GetDataType(egr_tensor) == framework::proto::VarType::FP32); - GetCachedValue( + GetCachedValue( egr_tensor, framework::OpKernelType(framework::proto::VarType::FP32, platform::CPUPlace())); - SetCachedValue( + SetCachedValue( egr_tensor, framework::OpKernelType(framework::proto::VarType::FP32, platform::CPUPlace()), egr_tensor2); - ASSERT_ANY_THROW(GetPlace(egr_tensor2)); - ASSERT_ANY_THROW(SetType( + ASSERT_ANY_THROW(GetPlace(egr_tensor2)); + ASSERT_ANY_THROW(SetType( egr_tensor, paddle::framework::proto::VarType::LOD_TENSOR_ARRAY)); } } // namespace imperative diff --git a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc index 584f8ead3d8de40ed296da9e2f99845b9e7e5d3c..4dfc8198064e376edf55df9b4c51031344f71485 100644 --- a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc +++ b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc @@ 
-29,6 +29,57 @@ namespace framework = paddle::framework; namespace paddle { namespace imperative { +TEST(Test__SelectedRowsMerge_Test, SelectedRowsMerge) { + pten::CPUPlace cpu; + + std::vector rows{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + int64_t table_size = 10; + int64_t embedding_width = 10; + + auto sr1 = std::make_shared(rows, table_size); + auto sr2 = std::make_shared(rows, table_size); + + // initialize a sparse table 1 + sr1->mutable_value()->Resize( + pten::framework::make_ddim({table_size, embedding_width})); + auto* data_sr1 = sr1->mutable_value()->mutable_data(cpu); + for (int64_t i = 0; i < table_size; ++i) { + for (int64_t j = 0; j < embedding_width; ++j) { + data_sr1[i * embedding_width + j] = static_cast(i); + } + } + + // initialize a sparse table 2 + sr2->mutable_value()->Resize( + pten::framework::make_ddim({table_size, embedding_width})); + auto* data_sr2 = sr2->mutable_value()->mutable_data(cpu); + for (int64_t i = 0; i < table_size; ++i) { + for (int64_t j = 0; j < embedding_width; ++j) { + data_sr2[i * embedding_width + j] = static_cast(i); + } + } + // new 2 pten::Tensor + paddle::experimental::Tensor t1(sr1); + paddle::experimental::Tensor t2(sr2); + + // call SelectedRowsMerge + auto new_buffer = + paddle::imperative::SelectedRowsMerge(t1, + t2); + auto* new_buffer_tensor = + static_cast(new_buffer->impl().get()); + auto* new_buffer_data_sr1 = + new_buffer_tensor->mutable_value()->mutable_data(cpu); + + // verify the MergeAdd result + for (int64_t i = 0; i < table_size; ++i) { + for (int64_t j = 0; j < embedding_width; ++j) { + EXPECT_EQ(new_buffer_data_sr1[i * embedding_width + j], + (static_cast(i) + static_cast(i))); + } + } +} + template int TensorddTest(Place1 place1, Place2 place2, T t1, T t2) { framework::Variable var1; diff --git a/paddle/fluid/imperative/tests/test_hooks.cc b/paddle/fluid/imperative/tests/test_hooks.cc index 3a0bb7c52bfe2eabb9e769cfd6c8d436df4a87e3..c99dbf1cf6258dd3bb1fbdd753b37adfb2736f14 100644 --- a/paddle/fluid/imperative/tests/test_hooks.cc +++ b/paddle/fluid/imperative/tests/test_hooks.cc @@ -265,5 +265,5 @@ TEST(TestHooks, TestGradVarLeafBackwardHookWithSortedGradAccmulated) { USE_OP(mul); USE_OP(mul_grad); -USE_OP(elementwise_add); -USE_OP(elementwise_add_grad); +USE_OP_ITSELF(elementwise_add); +USE_OP_ITSELF(elementwise_add_grad); diff --git a/paddle/fluid/imperative/tests/test_layer.cc b/paddle/fluid/imperative/tests/test_layer.cc index bcd4e62e57c270c5af0e6f5632fdc5f4f803fb29..224b8228097c475bac5bb1c62d126699d975ae66 100644 --- a/paddle/fluid/imperative/tests/test_layer.cc +++ b/paddle/fluid/imperative/tests/test_layer.cc @@ -39,6 +39,8 @@ using vb_vector = std::vector>; using var_pair = std::pair; +extern void TestSetForwardDataTypeOfGradVarsEager( + const NameVarMap& outs); template class TestRuntimeInferVarTypeContext : public RuntimeInferVarTypeContext { @@ -406,6 +408,11 @@ TEST(test_layer, test_inner_op_not_inited) { ASSERT_THROW(op.CheckAttrs(), platform::EnforceNotMet); } +TEST(test_layer, test_eager) { + imperative::NameTensorMap ins = {}; + TestSetForwardDataTypeOfGradVarsEager(ins); +} + } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/tests/test_prepare_op.cc b/paddle/fluid/imperative/tests/test_prepare_op.cc index fa52aa6d0af61578e18d51e8b95c13b5d383c858..a440a1f486a0c75f299a7692b61b87d393780eb6 100644 --- a/paddle/fluid/imperative/tests/test_prepare_op.cc +++ b/paddle/fluid/imperative/tests/test_prepare_op.cc @@ -32,6 +32,9 @@ namespace framework = paddle::framework; namespace paddle { 
namespace imperative { +extern void TestHandleComplexGradToRealGradEager( + const NameVarMap& outs); + static framework::VariableNameMap CreateVarNameMap( const framework::OpInfo& op_info, const std::string& op_type, const NameVarBaseMap& varbase_map, bool is_input) { @@ -209,6 +212,11 @@ TEST(test_prepare_op, test_prepare_data_same_place) { TestPrepareDataSamePlace({}); } +TEST(test_prepare_op, test_complex_eager) { + NameVarMap outs = {}; + TestHandleComplexGradToRealGradEager(outs); +} + #ifdef PADDLE_WITH_MKLDNN TEST(test_prepare_op, test_prepare_data_cpu_mkldnn) { TestPrepareDataSamePlace({{"use_mkldnn", true}}); diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc index ff3331be56c3abe886496df95039c85073ed4777..ccce360269153ba2e8c6586b934f6a9bf6ace819 100644 --- a/paddle/fluid/imperative/tests/test_tracer.cc +++ b/paddle/fluid/imperative/tests/test_tracer.cc @@ -37,9 +37,10 @@ namespace paddle { namespace imperative { using vb_vector = std::vector>; - using var_pair = std::pair; +using ev_vector = std::vector>; +using ev_pair = std::pair; TEST(test_tracer, test_trace_op) { // Doing an mul imperative::Tracer tracer; @@ -546,6 +547,44 @@ TEST(test_tracer, test_execution_context) { ASSERT_EQ(dy_ctx.OutputName("Out"), framework::kEmptyVarName); } +TEST(test_tracer, eager_tracer) { + // Doing an mul + imperative::Tracer tracer; + std::shared_ptr x_in(new egr::EagerVariable("x_in")); + std::shared_ptr y_in(new egr::EagerVariable("y_in")); + std::shared_ptr vout(new egr::EagerVariable("vout")); + platform::CPUPlace place; + std::vector src_data(10, 2.0); + std::vector dims1 = {2, 5}; + std::vector dims2 = {5, 2}; + + auto* x_in_tensor = x_in->MutableVar()->GetMutable(); + auto* y_in_tensor = y_in->MutableVar()->GetMutable(); + x_in_tensor->Resize(framework::make_ddim(dims1)); + auto* mutable_x = x_in_tensor->mutable_data(place); + paddle::memory::Copy(place, mutable_x, place, src_data.data(), + sizeof(float) * src_data.size()); + y_in_tensor->Resize(framework::make_ddim(dims2)); + auto* mutable_y = y_in_tensor->mutable_data(place); + paddle::memory::Copy(place, mutable_y, place, src_data.data(), + sizeof(float) * src_data.size()); + + ev_pair x_pair = ev_pair("X", ev_vector(1, x_in)); + ev_pair y_pair = ev_pair("Y", ev_vector(1, y_in)); + ev_pair out_pair = ev_pair("Out", ev_vector(1, vout)); + imperative::NameTensorMap ins = {x_pair, y_pair}; + imperative::NameTensorMap outs = {out_pair}; + framework::AttributeMap mul_attr_map; + mul_attr_map["use_mkldnn"] = false; + tracer.TraceOp("mul", ins, outs, mul_attr_map, place, + true); + + const auto& out_tensor = vout->Var().Get(); + for (int i = 0; i < vout->Var().Get().numel(); i++) { + ASSERT_EQ(out_tensor.data()[i], 20.0); + } +} + } // namespace imperative } // namespace paddle @@ -553,4 +592,4 @@ USE_OP(mul); USE_OP(mul_grad); USE_OP(reduce_sum); USE_OP(reduce_sum_grad); -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 81cd39c225b533d742d9eb399c8c87863a6572e5..a600720ef78edb5175bb7d17821f5d8e229d1a93 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -20,6 +20,7 @@ #include "paddle/fluid/imperative/amp_auto_cast.h" #include "paddle/fluid/imperative/op_base.h" #include "paddle/fluid/platform/denormal.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/string_helper.h" @@ 
-138,6 +139,17 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't use MLU device since it's not compiled with MLU," "Please recompile or reinstall Paddle with MLU support.")); +#endif + } else if (platform::is_custom_place(place)) { +#if defined(PADDLE_WITH_CUSTOM_DEVICE) + gc.reset(new framework::CustomDefaultStreamGarbageCollector(place, 0)); + VLOG(10) << "Created GarbageCollector at " << place; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't use CustomDevice since it's not compiled with " + "CustomDevice," + "Please recompile or reinstall Paddle with CustomDevice " + "support.")); #endif } else { PADDLE_THROW(platform::errors::PreconditionNotMet( @@ -156,7 +168,7 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, const platform::Place& place, bool trace_backward, const std::map& inplace_map, paddle::framework::AttributeMap* passed_default_attrs_, - bool override_default_attr_map) { + bool use_default_attr_map) { platform::RecordEvent op_type_record_event(type); platform::ScopedFlushDenormal flush; VLOG(1) << "Trace Op: " << type; @@ -222,9 +234,17 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with MLU if use MLUPlace.")); +#endif + } else if (platform::is_custom_place(place)) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + platform::DeviceManager::SetDevice(place); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with CustomDevice if use " + "CustomPlace.")); #endif } - if (!override_default_attr_map) { + if (!use_default_attr_map) { PADDLE_ENFORCE_NOT_NULL(passed_default_attrs_, paddle::platform::errors::PermissionDenied( "Detected default_attrs = nullptr.")); @@ -260,16 +280,14 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, } if (ComputeRequiredGrad(new_ins, outs, trace_backward)) { - if (!override_default_attr_map) { - PADDLE_ENFORCE_NOT_NULL(passed_default_attrs_, - paddle::platform::errors::PermissionDenied( - "Detected default_attrs = nullptr.")); - CreateGradOpNode(*op, new_ins, outs, attrs, *passed_default_attrs_, place, - inplace_map); - } else { - CreateGradOpNode(*op, new_ins, outs, attrs, default_attrs, place, - inplace_map); - } + PADDLE_ENFORCE_EQ( + passed_default_attrs_, nullptr, + paddle::platform::errors::PermissionDenied( + "We expect passed_default_attrs_ is nullptr while " + "use_default_attr_map is true, however we got not null " + "passed_default_attrs_. Please check your usage of trace_op. 
")); + CreateGradOpNode(*op, new_ins, outs, attrs, default_attrs, place, + inplace_map); } else { VLOG(3) << "No Grad to track for Op: " << type; } @@ -281,16 +299,14 @@ template void Tracer::TraceOp( const NameVarMap& outs, framework::AttributeMap attrs, const platform::Place& place, bool trace_backward, const std::map& inplace_map, - paddle::framework::AttributeMap* default_attrs, - bool override_default_attr_map); + paddle::framework::AttributeMap* default_attrs, bool use_default_attr_map); -template void Tracer::TraceOp( - const std::string& type, const NameVarMap& ins, - const NameVarMap& outs, framework::AttributeMap attrs, +template void Tracer::TraceOp( + const std::string& type, const NameVarMap& ins, + const NameVarMap& outs, framework::AttributeMap attrs, const platform::Place& place, bool trace_backward, const std::map& inplace_map_, - paddle::framework::AttributeMap* default_attrs, - bool override_default_attr_map); + paddle::framework::AttributeMap* default_attrs, bool use_default_attr_map); void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, const NameVarBaseMap& outs, framework::AttributeMap attrs, @@ -304,13 +320,12 @@ void Tracer::TraceOp(const std::string& type, const NameTensorMap& ins, paddle::framework::AttributeMap attrs, const paddle::platform::Place& place, paddle::framework::AttributeMap* default_attrs, - bool override_default_attr_map, + bool use_default_attr_map, const std::map& inplace_map) { - VLOG(6) << "Running On Eager TraceOp with override_default_attr_map: " - << override_default_attr_map; - TraceOp(type, ins, outs, std::move(attrs), place, false, - inplace_map, default_attrs, - override_default_attr_map); + VLOG(6) << "Running On Eager TraceOp with use_default_attr_map: " + << use_default_attr_map; + TraceOp(type, ins, outs, std::move(attrs), place, false, + inplace_map, default_attrs, use_default_attr_map); } void Tracer::TraceOp(const std::string& type, const NameTensorMap& ins, @@ -318,8 +333,9 @@ void Tracer::TraceOp(const std::string& type, const NameTensorMap& ins, paddle::framework::AttributeMap attrs, const std::map& inplace_map) { VLOG(6) << "Running On Eager TraceOp(less): "; - TraceOp(type, ins, outs, std::move(attrs), expected_place_, - false, inplace_map, nullptr, true); + TraceOp(type, ins, outs, std::move(attrs), + expected_place_, false, inplace_map, nullptr, + true); } void Tracer::SetExpectedPlace(platform::Place place) { diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 4e406a9482da0da456ad43046e48b97232dff885..3a9a1b630ce9cbc89f57b746e6e1e1445f6bd318 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -69,7 +69,7 @@ class Tracer { const platform::Place& place, bool trace_backward, const std::map& inplace_map = {}, paddle::framework::AttributeMap* passed_default_attrs_ = nullptr, - bool override_default_attr_map = true); + bool use_default_attr_map = true); void TraceOp(const std::string& type, const NameVarBaseMap& ins, const NameVarBaseMap& outs, framework::AttributeMap attrs, @@ -83,7 +83,7 @@ class Tracer { const NameTensorMap& outs, paddle::framework::AttributeMap attrs, const paddle::platform::Place& place, paddle::framework::AttributeMap* default_attrs, - bool override_default_attr_map, + bool use_default_attr_map, const std::map& inplace_map = {}); bool ComputeRequiredGrad(const NameVarBaseMap& ins, diff --git a/paddle/fluid/imperative/var_helper.cc b/paddle/fluid/imperative/var_helper.cc index 
3548f2eeafd24126b50329246dd85f2f0e47878b..d97f7c1ee19b33e75b11d8f7541e638c93d152f0 100644 --- a/paddle/fluid/imperative/var_helper.cc +++ b/paddle/fluid/imperative/var_helper.cc @@ -95,8 +95,8 @@ template const paddle::platform::Place &GetPlace( const std::shared_ptr &var); template const paddle::platform::Place &GetPlace( const std::shared_ptr &var); -template const paddle::platform::Place &GetPlace( - const std::shared_ptr &var); +template const paddle::platform::Place &GetPlace( + const std::shared_ptr &var); /* GetNameFromVar */ template @@ -104,8 +104,8 @@ const std::string &GetNameFromVar(std::shared_ptr var) { return var->Name(); } template <> -const std::string &GetNameFromVar( - std::shared_ptr tensor) { +const std::string &GetNameFromVar( + std::shared_ptr tensor) { return tensor->name(); } template const std::string &GetNameFromVar( @@ -120,8 +120,8 @@ void SetType(std::shared_ptr var, var->SetType(type); } template <> -void SetType(std::shared_ptr var, - framework::proto::VarType::Type type) { +void SetType(std::shared_ptr var, + framework::proto::VarType::Type type) { switch (type) { case paddle::framework::proto::VarType::LOD_TENSOR: { var->MutableVar()->GetMutable(); @@ -149,8 +149,8 @@ framework::proto::VarType::Type GetType(std::shared_ptr var) { return var->Type(); } template <> -framework::proto::VarType::Type GetType( - std::shared_ptr var) { +framework::proto::VarType::Type GetType( + std::shared_ptr var) { if (var->Var().IsInitialized()) { return paddle::framework::ToVarType(var->Var().Type()); } else { @@ -168,8 +168,8 @@ framework::proto::VarType::Type GetDataType(std::shared_ptr var) { return var->DataType(); } template <> -framework::proto::VarType::Type GetDataType( - std::shared_ptr var) { +framework::proto::VarType::Type GetDataType( + std::shared_ptr var) { if (var->Var().IsType()) { return framework::TransToProtoVarType( var->Var().Get().value().type()); @@ -197,8 +197,8 @@ bool CheckCachedKey(std::shared_ptr var, return GetVariableWrapper(var)->hasCacheKey(key); } template <> -bool CheckCachedKey( - std::shared_ptr tensor, +bool CheckCachedKey( + std::shared_ptr tensor, const paddle::framework::OpKernelType &key) { // TODO(jiabin): Support this later // VLOG(10) << "CheckCachedKey with tensor: " << tensor->name() << "and key is @@ -219,7 +219,7 @@ std::shared_ptr GetCachedValue( } template <> std::shared_ptr GetCachedValue( - std::shared_ptr var, + std::shared_ptr var, const paddle::framework::OpKernelType &key) { // TODO(jiabin): Support this later // PADDLE_THROW(platform::errors::Fatal("In eager mode program should not @@ -243,10 +243,10 @@ void SetCachedValue(std::shared_ptr var, GetVariableWrapper(var)->setCacheValue(key, GetVariableWrapper(res)); } template <> -void SetCachedValue( - std::shared_ptr tensor, +void SetCachedValue( + std::shared_ptr tensor, const paddle::framework::OpKernelType &key, - std::shared_ptr res) { + std::shared_ptr res) { // PADDLE_THROW(platform::errors::Fatal("In eager mode program should not // reach this, support cache and remove this error check later, or this // should not be supported.")); diff --git a/paddle/fluid/imperative/var_helper.h b/paddle/fluid/imperative/var_helper.h index ff228e0ab84e2aec8d3d399bc1e5ba9cb14b42c2..cbcc1a9f99daaa16d0dfc5c79f610434dd4e33a5 100644 --- a/paddle/fluid/imperative/var_helper.h +++ b/paddle/fluid/imperative/var_helper.h @@ -18,7 +18,7 @@ #include "paddle/fluid/framework/variable.h" namespace egr { -class EagerTensor; +class EagerVariable; } // namespace egr namespace pten { class 
DenseTensor; diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 904baebcb0be70b0d557a9431d1e8b969f0d74a2..e4fc52b6fa74427b1f24b194dffea6f39e2b4692 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -379,8 +379,10 @@ void TensorRtSubgraphPass::CreateTensorRTOp( trt_engine->SetUseInspector(Get("use_inspector")); trt_engine->SetWithErnie( - graph->Has(framework::ir::kEmbEltwiseLayernormPass) && - graph->Has(framework::ir::kMultiheadMatmulPass)); + (graph->Has(framework::ir::kEmbEltwiseLayernormPass) && + graph->Has(framework::ir::kMultiheadMatmulPass)) || + (graph->Has(framework::ir::kPrelnEmbEltwiseLayernormPass) && + graph->Has(framework::ir::kMultiheadMatmulPass))); if (use_static_engine) { trt_engine_serialized_data = GetTrtEngineSerializedData( diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index caac973d8b89a3ff1c605d81cb07bbdcb7a63304..7e4da57e9e7dfce3051d42183a8e89ebd04bd8f0 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1470,6 +1470,8 @@ USE_TRT_CONVERTER(conv3d_transpose); USE_TRT_CONVERTER(mish); USE_TRT_CONVERTER(deformable_conv); USE_TRT_CONVERTER(pool3d) +USE_TRT_CONVERTER(fused_preln_embedding_eltwise_layernorm) +USE_TRT_CONVERTER(preln_skip_layernorm) #endif namespace paddle_infer { diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 66b27b2903a70193f347d635ce7f863f8aa29b52..313e1f2faea553809cb6fce66ca9a751bace8d75 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -82,22 +82,24 @@ const std::vector kTRTSubgraphPasses({ "quant_conv2d_dequant_fuse_pass", // "delete_quant_dequant_op_pass", // "delete_quant_dequant_filter_op_pass", // - // "fc_fuse_pass", // - "simplify_with_basic_ops_pass", // - "embedding_eltwise_layernorm_fuse_pass", // - "multihead_matmul_fuse_pass_v2", // - "multihead_matmul_fuse_pass_v3", // - "skip_layernorm_fuse_pass", // - "conv_bn_fuse_pass", // - "unsqueeze2_eltwise_fuse_pass", // - "trt_squeeze2_matmul_fuse_pass", // - "trt_reshape2_matmul_fuse_pass", // - "trt_flatten2_matmul_fuse_pass", // - "trt_map_matmul_v2_to_mul_pass", // - "trt_map_matmul_v2_to_matmul_pass", // - "trt_map_matmul_to_mul_pass", // - "fc_fuse_pass", // - "conv_elementwise_add_fuse_pass", // + // "fc_fuse_pass", // + "simplify_with_basic_ops_pass", // + "embedding_eltwise_layernorm_fuse_pass", // + "preln_embedding_eltwise_layernorm_fuse_pass", // + "multihead_matmul_fuse_pass_v2", // + "multihead_matmul_fuse_pass_v3", // + "skip_layernorm_fuse_pass", // + "preln_skip_layernorm_fuse_pass", // + "conv_bn_fuse_pass", // + "unsqueeze2_eltwise_fuse_pass", // + "trt_squeeze2_matmul_fuse_pass", // + "trt_reshape2_matmul_fuse_pass", // + "trt_flatten2_matmul_fuse_pass", // + "trt_map_matmul_v2_to_mul_pass", // + "trt_map_matmul_v2_to_matmul_pass", // + "trt_map_matmul_to_mul_pass", // + "fc_fuse_pass", // + "conv_elementwise_add_fuse_pass", // "add_support_int8_pass", "tensorrt_subgraph_pass", // "conv_bn_fuse_pass", // diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 
017caca6adc814af32d6045ce0510099c5935ed8..e91faedb06872a5abe38c1de77b54477e0da8ef4 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -21,6 +21,8 @@ nv_library(tensorrt_converter nearest_interp_v2_op.cc pool3d_op.cc deformable_conv_op.cc + preln_emb_eltwise_layernorm.cc + preln_skip_layernorm.cc DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS diff --git a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc index 0436499cd40756150d5b33c6d685d74ffbe5b87d..3e326414825d09d8611d5c845975ef31cf5c83ce 100644 --- a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc @@ -43,30 +43,161 @@ class GeluOpConverter : public OpConverter { void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { VLOG(4) << "convert fluid gelu op to tensorrt gelu layer"; - framework::OpDesc op_desc(op, nullptr); // Declare inputs - int input_num = op_desc.Input("X").size(); auto* input = engine_->GetITensor(op_desc.Input("X")[0]); nvinfer1::ILayer* layer = nullptr; - if (engine_->with_dynamic_shape()) { -#if IS_TRT_VERSION_GE(6000) - bool with_fp16 = - engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); - plugin::GeluPluginDynamic* plugin = - new plugin::GeluPluginDynamic(with_fp16); - layer = engine_->AddDynamicPlugin(&input, input_num, plugin); + if (op_desc.HasAttr("approximate") && + BOOST_GET_CONST(bool, op_desc.GetAttr("approximate"))) { +#if IS_TRT_VERSION_GE(7000) + nvinfer1::Dims input_shape; + input_shape.nbDims = input->getDimensions().nbDims; + for (int i = 0; i < input_shape.nbDims; ++i) { + input_shape.d[i] = 1; + } + std::string out_name = op_desc.Output("Out").front(); + auto create_weights = [&](float data, std::string type) -> float* { + std::unique_ptr tmp_tensor(new framework::Tensor()); + tmp_tensor->Resize({1}); + auto* tmp_data = tmp_tensor->mutable_data(platform::CPUPlace()); + tmp_data[0] = data; + engine_->SetWeights(out_name + "_gelu_op_" + type, + std::move(tmp_tensor)); + return tmp_data; + }; + float* constant_pow = create_weights(3.0f, "constant_pow"); + float* constant_multiply = create_weights(0.044715f, "constant_multiply"); + float* constant_sqrt = + create_weights(0.79788456080286535587989211986876f, "constant_sqrt"); + float* constant_one = create_weights(1.0f, "constant_one"); + float* constant_half = create_weights(0.5f, "constant_half"); + auto constant_layer_pow = TRT_ENGINE_ADD_LAYER( + engine_, Constant, input_shape, + nvinfer1::Weights{nvinfer1::DataType::kFLOAT, + static_cast(constant_pow), 1}); + auto constant_layer_multiply = TRT_ENGINE_ADD_LAYER( + engine_, Constant, input_shape, + nvinfer1::Weights{nvinfer1::DataType::kFLOAT, + static_cast(constant_multiply), 1}); + auto constant_layer_sqrt = TRT_ENGINE_ADD_LAYER( + engine_, Constant, input_shape, + nvinfer1::Weights{nvinfer1::DataType::kFLOAT, + static_cast(constant_sqrt), 1}); + auto constant_layer_one = TRT_ENGINE_ADD_LAYER( + engine_, Constant, input_shape, + nvinfer1::Weights{nvinfer1::DataType::kFLOAT, + static_cast(constant_one), 1}); + auto constant_layer_half = TRT_ENGINE_ADD_LAYER( + engine_, Constant, input_shape, + nvinfer1::Weights{nvinfer1::DataType::kFLOAT, + static_cast(constant_half), 1}); + auto layer_pow = TRT_ENGINE_ADD_LAYER( + engine_, ElementWise, *input, 
*constant_layer_pow->getOutput(0), + nvinfer1::ElementWiseOperation::kPOW); + auto layer_mul = + TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *layer_pow->getOutput(0), + *constant_layer_multiply->getOutput(0), + nvinfer1::ElementWiseOperation::kPROD); + auto layer_add = + TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *layer_mul->getOutput(0), + *input, nvinfer1::ElementWiseOperation::kSUM); + auto layer_sqrt = + TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *layer_add->getOutput(0), + *constant_layer_sqrt->getOutput(0), + nvinfer1::ElementWiseOperation::kPROD); + auto layer_tanh = + TRT_ENGINE_ADD_LAYER(engine_, Activation, *layer_sqrt->getOutput(0), + nvinfer1::ActivationType::kTANH); + auto layer_one = + TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *layer_tanh->getOutput(0), + *constant_layer_one->getOutput(0), + nvinfer1::ElementWiseOperation::kSUM); + auto layer_CDF = + TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *layer_one->getOutput(0), + *constant_layer_half->getOutput(0), + nvinfer1::ElementWiseOperation::kPROD); + auto y = + TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *layer_CDF->getOutput(0), + *input, nvinfer1::ElementWiseOperation::kPROD); + layer = y; #else PADDLE_THROW(platform::errors::Fatal( - "You are running the TRT Dynamic Shape mode, need to confirm that " - "your TRT version is no less than 6.0")); + "You are running GeLU Op with approximate True, need to confirm that " + "your TRT version is no less than 7.0")); #endif } else { - bool with_fp16 = - engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); - plugin::GeluPlugin* plugin = new plugin::GeluPlugin(with_fp16); - layer = engine_->AddPlugin(&input, input_num, plugin); +#if IS_TRT_VERSION_GE(7000) + nvinfer1::Dims input_shape; + input_shape.nbDims = input->getDimensions().nbDims; + for (int i = 0; i < input_shape.nbDims; ++i) { + input_shape.d[i] = 1; + } + std::string out_name = op_desc.Output("Out").front(); + auto create_weights = [&](float data, std::string type) -> float* { + std::unique_ptr tmp_tensor(new framework::Tensor()); + tmp_tensor->Resize({1}); + auto* tmp_data = tmp_tensor->mutable_data(platform::CPUPlace()); + tmp_data[0] = data; + engine_->SetWeights(out_name + "_gelu_op_" + type, + std::move(tmp_tensor)); + return tmp_data; + }; + float* constant_one = create_weights(1.0f, "constant_one"); + float* constant_half = create_weights(0.5f, "constant_half"); + float* constant_rsqrt2 = + create_weights(0.70710678118f, "constant_rsqrt2"); + auto constant_layer_one = TRT_ENGINE_ADD_LAYER( + engine_, Constant, input_shape, + nvinfer1::Weights{nvinfer1::DataType::kFLOAT, + static_cast(constant_one), 1}); + auto constant_layer_half = TRT_ENGINE_ADD_LAYER( + engine_, Constant, input_shape, + nvinfer1::Weights{nvinfer1::DataType::kFLOAT, + static_cast(constant_half), 1}); + auto constant_layer_rsqrt2 = TRT_ENGINE_ADD_LAYER( + engine_, Constant, input_shape, + nvinfer1::Weights{nvinfer1::DataType::kFLOAT, + static_cast(constant_rsqrt2), 1}); + auto layer_mul = TRT_ENGINE_ADD_LAYER( + engine_, ElementWise, *input, *constant_layer_rsqrt2->getOutput(0), + nvinfer1::ElementWiseOperation::kPROD); + auto layer_erf = + TRT_ENGINE_ADD_LAYER(engine_, Unary, *layer_mul->getOutput(0), + nvinfer1::UnaryOperation::kERF); + auto layer_add = + TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *layer_erf->getOutput(0), + *constant_layer_one->getOutput(0), + nvinfer1::ElementWiseOperation::kSUM); + auto layer_CDF = + TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *layer_add->getOutput(0), + *constant_layer_half->getOutput(0), + 
nvinfer1::ElementWiseOperation::kPROD); + auto y = + TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *layer_CDF->getOutput(0), + *input, nvinfer1::ElementWiseOperation::kPROD); + layer = y; +#else // if IS_TRT_VERSION_GE(7000) + int input_num = op_desc.Input("X").size(); + if (engine_->with_dynamic_shape()) { +#if IS_TRT_VERSION_GE(6000) + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + plugin::GeluPluginDynamic* plugin = + new plugin::GeluPluginDynamic(with_fp16); + layer = engine_->AddDynamicPlugin(&input, input_num, plugin); +#else + PADDLE_THROW(platform::errors::Fatal( + "You are running the TRT Dynamic Shape mode, need to confirm that " + "your TRT version is no less than 6.0")); +#endif + } else { + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + plugin::GeluPlugin* plugin = new plugin::GeluPlugin(with_fp16); + layer = engine_->AddPlugin(&input, input_num, plugin); + } +#endif // if IS_TRT_VERSION_GE(7000) } auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput(layer, "gelu", {output_name}, test_mode); diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index 26d87e4832f5f194ca88be41596d34c3226b0390..fe04d552e40263a396059e3da59de4d51def67e0 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -106,6 +106,9 @@ class Pool2dOpConverter : public OpConverter { reduce_operation = nvinfer1::ReduceOperation::kAVG; plugin_pool_type = plugin::PoolPlugin::PoolType::avg; } + if (global_pooling || adaptive) { + std::fill(paddings.begin(), paddings.end(), 0); + } if (padding_algorithm == "VALID") { std::fill(paddings.begin(), paddings.end(), 0); @@ -136,6 +139,46 @@ class Pool2dOpConverter : public OpConverter { #endif } + std::vector real_paddings = paddings; + for (int i = 0; i < 2; ++i) { + int copy_pad = *(paddings.begin() + i); + real_paddings.insert(real_paddings.begin() + 2 * i + 1, copy_pad); + } + // SAME + if (padding_algorithm == "SAME") { + // expand + for (int i = 0; i < 2; ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + // compute + for (int i = 0; i < 2; ++i) { + int out_size = (input_shape.d[2 + i] + strides[i] - 1) / strides[i]; + int pad_sum = std::max( + (out_size - 1) * strides[i] + ksize[i] - input_shape.d[2 + i], 0); + int pad_0 = pad_sum / 2; + int pad_1 = pad_sum - pad_0; + paddings[i * 2] = pad_0; + paddings[i * 2 + 1] = pad_1; + } + real_paddings = paddings; + // slice + for (int i = 0; i < 2; ++i) { + paddings.erase(paddings.begin() + i + 1); + } + } + // VALID + if (padding_algorithm == "VALID") { + std::fill(real_paddings.begin(), real_paddings.end(), 0); + } + + if (global_pooling == true && !engine_->with_dynamic_shape()) { + nv_ksize.d[0] = input_shape.d[input_dims - 2]; + nv_ksize.d[1] = input_shape.d[input_dims - 1]; + ksize[0] = input_shape.d[input_dims - 2]; + ksize[1] = input_shape.d[input_dims - 1]; + } + if (engine_->with_dynamic_shape()) { if (!adaptive && !global_pooling && !ceil_mode) { // input_shape.d < 0 means we can't get shape info here. 
@@ -173,15 +216,15 @@ class Pool2dOpConverter : public OpConverter { pool_layer->setPaddingMode(nvinfer1::PaddingMode::kEXPLICIT_ROUND_UP); } layer = pool_layer; - } else if (global_pooling) { + } else if (global_pooling && !adaptive) { auto *reduce_layer = TRT_ENGINE_ADD_LAYER(engine_, Reduce, *input1, reduce_operation, 12, true); layer = reduce_layer; } else { #if IS_TRT_VERSION_GE(6000) - plugin::PoolPluginDynamic *plugin = - new plugin::PoolPluginDynamic(ceil_mode, pool_type, adaptive, ksize, - strides, paddings, global_pooling); + plugin::PoolPluginDynamic *plugin = new plugin::PoolPluginDynamic( + ceil_mode, pool_type, adaptive, exclusive, ksize, strides, paddings, + global_pooling); layer = engine_->AddDynamicPlugin(&input1, 1, plugin); #endif } @@ -195,21 +238,13 @@ class Pool2dOpConverter : public OpConverter { return; } - if (global_pooling == true) { - nv_ksize.d[0] = input_shape.d[input_dims - 2]; - nv_ksize.d[1] = input_shape.d[input_dims - 1]; + if (global_pooling == true && adaptive == false) { auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *input1, nv_pool_type, nv_ksize); PADDLE_ENFORCE_NOT_NULL( pool_layer, platform::errors::Fatal( "trt pool layer in converter could not be created.")); auto output_name = op_desc.Output("Out")[0]; - pool_layer->setStride(nv_strides); - pool_layer->setPadding(nv_paddings); - if (padding_algorithm == "SAME") { - pool_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); - } - pool_layer->setAverageCountExcludesPadding(exclusive); pool_layer->setName(("pool2d (Output: " + output_name + ")").c_str()); pool_layer->getOutput(0)->setName(output_name.c_str()); engine_->SetITensor(output_name, pool_layer->getOutput(0)); @@ -222,58 +257,61 @@ class Pool2dOpConverter : public OpConverter { if (!adaptive) { if (ceil_mode) { - nvinfer1::DimsHW pre_pad(0, 0); - nvinfer1::DimsHW post_pad(0, 0); - // If ceil mode is true, we will pad the appropriate size to the input. - DealCeilMode(input_shape, ksize, strides, paddings, &pre_pad, &post_pad, - input_dims); - auto *pad_layer = - TRT_ENGINE_ADD_LAYER(engine_, Padding, *input1, pre_pad, post_pad); - + std::vector input_shape_v; + for (int i = 0; i < input_dims; i++) { + input_shape_v.push_back(input_shape.d[i]); + } + plugin::PoolPlugin *plugin = new plugin::PoolPlugin( + ceil_mode, plugin_pool_type, adaptive, exclusive, ksize, strides, + paddings, input_shape_v, real_paddings); + auto *pool_layer = engine_->AddPlugin(&input1, 1, plugin); PADDLE_ENFORCE_NOT_NULL( - pad_layer, platform::errors::Fatal( - "Pad layer in poolOp converter could not be " - "created. The pointer to pad layer is `NULL`.")); - input1 = pad_layer->getOutput(0); - } + pool_layer, + platform::errors::Fatal( + "trt pool plugin layer in converter could not be created.")); + layer = pool_layer; + } else { #if IS_TRT_VERSION_GE(8000) - // Exclude padding pixels from the average mean is not supported well by - // TRT - // so enable padding for trt8.0 above. - if ((g_post_pad.w() > 0 || g_post_pad.h() > 0) && - (padding_algorithm != "SAME") && !ceil_mode) { - auto *pad_layer = TRT_ENGINE_ADD_LAYER(engine_, Padding, *input1, - g_pre_pad, g_post_pad); - PADDLE_ENFORCE_NOT_NULL( - pad_layer, platform::errors::Fatal( - "Pad layer in poolOp converter could not be " - "created. The pointer to pad layer is `NULL`.")); - input1 = pad_layer->getOutput(0); - } + // Exclude padding pixels from the average mean is not supported well by + // TRT + // so enable padding for trt8.0 above. 
+ if ((g_post_pad.w() > 0 || g_post_pad.h() > 0) && + (padding_algorithm != "SAME") && !ceil_mode) { + auto *pad_layer = TRT_ENGINE_ADD_LAYER(engine_, Padding, *input1, + g_pre_pad, g_post_pad); + PADDLE_ENFORCE_NOT_NULL( + pad_layer, platform::errors::Fatal( + "Pad layer in poolOp converter could not be " + "created. The pointer to pad layer is `NULL`.")); + input1 = pad_layer->getOutput(0); + } #endif - auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *input1, - nv_pool_type, nv_ksize); - PADDLE_ENFORCE_NOT_NULL( - pool_layer, platform::errors::Fatal( - "trt pool layer in converter could not be created.")); - pool_layer->setStride(nv_strides); - pool_layer->setPadding(nv_paddings); - if (padding_algorithm == "SAME") { - pool_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); + auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *input1, + nv_pool_type, nv_ksize); + PADDLE_ENFORCE_NOT_NULL( + pool_layer, + platform::errors::Fatal( + "trt pool layer in converter could not be created.")); + pool_layer->setStride(nv_strides); + pool_layer->setPadding(nv_paddings); + if (padding_algorithm == "SAME") { + pool_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); + } + pool_layer->setAverageCountExcludesPadding(exclusive); + layer = pool_layer; } - pool_layer->setAverageCountExcludesPadding(exclusive); - layer = pool_layer; + } else { // Average pooling needs to exclude the padding pixels from the average // mean. - // It is not supported well by TRT, we use a plugin here. + // It is not supported well by TRT, we use a plugin here std::vector input_shape_v; for (int i = 0; i < input_dims; i++) { input_shape_v.push_back(input_shape.d[i]); } - plugin::PoolPlugin *plugin = - new plugin::PoolPlugin(ceil_mode, plugin_pool_type, adaptive, ksize, - strides, paddings, input_shape_v); + plugin::PoolPlugin *plugin = new plugin::PoolPlugin( + ceil_mode, plugin_pool_type, adaptive, exclusive, ksize, strides, + paddings, input_shape_v, real_paddings); auto *pool_layer = engine_->AddPlugin(&input1, 1, plugin); PADDLE_ENFORCE_NOT_NULL( pool_layer, diff --git a/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc new file mode 100644 index 0000000000000000000000000000000000000000..50f90de85fd0494110b86dde743428a6b1844b57 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc @@ -0,0 +1,223 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/helper.h" + +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +class PrelnEmbEltwiseLayerNormOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { +#if IS_TRT_VERSION_GE(7000) + VLOG(4) << "convert fluid PrelnEmbEltwiseLayerNorm op to tensorrt layer"; + + if (!(engine_->use_oss() && engine_->with_interleaved())) { + PADDLE_THROW(platform::errors::Fatal( + "PrelnErnie: If you want to use oss, must be with interleaved")); + } + framework::OpDesc op_desc(op, nullptr); + bool enable_int8 = op_desc.HasAttr("enable_int8"); + if (!enable_int8) { + PADDLE_THROW( + platform::errors::Fatal("use with_interleaved must be int8.")); + } + auto word_id_name = op_desc.Input("WordId").front(); + auto pos_id_name = op_desc.Input("PosId").front(); + engine_->Set("ernie_pos_name", new std::string(pos_id_name)); + + auto sent_id_name = op_desc.Input("SentId").front(); + auto word_emb_name = op_desc.Input("WordEmbedding").front(); + auto pos_emb_name = op_desc.Input("PosEmbedding").front(); + auto sent_emb_name = op_desc.Input("SentEmbedding").front(); + + std::vector id_names; + std::vector emb_names; + + id_names = + std::vector{word_id_name, pos_id_name, sent_id_name}; + emb_names = + std::vector{word_emb_name, pos_emb_name, sent_emb_name}; + + int input_num = id_names.size(); + + // Declare inputs + std::vector input_ids; + for (int i = 0; i < input_num; i++) { + input_ids.push_back(engine_->GetITensor(id_names[i])); + } + + // input_embs[0]: word_embedding + // input_embs[1]: pos_embedding + // input_embs[2]: sent_embedding + std::vector input_embs; + std::vector emb_sizes; + + // get the presistable var's data + auto get_persistable_data = [&](const std::string& var_name, + framework::DDim* dims) -> float* { + auto* temp_var = scope.FindVar(var_name); + auto* temp_tensor = temp_var->GetMutable(); + (*dims) = temp_tensor->dims(); + + auto* temp_data = engine_->GetWeightCPUData(var_name, temp_tensor, false); + return temp_data; + }; + + for (int i = 0; i < input_num; i++) { + framework::DDim emb_dims; + float* emb_data = get_persistable_data(emb_names[i], &emb_dims); + int64_t emb_size = framework::product(emb_dims); + input_embs.push_back(emb_data); + emb_sizes.push_back(emb_size); + PADDLE_ENFORCE_EQ( + emb_dims.size(), 2, + platform::errors::InvalidArgument( + "The fused PrelnEmbEltwiseLayerNorm's emb should be 2 dims.")); + } + + framework::DDim bias_dims, scale_dims; + + auto* bias = + get_persistable_data(op_desc.Input("Bias").front(), &bias_dims); + auto* scale = + get_persistable_data(op_desc.Input("Scale").front(), &scale_dims); + int64_t bias_size = framework::product(bias_dims); + int64_t scale_size = framework::product(scale_dims); + int output_int8 = 1; + + PADDLE_ENFORCE_EQ( + input_num, 3, + platform::errors::InvalidArgument( + "When using oss and var-len, embedding_eltwise_layernorm op" + "should have 3 inputs only, but got %d.", + input_num)); + const std::vector fields{ + {"bert_embeddings_layernorm_beta", bias, + nvinfer1::PluginFieldType::kFLOAT32, static_cast(bias_size)}, + {"bert_embeddings_layernorm_gamma", scale, + nvinfer1::PluginFieldType::kFLOAT32, static_cast(scale_size)}, + 
{"bert_embeddings_word_embeddings", input_embs[0], + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(emb_sizes[0])}, + {"bert_embeddings_token_type_embeddings", input_embs[2], + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(emb_sizes[2])}, + {"bert_embeddings_position_embeddings", input_embs[1], + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(emb_sizes[1])}, + {"output_int8", &output_int8, nvinfer1::PluginFieldType::kINT32, 1}, + }; + + nvinfer1::PluginFieldCollection* plugin_ptr = + static_cast( + malloc(sizeof(*plugin_ptr) + + fields.size() * sizeof(nvinfer1::PluginField))); + plugin_ptr->nbFields = static_cast(fields.size()); + plugin_ptr->fields = fields.data(); + + std::vector plugin_inputs; + plugin_inputs.emplace_back( + engine_->GetITensor(word_id_name)); // word_embedding, + // eval_placeholder_0 + plugin_inputs.emplace_back( + engine_->GetITensor(sent_id_name)); // sent_embedding, + // eval_placeholder_1 + plugin_inputs.emplace_back( + engine_->GetITensor(pos_id_name)); // cu_seqlens, + // eval_placeholder_2 + auto max_seqlen_tensor = + engine_->GetITensor(engine_->network()->getInput(3)->getName()); + auto* shuffle_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *max_seqlen_tensor); + nvinfer1::Dims shape_dim; + shape_dim.nbDims = 1; + shape_dim.d[0] = -1; + shuffle_layer->setReshapeDimensions(shape_dim); + shuffle_layer->setName( + ("PrelnEmbeltwise_Shuffle_reshape (Output: max_seqlen " + + op_desc.Output("Out")[0] + ")") + .c_str()); + engine_->SetTensorDynamicRange(shuffle_layer->getOutput(0), 1.0f); + plugin_inputs.emplace_back( + shuffle_layer->getOutput(0)); // max_seqlen, eval_placeholder_3 + + auto creator = GetPluginRegistry()->getPluginCreator( + "CustomEmbLayerNormPluginDynamic", "3"); + + auto plugin_obj = + creator->createPlugin("CustomEmbLayerNormPluginDynamic", plugin_ptr); + auto plugin_layer = engine_->network()->addPluginV2( + plugin_inputs.data(), plugin_inputs.size(), *plugin_obj); + plugin_layer->setName(("CustomPrelnEmbLayerNormPluginDynamic_V3(Output: " + + op_desc.Output("Out")[0] + ")") + .c_str()); + free(plugin_ptr); + float out_0_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("out_0_threshold")); + float out_1_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("out_1_threshold")); + engine_->SetTensorDynamicRange(plugin_layer->getOutput(0), out_0_scale); + engine_->SetTensorDynamicRange(plugin_layer->getOutput(1), out_1_scale); + + auto* shuffler_embed_out0 = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(plugin_layer->getOutput(0))); + nvinfer1::Permutation transpose_0{2, 1, 0, 3}; + shuffler_embed_out0->setSecondTranspose(transpose_0); + shuffler_embed_out0->getOutput(0)->setName( + op_desc.Output("Out_0")[0].c_str()); + engine_->SetITensor(op_desc.Output("Out_0")[0], + shuffler_embed_out0->getOutput(0)); + shuffler_embed_out0->setName( + ("shuffler_after_CustomPrelnEmbLayerNormPluginDynamic_V3(Output_0: " + + op_desc.Output("Out_0")[0] + ")") + .c_str()); + + auto* shuffler_embed_out1 = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(plugin_layer->getOutput(1))); + nvinfer1::Permutation transpose_1{2, 1, 0, 3}; + shuffler_embed_out1->setSecondTranspose(transpose_1); + shuffler_embed_out1->getOutput(0)->setName( + op_desc.Output("Out_1")[0].c_str()); + + engine_->SetITensor(op_desc.Output("Out_1")[0], + shuffler_embed_out1->getOutput(0)); + shuffler_embed_out1->setName( + ("shuffler_after_CustomPrelnEmbLayerNormPluginDynamic_V3(Output_1: " + + op_desc.Output("Out_1")[0] + ")") + .c_str()); + +#else + PADDLE_THROW(platform::errors::Fatal( + 
"PreInErnie want to use oss, must be with interleaved, " + "your TRT version is no less than 7.0")); +#endif + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(fused_preln_embedding_eltwise_layernorm, + PrelnEmbEltwiseLayerNormOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc new file mode 100644 index 0000000000000000000000000000000000000000..aa0d6fbe81376ed61992dbc6c15c69145aa98a4d --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc @@ -0,0 +1,110 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class PrelnSkipLayerNormOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { +#if IS_TRT_VERSION_GE(7000) + VLOG(4) << "convert fused preln_skip_layernorm op to tensorrt layer"; + if (!(engine_->use_oss() && engine_->with_interleaved())) { + PADDLE_THROW(platform::errors::Fatal( + "PrelnErnie: If you want to use oss, must be with interleaved")); + } + framework::OpDesc op_desc(op, nullptr); + bool enable_int8 = op_desc.HasAttr("enable_int8"); + if (!enable_int8) { + PADDLE_THROW( + platform::errors::Fatal("use with_interleaved must be int8.")); + } + // Declare inputs + auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); + auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]); + std::vector inputs; + inputs.push_back(input1); + inputs.push_back(input2); + + auto get_persistable_data = [&](const std::string& arg_name, + framework::DDim* dims) -> float* { + std::string var_name = op_desc.Input(arg_name).front(); + auto* temp_var = scope.FindVar(var_name); + auto* temp_tensor = temp_var->GetMutable(); + (*dims) = temp_tensor->dims(); + + auto* temp_data = engine_->GetWeightCPUData(var_name, temp_tensor, false); + return temp_data; + }; + + framework::DDim bias_dims, scale_dims; + auto* bias = get_persistable_data("Bias", &bias_dims); + auto* scale = get_persistable_data("Scale", &scale_dims); + int bias_size = framework::product(bias_dims); + int scale_size = framework::product(scale_dims); + + nvinfer1::ILayer* layer = nullptr; + + VLOG(4) << "fused preln_skip_layernorm op: use_oss and with_interleaved"; + + auto creator = GetPluginRegistry()->getPluginCreator( + "CustomSkipLayerNormPluginDynamic", "4"); + PADDLE_ENFORCE_NE( + creator, nullptr, + platform::errors::InvalidArgument( + "fail to get creator of CustomPrelnSkipLayerNormPluginDynamic")); + const std::vector fields{ + {"beta", bias, nvinfer1::PluginFieldType::kFLOAT32, bias_size}, + { "gamma", + scale, + nvinfer1::PluginFieldType::kFLOAT32, + scale_size }}; + nvinfer1::PluginFieldCollection* pluginPtr = + static_cast( + malloc(sizeof(*pluginPtr) 
+ + fields.size() * sizeof(nvinfer1::PluginField))); + pluginPtr->nbFields = static_cast(fields.size()); + pluginPtr->fields = fields.data(); + + auto pluginObj = + creator->createPlugin("CustomSkipLayerNormPluginDynamic", pluginPtr); + auto plugin_layer = engine_->network()->addPluginV2( + inputs.data(), inputs.size(), *pluginObj); + + PADDLE_ENFORCE_NE( + plugin_layer, nullptr, + platform::errors::InvalidArgument( + "fail to add CustomPrelnSkipLayerNormPluginDynamic layer")); + layer = plugin_layer; + + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "preln_skip_layernorm", {output_name}, + test_mode); +#else + PADDLE_THROW(platform::errors::Fatal( + "PrelnErnie: If you want to use oss, must be with interleaved, " + "and your TRT version must be no less than 7.0")); +#endif + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(preln_skip_layernorm, PrelnSkipLayerNormOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc index 17adf957f64a76a010da6160479be2125d9deac9..d14317712b579b8f04889c3a18e4231d96513225 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc @@ -103,5 +103,5 @@ TEST(elementwise_op, plugin) { } // namespace inference } // namespace paddle -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); USE_OP(elementwise_mul); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 767672007dfef67ecc4424fa6c962832599b0182..799c6c55bb121778cfe3b1a39f2dc1af315236dd 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -30,24 +30,6 @@ namespace tensorrt { // Just tell by the op_types. struct SimpleOpTypeSetTeller : public Teller { SimpleOpTypeSetTeller() { -#if IS_TRT_VERSION_GE(5130) - teller_set.insert("relu6"); - teller_set.insert("hard_sigmoid"); - teller_set.insert("clip"); - int8_teller_set.insert("relu6"); - int8_teller_set.insert("hard_sigmoid"); - int8_teller_set.insert("clip"); -#endif -#if IS_TRT_VERSION_GE(6000) - teller_set.insert("fused_embedding_eltwise_layernorm"); - teller_set.insert("multihead_matmul"); - teller_set.insert("skip_layernorm"); - teller_set.insert("slice"); - int8_teller_set.insert("fused_embedding_eltwise_layernorm"); - int8_teller_set.insert("multihead_matmul"); - int8_teller_set.insert("skip_layernorm"); - int8_teller_set.insert("slice"); -#endif // TODO(baoachun) The group_norm trt plugin will check input's dim // not -1 failed when dynamic shape mode. // #if IS_TRT_VERSION_GE(7130) @@ -76,104 +58,124 @@ struct SimpleOpTypeSetTeller : public Teller { private: // use this set for no calib int8.
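// [Editor's note, not part of the original patch] With the version-gated insert() calls above
// removed, relu6 / hard_sigmoid / clip / fused_embedding_eltwise_layernorm / multihead_matmul /
// skip_layernorm / slice (plus the new fused_preln_embedding_eltwise_layernorm and
// preln_skip_layernorm) are now listed unconditionally in both sets below; any remaining
// TensorRT-version restrictions presumably have to be enforced per op inside OpTeller::Tell or
// in the individual converters.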
- std::unordered_set int8_teller_set{"mul", - "matmul", - "conv2d", - "conv2d_fusion", - "pool2d", - "relu", - "softmax", - "sigmoid", - "hard_swish", - "depthwise_conv2d", - "batch_norm", - "concat", - "tanh", - "pad", - "elementwise_add", - "elementwise_mul", - "dropout", - "prelu", - "conv2d_transpose", - "depthwise_conv2d_transpose", - "leaky_relu", - "fc", - "shuffle_channel", - "swish", - "split", - "instance_norm", - "gelu", - "layer_norm", - "scale", - "stack", - "transpose2", - "transpose", - "flatten2", - "flatten", - "gather", - "gather_nd", - "yolo_box", - "roi_align", - "affine_channel", - "nearest_interp", - "anchor_generator", - "reduce_sum", - "reduce_mean", - "conv3d", - "conv3d_transpose", - "mish", - "nearest_interp_v2", - "pool3d", - "deformable_conv"}; - std::unordered_set teller_set{"mul", - "matmul", - "conv2d", - "conv2d_fusion", - "pool2d", - "relu", - "softmax", - "sigmoid", - "hard_swish", - "depthwise_conv2d", - "batch_norm", - "concat", - "tanh", - "pad", - "elementwise_add", - "elementwise_mul", - "dropout", - "prelu", - "conv2d_transpose", - "depthwise_conv2d_transpose", - "leaky_relu", - "fc", - "shuffle_channel", - "swish", - "split", - "instance_norm", - "gelu", - "layer_norm", - "scale", - "stack", - "transpose2", - "transpose", - "flatten2", - "flatten", - "gather", - "gather_nd", - "yolo_box", - "roi_align", - "affine_channel", - "nearest_interp", - "anchor_generator", - "reduce_sum", - "reduce_mean", - "conv3d", - "conv3d_transpose", - "mish", - "nearest_interp_v2", - "pool3d", - "deformable_conv"}; + std::unordered_set int8_teller_set{ + "mul", + "matmul", + "conv2d", + "conv2d_fusion", + "pool2d", + "relu", + "softmax", + "sigmoid", + "hard_swish", + "depthwise_conv2d", + "batch_norm", + "concat", + "tanh", + "pad", + "elementwise_add", + "elementwise_mul", + "dropout", + "prelu", + "conv2d_transpose", + "depthwise_conv2d_transpose", + "leaky_relu", + "fc", + "shuffle_channel", + "swish", + "split", + "instance_norm", + "gelu", + "layer_norm", + "scale", + "stack", + "transpose2", + "transpose", + "flatten2", + "flatten", + "gather", + "gather_nd", + "yolo_box", + "roi_align", + "affine_channel", + "nearest_interp", + "anchor_generator", + "reduce_sum", + "reduce_mean", + "conv3d", + "conv3d_transpose", + "mish", + "nearest_interp_v2", + "pool3d", + "deformable_conv", + "relu6", + "hard_sigmoid", + "clip", + "fused_embedding_eltwise_layernorm", + "multihead_matmul", + "skip_layernorm", + "slice", + "fused_preln_embedding_eltwise_layernorm", + "preln_skip_layernorm"}; + std::unordered_set teller_set{ + "mul", + "matmul", + "conv2d", + "conv2d_fusion", + "pool2d", + "relu", + "softmax", + "sigmoid", + "hard_swish", + "depthwise_conv2d", + "batch_norm", + "concat", + "tanh", + "pad", + "elementwise_add", + "elementwise_mul", + "dropout", + "prelu", + "conv2d_transpose", + "depthwise_conv2d_transpose", + "leaky_relu", + "fc", + "shuffle_channel", + "swish", + "split", + "instance_norm", + "gelu", + "layer_norm", + "scale", + "stack", + "transpose2", + "transpose", + "flatten2", + "flatten", + "gather", + "gather_nd", + "yolo_box", + "roi_align", + "affine_channel", + "nearest_interp", + "anchor_generator", + "reduce_sum", + "reduce_mean", + "conv3d", + "conv3d_transpose", + "mish", + "nearest_interp_v2", + "pool3d", + "deformable_conv", + "relu6", + "hard_sigmoid", + "clip", + "fused_embedding_eltwise_layernorm", + "multihead_matmul", + "skip_layernorm", + "slice", + "fused_preln_embedding_eltwise_layernorm", + "preln_skip_layernorm"}; }; bool 
OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, @@ -1007,6 +1009,24 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } } + if (op_type == "fused_preln_embedding_eltwise_layernorm") { + if (!with_dynamic_shape) { + VLOG(3) + << "fused_preln_embedding_eltwise_layernorm should run in dynamic " + "shape mode."; + return false; + } + if (desc.Input("Ids").size() != desc.Input("Embs").size()) { + VLOG(3) << "The id and emb size of fused PrelnEmbEltwiseLayerNormOp " + "should be the same."; + return false; + } + if (!desc.HasAttr("enable_int8")) { + VLOG(3) << "PrelnEmbEltwiseLayerNormOp must use int8 mode."; + return false; + } + } + if (op_type == "gelu") { if (desc.Input("X").size() != 1) { VLOG(3) << "gelu op has only 1 input, but got " @@ -1019,9 +1039,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, return false; } +#if IS_TRT_VERSION_LT(7000) if (desc.HasAttr("approximate")) { + VLOG(3) << "approximate gelu op needs TensorRT 7.0 or later"; if (BOOST_GET_CONST(bool, desc.GetAttr("approximate"))) return false; } +#endif auto* block = desc.Block(); if (block == nullptr) { @@ -1030,6 +1053,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, "the pass."; return false; } + auto x_var_name = desc.Input("X")[0]; auto* x_var_desc = block->FindVar(x_var_name); const auto x_shape = x_var_desc->GetShape(); @@ -1312,6 +1336,17 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } } + if (op_type == "preln_skip_layernorm") { + if (!with_dynamic_shape) { + VLOG(3) << "the preln_skip_layernorm does not support static shape yet"; + return false; + } + if (!desc.HasAttr("enable_int8")) { + VLOG(3) << "PrelnSkipLayerNormOp must use int8 mode."; + return false; + } + } + if (op_type == "multihead_matmul") { if (!with_dynamic_shape) { VLOG(3) << "the multihead_matmul does not support static shape yet"; diff --git a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu index 21c8812f3789e37a68ba75be68c296a8bc214511..6d711c26adc6ff8e49375d15f32322303f3ae6ef 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu @@ -35,6 +35,36 @@ nvinfer1::Dims PoolPlugin::getOutputDimensions(int index, return output_dims; } +size_t PoolPlugin::getSerializationSize() const TRT_NOEXCEPT { + return getBaseSerializationSize() + SerializedSize(ceil_mode_) + + SerializedSize(pool_type_) + SerializedSize(adaptive_) + + SerializedSize(exclusive_) + SerializedSize(ksize_) + + SerializedSize(strides_) + SerializedSize(paddings_) + + SerializedSize(real_paddings_) + SerializedSize(input_shape_) + + SerializedSize(output_shape_); +} + +// TRT will call this func when we need to serialize the configuration of +// tensorrt.
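// [Editor's sketch, not part of the original patch] The three serialization members have to stay
// in lock-step (assuming SerializedSize / SerializeValue / DeserializeValue advance through the
// buffer in the usual way): getSerializationSize() above must count every field, serialize()
// below must write them, and the deserializing constructor PoolPlugin(serialData, serialLength)
// further down in this patch must read them back in exactly the same order:
//   ceil_mode_, pool_type_, adaptive_, exclusive_, ksize_, strides_, paddings_,
//   real_paddings_, input_shape_, output_shape_.
// Missing a newly added field such as exclusive_ or real_paddings_ in any one of the three
// places would corrupt engine deserialization.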
+void PoolPlugin::serialize(void *buffer) const TRT_NOEXCEPT { + serializeBase(buffer); + SerializeValue(&buffer, ceil_mode_); + SerializeValue(&buffer, pool_type_); + SerializeValue(&buffer, adaptive_); + SerializeValue(&buffer, exclusive_); + SerializeValue(&buffer, ksize_); + SerializeValue(&buffer, strides_); + SerializeValue(&buffer, paddings_); + SerializeValue(&buffer, real_paddings_); + SerializeValue(&buffer, input_shape_); + SerializeValue(&buffer, output_shape_); +} + +PoolPlugin *PoolPlugin::clone() const TRT_NOEXCEPT { + return new PoolPlugin(ceil_mode_, pool_type_, adaptive_, exclusive_, ksize_, + strides_, paddings_, input_shape_, real_paddings_); +} + int PoolPlugin::enqueue(int batchSize, const void *const *inputs, #if IS_TRT_VERSION_LT(8000) void **outputs, void *workspace, @@ -59,14 +89,15 @@ int PoolPlugin::enqueue(int batchSize, const void *const *inputs, paddle::operators::math::MaxPool, float> pool2d_forward; pool2d_forward(idata, input_shape, output_shape, ksize_, strides_, - paddings_, true, adaptive_, odatas[0], stream, pool_process); + paddings_, true, false, odatas[0], stream, pool_process); } else if (pool_type_ == PoolType::avg) { paddle::operators::math::AvgPool pool_process; paddle::operators::math::Pool2dDirectCUDAFunctor< paddle::operators::math::AvgPool, float> pool2d_forward; pool2d_forward(idata, input_shape, output_shape, ksize_, strides_, - paddings_, true, adaptive_, odatas[0], stream, pool_process); + paddings_, exclusive_, adaptive_, odatas[0], stream, + pool_process); } return cudaGetLastError() != cudaSuccess; @@ -82,6 +113,7 @@ PoolPluginDynamic::PoolPluginDynamic(void const *serialData, DeserializeValue(&serialData, &serialLength, &pool_type); pool_type_ = std::string(pool_type); DeserializeValue(&serialData, &serialLength, &adaptive_); + DeserializeValue(&serialData, &serialLength, &exclusive_); DeserializeValue(&serialData, &serialLength, &ksize_); DeserializeValue(&serialData, &serialLength, &strides_); DeserializeValue(&serialData, &serialLength, &paddings_); @@ -90,21 +122,27 @@ PoolPluginDynamic::PoolPluginDynamic(void const *serialData, size_t PoolPluginDynamic::getSerializationSize() const TRT_NOEXCEPT { return SerializedSize(ceil_mode_) + SerializedSize(pool_type_.c_str()) + - SerializedSize(adaptive_) + SerializedSize(ksize_) + - SerializedSize(strides_) + SerializedSize(paddings_) + - SerializedSize(is_global_); + SerializedSize(adaptive_) + SerializedSize(exclusive_) + + SerializedSize(ksize_) + SerializedSize(strides_) + + SerializedSize(paddings_) + SerializedSize(is_global_); } void PoolPluginDynamic::serialize(void *buffer) const TRT_NOEXCEPT { SerializeValue(&buffer, ceil_mode_); SerializeValue(&buffer, pool_type_.c_str()); SerializeValue(&buffer, adaptive_); + SerializeValue(&buffer, exclusive_); SerializeValue(&buffer, ksize_); SerializeValue(&buffer, strides_); SerializeValue(&buffer, paddings_); SerializeValue(&buffer, is_global_); } +nvinfer1::IPluginV2DynamicExt *PoolPluginDynamic::clone() const TRT_NOEXCEPT { + return new PoolPluginDynamic(ceil_mode_, pool_type_, adaptive_, exclusive_, + ksize_, strides_, paddings_, is_global_); +} + nvinfer1::DimsExprs PoolPluginDynamic::getOutputDimensions( int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs, nvinfer1::IExprBuilder &expr_builder) TRT_NOEXCEPT { @@ -117,11 +155,14 @@ nvinfer1::DimsExprs PoolPluginDynamic::getOutputDimensions( platform::errors::InvalidArgument("The channel dimension should be " "static, but we found it's dynamic.")); nvinfer1::DimsExprs 
output(inputs[0]); - if (is_global_) { + if (is_global_ && !adaptive_) { output.d[2] = expr_builder.constant(1); output.d[3] = expr_builder.constant(1); return output; } + if (is_global_ && adaptive_) { + return inputs[0]; + } if (adaptive_) { output.d[2] = expr_builder.constant(ksize_[0]); output.d[3] = expr_builder.constant(ksize_[1]); @@ -245,6 +286,10 @@ int PoolPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, output_shape[2] = data_dim[0]; output_shape[3] = data_dim[1]; } + if (adaptive_) { + output_shape[2] = h; + output_shape[3] = w; + } if (pool_type_ == "max") { paddle::operators::math::MaxPool pool_process; @@ -252,14 +297,14 @@ int PoolPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, paddle::operators::math::MaxPool, float> pool2d_forward; pool2d_forward(input, input_shape, output_shape, ksize, strides_, paddings, - true, adaptive_, output, stream, pool_process); + true, false, output, stream, pool_process); } else if (pool_type_ == "avg") { paddle::operators::math::AvgPool pool_process; paddle::operators::math::Pool2dDirectCUDAFunctor< paddle::operators::math::AvgPool, float> pool2d_forward; pool2d_forward(input, input_shape, output_shape, ksize, strides_, paddings, - true, adaptive_, output, stream, pool_process); + exclusive_, adaptive_, output, stream, pool_process); } return cudaGetLastError() != cudaSuccess; diff --git a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h index 6ced066a35952f5046ca9f3dd5fb83d860086001..d1bf2cd02e84f3cff3f61702160fcfa7e53f023f 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h @@ -29,26 +29,32 @@ static std::vector CalcOutputSize(const std::vector& input_shape, const bool& adaptive, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings) { + const std::vector& real_paddings) { std::vector output_shape = input_shape; if (adaptive) { output_shape[0] = ksize[0]; output_shape[1] = ksize[1]; } else { - int output_h, output_w; - if (!ceil_mode) { - output_h = (input_shape[0] - ksize[0] + 2 * paddings[0]) / strides[0] + 1; - output_w = (input_shape[1] - ksize[1] + 2 * paddings[1]) / strides[1] + 1; - } else { - output_h = - (input_shape[0] - ksize[0] + 2 * paddings[0] + strides[0] - 1) / - strides[0] + - 1; - output_w = - (input_shape[1] - ksize[1] + 2 * paddings[1] + strides[1] - 1) / - strides[1] + - 1; + int output_h = 0, output_w = 0; + if (ceil_mode) { + output_h = (input_shape[0] - ksize[0] + real_paddings[0] + + real_paddings[1] + strides[0] - 1) / + strides[0] + + 1; + output_w = (input_shape[1] - ksize[1] + real_paddings[2] + + real_paddings[3] + strides[1] - 1) / + strides[1] + + 1; } + // TRT will use native layer when ceil_model=false + /* + else{ + output_h = (input_shape[0] - ksize[0] + real_paddings[0] + + real_paddings[1]) / strides[0] + 1; + output_w = (input_shape[1] - ksize[1] + real_paddings[2] + + real_paddings[3]) / strides[1] + 1; + } + */ output_shape[0] = output_h; output_shape[1] = output_w; } @@ -57,47 +63,32 @@ static std::vector CalcOutputSize(const std::vector& input_shape, class PoolPlugin : public PluginTensorRT { public: - size_t getSerializationSize() const TRT_NOEXCEPT override { - return getBaseSerializationSize() + SerializedSize(ceil_mode_) + - SerializedSize(pool_type_) + SerializedSize(adaptive_) + - SerializedSize(ksize_) + SerializedSize(strides_) + - SerializedSize(paddings_) + 
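// [Editor's sketch, not part of the original patch] Worked example for the ceil_mode branch of
// CalcOutputSize above: with input_h = 7, ksize = 3, real paddings 1 + 1 and stride = 2,
// output_h = (7 - 3 + 1 + 1 + 2 - 1) / 2 + 1 = 7 / 2 + 1 = 4 with integer division, which matches
// ceil((7 - 3 + 2) / 2) + 1. When ceil_mode is false the plugin path is not taken and TensorRT's
// native pooling layer computes the output size, which is why the else branch is only kept as a
// comment.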
SerializedSize(input_shape_) + - SerializedSize(output_shape_); - } + size_t getSerializationSize() const TRT_NOEXCEPT override; - // TRT will call this func when we need to serialize the configuration of - // tensorrt. - void serialize(void* buffer) const TRT_NOEXCEPT override { - serializeBase(buffer); - SerializeValue(&buffer, ceil_mode_); - SerializeValue(&buffer, pool_type_); - SerializeValue(&buffer, adaptive_); - SerializeValue(&buffer, ksize_); - SerializeValue(&buffer, strides_); - SerializeValue(&buffer, paddings_); - SerializeValue(&buffer, input_shape_); - SerializeValue(&buffer, output_shape_); - } + void serialize(void* buffer) const TRT_NOEXCEPT override; enum class PoolType { max = 0, avg, }; PoolPlugin() {} - PoolPlugin(bool ceil_mode, PoolType pool_type, bool adaptive, + PoolPlugin(bool ceil_mode, PoolType pool_type, bool adaptive, bool exclusive, std::vector ksize, std::vector strides, - std::vector paddings, std::vector input_shape) + std::vector paddings, std::vector input_shape, + std::vector real_paddings) : ceil_mode_(ceil_mode), pool_type_(pool_type), adaptive_(adaptive), + exclusive_(exclusive), ksize_(ksize), strides_(strides), paddings_(paddings), + real_paddings_(real_paddings), input_shape_(input_shape) { output_shape_ = input_shape_; std::vector output_shape = CalcOutputSize({input_shape_[1], input_shape_[2]}, ceil_mode_, - adaptive_, ksize_, strides_, paddings_); + adaptive_, ksize_, strides_, real_paddings_); output_shape_[1] = output_shape[0]; output_shape_[2] = output_shape[1]; } @@ -109,17 +100,16 @@ class PoolPlugin : public PluginTensorRT { DeserializeValue(&serialData, &serialLength, &ceil_mode_); DeserializeValue(&serialData, &serialLength, &pool_type_); DeserializeValue(&serialData, &serialLength, &adaptive_); + DeserializeValue(&serialData, &serialLength, &exclusive_); DeserializeValue(&serialData, &serialLength, &ksize_); DeserializeValue(&serialData, &serialLength, &strides_); DeserializeValue(&serialData, &serialLength, &paddings_); + DeserializeValue(&serialData, &serialLength, &real_paddings_); DeserializeValue(&serialData, &serialLength, &input_shape_); DeserializeValue(&serialData, &serialLength, &output_shape_); } - PoolPlugin* clone() const TRT_NOEXCEPT override { - return new PoolPlugin(ceil_mode_, pool_type_, adaptive_, ksize_, strides_, - paddings_, input_shape_); - } + PoolPlugin* clone() const TRT_NOEXCEPT override; const char* getPluginType() const TRT_NOEXCEPT override { return "pool_plugin"; @@ -139,9 +129,11 @@ class PoolPlugin : public PluginTensorRT { bool ceil_mode_; PoolType pool_type_; bool adaptive_; + bool exclusive_; std::vector ksize_; std::vector strides_; std::vector paddings_; + std::vector real_paddings_; std::vector input_shape_; std::vector output_shape_; }; @@ -167,12 +159,14 @@ class PoolPluginDynamic : public DynamicPluginTensorRT { public: PoolPluginDynamic() {} PoolPluginDynamic(const bool& ceil_mode, const std::string& pool_type, - const bool& adaptive, const std::vector& ksize, + const bool& adaptive, bool exclusive, + const std::vector& ksize, const std::vector& strides, const std::vector& paddings, const bool& is_global) : ceil_mode_(ceil_mode), pool_type_(pool_type), adaptive_(adaptive), + exclusive_(exclusive), ksize_(ksize), strides_(strides), paddings_(paddings), @@ -180,10 +174,7 @@ class PoolPluginDynamic : public DynamicPluginTensorRT { PoolPluginDynamic(void const* serialData, size_t serialLength); ~PoolPluginDynamic() {} - nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override { - 
return new PoolPluginDynamic(ceil_mode_, pool_type_, adaptive_, ksize_, - strides_, paddings_, is_global_); - } + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; const char* getPluginType() const TRT_NOEXCEPT override { return "pool_plugin_dynamic"; @@ -229,6 +220,7 @@ class PoolPluginDynamic : public DynamicPluginTensorRT { bool ceil_mode_; std::string pool_type_; bool adaptive_; + bool exclusive_; std::vector ksize_; std::vector strides_; std::vector paddings_; diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index b899ddbcd5a4e30e065eb1969c41fde6046a8ea7..6cd7d87332323f4bafd49b8b16254f9610405658 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -58,6 +58,11 @@ else () set(AllocatorFacadeDeps) endif() +if (WITH_CUSTOM_DEVICE) + cc_library(custom_allocator SRCS custom_allocator.cc DEPS allocator device_manager) + set(AllocatorFacadeDeps ${AllocatorFacadeDeps} custom_allocator) +endif() + if (WITH_GPU) nv_test(best_fit_allocator_test SRCS best_fit_allocator_test.cc diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 35131446d8647e0581d2d997451017293b7ca8dc..fc34a64d62636cca3d274fb2294a5d9139ae5d77 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -62,6 +62,11 @@ #include "paddle/fluid/platform/device/mlu/mlu_info.h" #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/fluid/memory/allocation/custom_allocator.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#endif + PADDLE_DEFINE_EXPORTED_int64( gpu_allocator_retry_time, 10000, "The retry time (milliseconds) when allocator fails " @@ -186,6 +191,17 @@ class AllocatorFacadePrivate { for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) { InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id)); } +#endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes(); + for (const auto& dev_type : device_types) { + for (size_t dev_id = 0; + dev_id < platform::DeviceManager::GetDeviceCount(dev_type); + ++dev_id) { + InitNaiveBestFitCustomDeviceAllocator( + platform::CustomPlace(dev_type, dev_id)); + } + } #endif break; } @@ -222,6 +238,17 @@ class AllocatorFacadePrivate { for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) { InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id)); } +#endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes(); + for (const auto& dev_type : device_types) { + for (size_t dev_id = 0; + dev_id < platform::DeviceManager::GetDeviceCount(dev_type); + ++dev_id) { + InitAutoGrowthCustomDeviceAllocator( + platform::CustomPlace(dev_type, dev_id), allow_free_idle_chunk); + } + } #endif break; } @@ -700,6 +727,21 @@ class AllocatorFacadePrivate { } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + void InitNaiveBestFitCustomDeviceAllocator(platform::CustomPlace p) { + allocators_[p] = std::make_shared(p); + } + + void InitAutoGrowthCustomDeviceAllocator(platform::CustomPlace p, + bool allow_free_idle_chunk) { + auto custom_allocator = + std::make_shared(p); + allocators_[p] = std::make_shared( + custom_allocator, platform::DeviceManager::GetMinChunkSize(p), + allow_free_idle_chunk); + } +#endif + void InitSystemAllocators() { if (!system_allocators_.empty()) return; 
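// [Editor's sketch, not part of the original patch] Rough picture of how the custom-device
// allocators registered above get used; the device type string and the memory::Alloc entry point
// below are assumptions for illustration only:
//
//   platform::CustomPlace place("my_device", 0);       // hypothetical plug-in device type
//   auto holder = paddle::memory::Alloc(place, 1024);  // dispatched by AllocatorFacade to the
//                                                      // naive-best-fit or auto-growth
//                                                      // CustomAllocator chosen above
//
// Callers keep going through the ordinary memory facade; only the Place type changes.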
system_allocators_[platform::CPUPlace()] = std::make_shared(); @@ -770,6 +812,16 @@ class AllocatorFacadePrivate { places.emplace_back(platform::MLUPlace(dev_id)); } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes(); + for (const auto& dev_type : device_types) { + for (size_t dev_id = 0; + dev_id < platform::DeviceManager::GetDeviceCount(dev_type); + dev_id++) { + places.emplace_back(platform::CustomPlace(dev_type, dev_id)); + } + } +#endif for (auto& p : places) { zero_size_allocators_[p] = std::make_shared(p); @@ -1005,7 +1057,6 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size, "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); } #endif - platform::CUDAPlace p(place.GetDeviceId()); if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) { return m_->GetAllocator(p, stream, /* create_if_not_found = */ true) diff --git a/paddle/fluid/memory/allocation/custom_allocator.cc b/paddle/fluid/memory/allocation/custom_allocator.cc new file mode 100644 index 0000000000000000000000000000000000000000..eb035ea5e3ad409777114cca44cd945ed4bd9541 --- /dev/null +++ b/paddle/fluid/memory/allocation/custom_allocator.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/custom_allocator.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace memory { +namespace allocation { + +bool CustomAllocator::IsAllocThreadSafe() const { return true; } +void CustomAllocator::FreeImpl(pten::Allocation* allocation) { + PADDLE_ENFORCE_EQ( + allocation->place(), place_, + platform::errors::PermissionDenied("CustomDevice memory is " + "freed in incorrect device. " + "This may be a bug")); + + delete allocation; +} + +pten::Allocation* CustomAllocator::AllocateImpl(size_t size) { + std::call_once(once_flag_, + [this] { platform::DeviceManager::SetDevice(place_); }); + + void* ptr = + platform::DeviceManager::GetDeviceWithPlace(place_)->MemoryAllocate(size); + if (LIKELY(ptr)) { + return new Allocation(ptr, size, place_); + } + + size_t avail, total; + platform::DeviceManager::MemoryStats(place_, &total, &avail); + + auto dev_type = platform::PlaceHelper::GetDeviceType(place_); + auto dev_id = platform::PlaceHelper::GetDeviceId(place_); + + PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( + "\n\nOut of memory error on %s:%d. " + "Cannot allocate %s memory on %s:%d, " + "available memory is only %s.\n\n" + "Please check whether there is any other process using %s:%d.\n" + "1. If yes, please stop them, or start PaddlePaddle on another %s.\n" + "2. 
If no, please decrease the batch size of your model.\n\n", + dev_type, dev_id, string::HumanReadableSize(size), dev_type, dev_id, + string::HumanReadableSize(avail), dev_type, dev_id, dev_type)); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/custom_allocator.h b/paddle/fluid/memory/allocation/custom_allocator.h new file mode 100644 index 0000000000000000000000000000000000000000..708c105a850087f49becde702590920a0f9afc9d --- /dev/null +++ b/paddle/fluid/memory/allocation/custom_allocator.h @@ -0,0 +1,42 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include // NOLINT +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace memory { +namespace allocation { + +class CustomAllocator : public Allocator { + public: + explicit CustomAllocator(const platform::CustomPlace& place) + : place_(place) {} + + bool IsAllocThreadSafe() const override; + + protected: + void FreeImpl(pten::Allocation* allocation) override; + pten::Allocation* AllocateImpl(size_t size) override; + + private: + platform::Place place_; + std::once_flag once_flag_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 91358b688040aa9789e3268eb0e29dc6790c0e13..b63f872141c802f512332750d36a3116df2c40c9 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -20,6 +20,7 @@ #include "glog/logging.h" #include "paddle/fluid/memory/detail/buddy_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" @@ -30,7 +31,6 @@ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif -#include "paddle/fluid/platform/device/device_wrapper.h" PADDLE_DEFINE_EXPORTED_bool( init_allocated_mem, false, @@ -733,6 +733,136 @@ uint64_t Release(const platform::MLUPlace &place) { #endif } +// For CustomDevice +#ifdef PADDLE_WITH_CUSTOM_DEVICE +class BuddyAllocatorList { + private: + explicit BuddyAllocatorList(const std::string &device_type) + : device_type_(device_type) { + auto devices = platform::DeviceManager::GetDeviceList(device_type); + for (auto dev_id : devices) { + init_flags_[dev_id].reset(new std::once_flag()); + } + } + + static BuddyAllocatorList *CreateNewInstance(const std::string &device_type) { + return new BuddyAllocatorList(device_type); + } + + public: + static BuddyAllocatorList *Instance(const std::string &device_type) { + // DeviceType -> AllocatorList + static std::unordered_map 
pool; + if (pool.find(device_type) == pool.end()) { + pool[device_type] = CreateNewInstance(device_type); + } + return pool[device_type]; + } + + BuddyAllocator *Get(int dev_id) { + PADDLE_ENFORCE_NE(init_flags_.find(dev_id), init_flags_.end(), + platform::errors::OutOfRange( + "Cannot find %s %d, please check visible devices.", + device_type_, dev_id)); + + std::call_once(*init_flags_[dev_id], [this, dev_id] { + platform::DeviceManager::SetDevice(device_type_, dev_id); + platform::CustomPlace place(device_type_, dev_id); + + allocators_[dev_id].reset(new BuddyAllocator( + std::unique_ptr( + new detail::CustomAllocator(device_type_, dev_id)), + platform::DeviceManager::GetMinChunkSize(place), + platform::DeviceManager::GetMaxChunkSize(place), + platform::DeviceManager::GetExtraPaddingSize(place), device_type_)); + }); + + return allocators_[dev_id].get(); + } + + private: + std::string device_type_; + std::unordered_map> init_flags_; + std::unordered_map> allocators_; +}; + +BuddyAllocator *GetBuddyAllocator(const platform::Place &place) { + VLOG(10) << "GetBuddyAllocator place = " << place; + if (platform::is_custom_place(place)) { + return BuddyAllocatorList::Instance( + platform::PlaceHelper::GetDeviceType(place)) + ->Get(platform::PlaceHelper::GetDeviceId(place)); + } else { + PADDLE_THROW( + platform::errors::InvalidArgument("place must be CustomPlace")); + } +} +#endif + +template <> +void *Alloc(const platform::CustomPlace &place, + size_t size) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); + auto *buddy_allocator = GetBuddyAllocator(place); + auto *ptr = buddy_allocator->Alloc(size); + + if (ptr == nullptr) { + platform::DeviceGuard guard(place); + size_t avail, total; + platform::DeviceManager::MemoryStats(place, &total, &avail); + PADDLE_THROW(platform::errors::ResourceExhausted( + "Cannot allocate %s in %s:%d, available %s, total %s, used " + "%s.
", + string::HumanReadableSize(size), place.GetDeviceType(), place.device, + string::HumanReadableSize(avail), string::HumanReadableSize(total), + string::HumanReadableSize(total - avail))); + } else { + if (FLAGS_init_allocated_mem) { + platform::DeviceManager::GetDeviceWithPlace(place)->MemorySet(ptr, 0xEF, + size); + } + } + VLOG(10) << " pointer=" << ptr; + return ptr; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'CustomPlace' is not supported in CPU only device.")); +#endif +} + +template <> +void Free(const platform::CustomPlace &place, void *p, + size_t size) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); + GetBuddyAllocator(place)->Free(p); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'CustomPlace' is not supported in CPU only device.")); +#endif +} + +template <> +uint64_t Release(const platform::CustomPlace &place) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + return GetBuddyAllocator(place)->Release(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'CustomPlace' is not supported in CPU only device.")); +#endif +} + +template <> +size_t Used(const platform::CustomPlace &place) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + return GetBuddyAllocator(place)->Used(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'CustomPlace' is not supported in CPU only device.")); +#endif +} + struct AllocVisitor : public boost::static_visitor { inline explicit AllocVisitor(size_t size) : size_(size) {} diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index b02fb6642be3fd4ade7dc1b4ed7642be28cc7757..d7bbfba932cb4a5aab01bc3e2d1276dbe6450b29 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -25,9 +25,7 @@ limitations under the License. 
*/ DECLARE_uint64(reallocate_gpu_memory_in_mb); #endif -#ifdef PADDLE_WITH_MLU -#include "paddle/fluid/platform/device/mlu/mlu_info.h" -#endif +#include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { namespace memory { @@ -35,12 +33,37 @@ namespace detail { BuddyAllocator::BuddyAllocator( std::unique_ptr system_allocator, size_t min_chunk_size, - size_t max_chunk_size, size_t extra_padding_size) + size_t max_chunk_size, size_t extra_padding_size, + const std::string dev_type) : min_chunk_size_(min_chunk_size), max_chunk_size_(max_chunk_size), extra_padding_size_(extra_padding_size), cache_(system_allocator->UseGpu()), - system_allocator_(std::move(system_allocator)) {} + system_allocator_(std::move(system_allocator)) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (!dev_type.empty()) { + init_allocate_size_func_ = [dev_type]() { + return platform::DeviceManager::GetInitAllocSize( + platform::PlaceHelper::CreatePlace(dev_type)); + }; + re_allocate_size_func_ = [dev_type]() { + return platform::DeviceManager::GetReallocSize( + platform::PlaceHelper::CreatePlace(dev_type)); + }; + } else { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + init_allocate_size_func_ = &platform::GpuInitAllocSize; + re_allocate_size_func_ = &platform::GpuReallocSize; +#elif defined(PADDLE_WITH_ASCEND_CL) + init_allocate_size_func_ = &platform::NPUInitAllocSize; + re_allocate_size_func_ = &platform::NPUReallocSize; +#elif defined(PADDLE_WITH_MLU) + init_allocate_size_func_ = &platform::MLUInitAllocSize; + re_allocate_size_func_ = &platform::MLUReallocSize; +#endif + } +#endif +} BuddyAllocator::~BuddyAllocator() { VLOG(10) << "BuddyAllocator Disconstructor makes sure that all of these " @@ -224,6 +247,10 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( size_t allocate_bytes = max_chunk_size_; size_t index = 0; +#ifdef PADDLE_WITH_CUSTOM_DEVICE + allocate_bytes = DeviceAllocateSize(init_allocate_size_func_, + re_allocate_size_func_, request_bytes); +#else #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) allocate_bytes = DeviceAllocateSize(&platform::GpuInitAllocSize, &platform::GpuReallocSize, request_bytes); @@ -233,6 +260,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( #elif defined(PADDLE_WITH_MLU) allocate_bytes = DeviceAllocateSize(&platform::MLUInitAllocSize, &platform::MLUReallocSize, request_bytes); +#endif #endif // Allocate a new block diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h index 0d736f680503a6ce59e88142a9eec2ad4ebfdd26..5296192b8fd9b632be4638d47153e113fd2ae576 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.h +++ b/paddle/fluid/memory/detail/buddy_allocator.h @@ -39,7 +39,8 @@ class BuddyAllocator { public: BuddyAllocator(std::unique_ptr system_allocator, size_t min_chunk_size, size_t max_chunk_size, - size_t extra_padding_size = 0); + size_t extra_padding_size = 0, + const std::string dev_type = ""); ~BuddyAllocator(); @@ -123,6 +124,9 @@ class BuddyAllocator { /*! 
Allocate CPU/GPU memory from system */ std::unique_ptr system_allocator_; std::mutex mutex_; +#ifdef PADDLE_WITH_CUSTOM_DEVICE + std::function init_allocate_size_func_, re_allocate_size_func_; +#endif }; } // namespace detail diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 773122de6c3198b09c33241a0d6a09e9357f65a3..a61f98c4e1a22adcc3684a9e5af190a82e3b5110 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -38,6 +38,8 @@ limitations under the License. */ #include "paddle/fluid/platform/cuda_device_guard.h" #endif +#include "paddle/fluid/platform/device/device_wrapper.h" + DECLARE_bool(use_pinned_memory); DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_uint64(initial_gpu_memory_in_mb); @@ -430,6 +432,51 @@ void MLUAllocator::Free(void* p, size_t size, size_t index) { bool MLUAllocator::UseGpu() const { return true; } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE +void* CustomAllocator::Alloc(size_t* index, size_t size) { + if (size <= 0) return nullptr; + + void* p; + auto place = platform::CustomPlace(dev_type_, dev_id_); + auto device = platform::DeviceManager::GetDeviceWithPlace(place); + p = device->MemoryAllocate(size); + if (LIKELY(p)) { + VLOG(4) << "CustomAllocator::Alloc " << p << " size " << size; + *index = 0; + plug_alloc_size += size; + } else { + size_t avail, total; + + platform::DeviceManager::MemoryStats(place, &total, &avail); + PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( + "\n\nOut of memory error on %s %d. " + "total memory is %s, used memory is %s, " + "available memory is only %s.\n\n", + dev_type_, dev_id_, string::HumanReadableSize(total), + string::HumanReadableSize(total - avail), + string::HumanReadableSize(avail))); + } + return p; +} + +void CustomAllocator::Free(void* p, size_t size, size_t index) { + VLOG(4) << "CustomAllocator::Free " << p << " size " << size; + PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( + "The index should be 0, index is %d", index)); + PADDLE_ENFORCE_GE(plug_alloc_size, size, + platform::errors::InvalidArgument( + "The size of memory (%d) to free exceeds the size of " + "allocated gpu memory (%d)", + size, plug_alloc_size)); + plug_alloc_size -= size; + auto place = platform::CustomPlace(dev_type_, dev_id_); + auto device = platform::DeviceManager::GetDeviceWithPlace(place); + device->MemoryDeallocate(p, size); +} + +bool CustomAllocator::UseGpu() const { return true; } +#endif + } // namespace detail } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h index 975e2891b2472ad4aeb5c4a7d6f676c516350545..f6ff6282a614a3152dee5bd0e45ebe3b733fe14f 100644 --- a/paddle/fluid/memory/detail/system_allocator.h +++ b/paddle/fluid/memory/detail/system_allocator.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include // for size_t +#include namespace paddle { namespace memory { @@ -107,6 +108,23 @@ class MLUAllocator : public SystemAllocator { }; #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE +class CustomAllocator : public SystemAllocator { + public: + explicit CustomAllocator(const std::string& device_type, size_t dev_id) + : dev_type_(device_type), dev_id_(dev_id) {} + + virtual void* Alloc(size_t* index, size_t size); + virtual void Free(void* p, size_t size, size_t index); + virtual bool UseGpu() const; + + private: + size_t plug_alloc_size = 0; + std::string dev_type_; + size_t dev_id_; +}; +#endif + } // namespace detail } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index d2ab438fd2946701c70ea0bebf35ac33fbfb521e..d857b1c1671a789fa122a1d4115461fc0b5ba840 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -19,9 +19,88 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler.h" #include "paddle/pten/common/place.h" +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/platform/device/xpu/xpu_header.h" +#endif + +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/platform/device/mlu/mlu_info.h" +#endif + namespace paddle { namespace memory { +#ifdef PADDLE_WITH_CUSTOM_DEVICE +template <> +void Copy( + platform::CPUPlace dst_place, void* dst, platform::CustomPlace src_place, + const void* src, size_t num, void* stream) { + if (UNLIKELY(num == 0)) return; + + auto src_type = platform::PlaceHelper::GetDeviceType(src_place); + auto dst_type = platform::PlaceHelper::GetDeviceType(dst_place); + std::string msg = "Memcpy:" + src_type + "->" + dst_type; + platform::RecordEvent record_event(msg); + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place << ", stream=" << stream; + + platform::DeviceManager::SetDevice(src_place); + platform::stream::Stream stream_wrapper(src_place, stream); + platform::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyD2H( + dst, src, num, &stream_wrapper); +} + +template <> +void Copy( + platform::CustomPlace dst_place, void* dst, platform::CPUPlace src_place, + const void* src, size_t num, void* stream) { + if (UNLIKELY(num == 0)) return; + auto src_type = platform::PlaceHelper::GetDeviceType(src_place); + auto dst_type = platform::PlaceHelper::GetDeviceType(dst_place); + std::string msg = "Memcpy:" + src_type + "->" + dst_type; + platform::RecordEvent record_event(msg); + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place << ", stream=" << stream; + + platform::DeviceManager::SetDevice(dst_place); + platform::stream::Stream stream_wrapper(dst_place, stream); + platform::DeviceManager::GetDeviceWithPlace(dst_place)->MemoryCopyH2D( + dst, src, num, &stream_wrapper); +} + +template <> +void Copy( + platform::CustomPlace dst_place, void* dst, platform::CustomPlace src_place, + const void* src, size_t num, void* stream) { + if (UNLIKELY(num == 0)) return; + + auto src_type = platform::PlaceHelper::GetDeviceType(src_place); + auto dst_type = platform::PlaceHelper::GetDeviceType(dst_place); + std::string msg = "Memcpy:" + src_type + "->" + dst_type; + platform::RecordEvent record_event(msg); + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place << ", stream=" << stream; + + if (src_type == dst_type) { + platform::DeviceManager::SetDevice(src_place); + platform::stream::Stream stream_wrapper(src_place, stream); + + auto src_id = 
platform::PlaceHelper::GetDeviceId(src_place); + auto dst_id = platform::PlaceHelper::GetDeviceId(dst_place); + if (src_id == dst_id) { + platform::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyD2D( + dst, src, num, &stream_wrapper); + } else { + platform::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyP2P( + dst_place, dst, src, num, &stream_wrapper); + } + } else { + PADDLE_THROW(platform::errors::Unavailable( + "Copy between %s and %s is not supported.", src_type, dst_type)); + } +} +#endif // PADDLE_WITH_CUSTOM_DEVICE + template <> void Copy(platform::CPUPlace, void* dst, platform::CPUPlace, @@ -158,7 +237,7 @@ void Copy(platform::NPUPlace dst_place, void* dst, platform::CPUPlace src_place, const void* src, size_t num, - aclrtStream stream) { + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetNPUDeviceId(dst_place.device); @@ -168,7 +247,8 @@ void Copy(platform::NPUPlace dst_place, if (stream) { platform::RecordEvent record_event("NpuMemcpyAsync:CPU->NPU"); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, + reinterpret_cast(stream)); } else { // On NPU, async operation after sync operation is ok, while sync operation // after async is not ok, since the async operation may not done. @@ -186,7 +266,7 @@ void Copy(platform::CPUPlace dst_place, void* dst, platform::NPUPlace src_place, const void* src, size_t num, - aclrtStream stream) { + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetNPUDeviceId(src_place.device); @@ -196,7 +276,8 @@ void Copy(platform::CPUPlace dst_place, if (stream) { platform::RecordEvent record_event("NpuMemcpyAsync:NPU->CPU"); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, + reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(src_place))->Wait(); @@ -211,7 +292,7 @@ void Copy(platform::NPUPlace dst_place, void* dst, platform::NPUPlace src_place, const void* src, size_t num, - aclrtStream stream) { + void* stream) { if (UNLIKELY(num == 0)) return; VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " @@ -221,7 +302,7 @@ void Copy(platform::NPUPlace dst_place, if (stream) { platform::RecordEvent record_event("NpuMemcpyAsync(same_npu):NPU->NPU"); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, - stream); + reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -239,7 +320,7 @@ void Copy(platform::NPUPlace dst_place, // TODO(zhiqiu): support peer access? 
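// [Editor's note, not part of the original patch] The aclrtStream / gpuStream_t parameters in
// these Copy specializations are switched to void* (with a reinterpret_cast back to the concrete
// stream type at the call sites), presumably so that every backend, including the new custom
// devices, can share one device-agnostic Copy signature; the behaviour of each copy itself is
// unchanged.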
platform::RecordEvent record_event("NpuMemcpyPeerAsync:NPU->NPU"); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, - stream); + reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -284,7 +365,7 @@ void Copy( template <> void Copy( platform::NPUPinnedPlace dst_place, void* dst, platform::NPUPlace src_place, - const void* src, size_t num, aclrtStream stream) { + const void* src, size_t num, void* stream) { if (UNLIKELY(num == 0)) return; platform::SetNPUDeviceId(src_place.device); @@ -294,7 +375,8 @@ void Copy( if (stream) { platform::RecordEvent record_event("NpuMemcpyAsync:NPU->NPUPinned"); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, + reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(src_place))->Wait(); @@ -307,7 +389,7 @@ void Copy( template <> void Copy( platform::NPUPlace dst_place, void* dst, platform::NPUPinnedPlace src_place, - const void* src, size_t num, aclrtStream stream) { + const void* src, size_t num, void* stream) { if (UNLIKELY(num == 0)) return; platform::SetNPUDeviceId(dst_place.device); @@ -317,7 +399,8 @@ void Copy( if (stream) { platform::RecordEvent record_event("NpuMemcpyAsync:NPUPinned->NPU"); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, + reinterpret_cast(stream)); } else { // On NPU, async operation after sync operation is ok, while sync operation // after async is not ok, since the async operation may not done. @@ -379,6 +462,23 @@ void Copy(pten::Place dst_place, void* dst, platform::NPUPinnedPlace place_dst; platform::NPUPlace place_src(src_place.GetDeviceId()); return Copy(place_dst, dst, place_src, src, num, stream); +#ifdef PADDLE_WITH_CUSTOM_DEVICE + } else if (src_place.GetType() == pten::AllocationType::CPU && // NOLINT + dst_place.GetType() == pten::AllocationType::CUSTOM) { + platform::CPUPlace place_src; + platform::CustomPlace place_dst(dst_place); + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT + dst_place.GetType() == pten::AllocationType::CPU) { + platform::CustomPlace place_src(src_place); + platform::CPUPlace place_dst; + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT + dst_place.GetType() == pten::AllocationType::CUSTOM) { + platform::CustomPlace place_src(src_place); + platform::CustomPlace place_dst(dst_place); + return Copy(place_dst, dst, place_src, src, num, stream); +#endif } } @@ -492,7 +592,7 @@ inline void SyncCUDAStream() { template <> void Copy( platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place, - const void* src, size_t num, gpuStream_t stream) { + const void* src, size_t num, void* stream) { if (UNLIKELY(num == 0)) return; platform::SetDeviceId(src_place.device); @@ -501,9 +601,11 @@ void Copy( if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CPU"); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, stream); + platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, + reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); + platform::GpuMemcpyAsync(dst, 
src, num, cudaMemcpyDeviceToHost, + reinterpret_cast(stream)); #endif } else { platform::RecordEvent record_event("GpuMemcpySync:GPU->CPU"); @@ -522,7 +624,7 @@ void Copy( template <> void Copy( platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num, gpuStream_t stream) { + const void* src, size_t num, void* stream) { if (UNLIKELY(num == 0)) return; platform::SetDeviceId(dst_place.device); @@ -531,9 +633,11 @@ void Copy( if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync:CPU->GPU"); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, stream); + platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, + reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, + reinterpret_cast(stream)); #endif } else { platform::RecordEvent record_event("GpuMemcpySync:CPU->GPU"); @@ -552,7 +656,7 @@ void Copy( template <> void Copy( platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place, - const void* src, size_t num, gpuStream_t stream) { + const void* src, size_t num, void* stream) { if (UNLIKELY(num == 0)) return; VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " @@ -562,9 +666,11 @@ void Copy( if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync(same_gpu):GPU->GPU"); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToDevice, stream); + platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToDevice, + reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, + reinterpret_cast(stream)); #endif } else { platform::RecordEvent record_event("GpuMemcpySync(same_gpu):GPU->GPU"); @@ -578,7 +684,7 @@ void Copy( if (stream) { platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU"); platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device, - num, stream); + num, reinterpret_cast(stream)); } else { platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU"); platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device, @@ -620,8 +726,7 @@ void Copy( template <> void Copy( platform::CUDAPinnedPlace dst_place, void* dst, - platform::CUDAPlace src_place, const void* src, size_t num, - gpuStream_t stream) { + platform::CUDAPlace src_place, const void* src, size_t num, void* stream) { if (UNLIKELY(num == 0)) return; platform::SetDeviceId(src_place.device); VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " @@ -629,9 +734,11 @@ void Copy( if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned"); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, stream); + platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, + reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, + reinterpret_cast(stream)); #endif } else { platform::RecordEvent record_event("GpuMemcpySync:GPU->CUDAPinned"); @@ -647,7 +754,7 @@ template <> void Copy( platform::CUDAPlace dst_place, void* dst, platform::CUDAPinnedPlace src_place, const void* src, size_t num, - gpuStream_t stream) { + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetDeviceId(dst_place.device); @@ -656,9 
+763,11 @@ void Copy( if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU"); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, stream); + platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, + reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, + reinterpret_cast(stream)); #endif } else { platform::RecordEvent record_event("GpuMemcpySync:CUDAPinned->GPU"); @@ -674,7 +783,7 @@ void Copy( template <> void Copy(pten::Place dst_place, void* dst, pten::Place src_place, const void* src, - size_t num, gpuStream_t stream) { + size_t num, void* stream) { if (src_place.GetType() == pten::AllocationType::CPU && dst_place.GetType() == pten::AllocationType::CPU) { platform::CPUPlace place_dst, place_src; @@ -719,6 +828,23 @@ void Copy(pten::Place dst_place, void* dst, platform::CUDAPinnedPlace place_dst; platform::CUDAPlace place_src(src_place.GetDeviceId()); return Copy(place_dst, dst, place_src, src, num, stream); +#ifdef PADDLE_WITH_CUSTOM_DEVICE + } else if (src_place.GetType() == pten::AllocationType::CPU && // NOLINT + dst_place.GetType() == pten::AllocationType::CUSTOM) { + platform::CPUPlace place_src; + platform::CustomPlace place_dst(dst_place); + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT + dst_place.GetType() == pten::AllocationType::CPU) { + platform::CustomPlace place_src(src_place); + platform::CPUPlace place_dst; + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT + dst_place.GetType() == pten::AllocationType::CUSTOM) { + platform::CustomPlace place_src(src_place); + platform::CustomPlace place_dst(dst_place); + return Copy(place_dst, dst, place_src, src, num, stream); +#endif } } @@ -726,7 +852,7 @@ void Copy(pten::Place dst_place, void* dst, template <> void Copy(pten::CPUPlace dst_place, void* dst, pten::Place src_place, const void* src, - size_t num, gpuStream_t stream) { + size_t num, void* stream) { Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, stream); } @@ -735,7 +861,7 @@ template <> void Copy(pten::Place dst_place, void* dst, pten::CPUPlace src_place, const void* src, size_t num, - gpuStream_t stream) { + void* stream) { Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, stream); } @@ -743,7 +869,7 @@ void Copy(pten::Place dst_place, void* dst, template <> void Copy(pten::GPUPlace dst_place, void* dst, pten::Place src_place, const void* src, - size_t num, gpuStream_t stream) { + size_t num, void* stream) { Copy(pten::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, src_place, src, num, stream); } @@ -753,7 +879,7 @@ template <> void Copy(pten::Place dst_place, void* dst, pten::GPUPlace src_place, const void* src, size_t num, - gpuStream_t stream) { + void* stream) { Copy(dst_place, dst, pten::Place(src_place.GetType(), src_place.GetDeviceId()), src, num, stream); @@ -764,7 +890,7 @@ template <> void Copy(pten::GPUPinnedPlace dst_place, void* dst, pten::Place src_place, const void* src, size_t num, - gpuStream_t stream) { + void* stream) { Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, stream); } @@ -773,7 +899,7 @@ template <> void Copy(pten::Place dst_place, void* dst, pten::GPUPinnedPlace src_place, const void* src, size_t num, - gpuStream_t 
stream) { + void* stream) { Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, stream); } @@ -800,7 +926,7 @@ void Copy(platform::CPUPlace dst_place, void* dst, platform::MLUPlace src_place, const void* src, size_t num, - mluStream stream) { + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetMLUDeviceId(src_place.device); @@ -808,7 +934,8 @@ void Copy(platform::CPUPlace dst_place, VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by mlu stream(" << stream << ")"; platform::RecordEvent record_event("MLUMemcpyD2HAsync:MLU->CPU"); - platform::MLUMemcpyD2HAsync(dst, src, num, stream); + platform::MLUMemcpyD2HAsync(dst, src, num, + reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(src_place))->Wait(); @@ -825,7 +952,7 @@ void Copy(platform::MLUPlace dst_place, void* dst, platform::CPUPlace src_place, const void* src, size_t num, - mluStream stream) { + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetMLUDeviceId(dst_place.device); @@ -833,7 +960,8 @@ void Copy(platform::MLUPlace dst_place, VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by mlu stream(" << stream << ")"; platform::RecordEvent record_event("MLUMemcpyH2DAsync:CPU->MLU"); - platform::MLUMemcpyH2DAsync(dst, src, num, stream); + platform::MLUMemcpyH2DAsync(dst, src, num, + reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(src_place))->Wait(); @@ -850,7 +978,7 @@ void Copy(platform::MLUPlace dst_place, void* dst, platform::MLUPlace src_place, const void* src, size_t num, - mluStream stream) { + void* stream) { if (UNLIKELY(num == 0)) return; if (dst_place == src_place) { @@ -860,7 +988,8 @@ void Copy(platform::MLUPlace dst_place, << " to " << dst_place << " by mlu stream(" << stream << ")"; platform::RecordEvent record_event( "MLUMemcpyD2DAsync(same_mlu):MLU->MLU"); - platform::MLUMemcpyD2DAsync(dst, src, num, stream); + platform::MLUMemcpyD2DAsync(dst, src, num, + reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -877,7 +1006,7 @@ void Copy(platform::MLUPlace dst_place, << " to " << dst_place << " by mlu stream(" << stream << ")"; platform::RecordEvent record_event("MLUMemcpyPeerAsync:MLU->MLU"); platform::MLUMemcpyPeerAsync(dst, dst_place.device, src, src_place.device, - num, stream); + num, reinterpret_cast(stream)); } else { VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; @@ -892,7 +1021,7 @@ void Copy(platform::MLUPlace dst_place, template <> void Copy(pten::Place dst_place, void* dst, pten::Place src_place, const void* src, - size_t num, mluStream stream) { + size_t num, void* stream) { if (src_place.GetType() == pten::AllocationType::CPU && dst_place.GetType() == pten::AllocationType::CPU) { platform::CPUPlace place_dst, place_src; @@ -912,6 +1041,23 @@ void Copy(pten::Place dst_place, void* dst, platform::MLUPlace place_src(src_place.GetDeviceId()); platform::MLUPlace place_dst(dst_place.GetDeviceId()); return Copy(place_dst, dst, place_src, src, num, stream); +#ifdef PADDLE_WITH_CUSTOM_DEVICE + } else if (src_place.GetType() == pten::AllocationType::CPU && // NOLINT + dst_place.GetType() == pten::AllocationType::CUSTOM) { + platform::CPUPlace place_src; + platform::CustomPlace place_dst(dst_place); + return 
Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT + dst_place.GetType() == pten::AllocationType::CPU) { + platform::CustomPlace place_src(src_place); + platform::CPUPlace place_dst; + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT + dst_place.GetType() == pten::AllocationType::CUSTOM) { + platform::CustomPlace place_src(src_place); + platform::CustomPlace place_dst(dst_place); + return Copy(place_dst, dst, place_src, src, num, stream); +#endif } } @@ -919,7 +1065,7 @@ void Copy(pten::Place dst_place, void* dst, template <> void Copy(pten::MLUPlace dst_place, void* dst, pten::Place src_place, const void* src, - size_t num, mluStream stream) { + size_t num, void* stream) { Copy(pten::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, src_place, src, num, stream); } @@ -929,7 +1075,7 @@ template <> void Copy(pten::Place dst_place, void* dst, pten::MLUPlace src_place, const void* src, size_t num, - mluStream stream) { + void* stream) { Copy(dst_place, dst, pten::Place(src_place.GetType(), src_place.GetDeviceId()), src, num, stream); @@ -939,7 +1085,7 @@ void Copy(pten::Place dst_place, void* dst, template <> void Copy(pten::CPUPlace dst_place, void* dst, pten::Place src_place, const void* src, - size_t num, mluStream stream) { + size_t num, void* stream) { Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, stream); } @@ -948,7 +1094,7 @@ template <> void Copy(pten::Place dst_place, void* dst, pten::CPUPlace src_place, const void* src, size_t num, - mluStream stream) { + void* stream) { Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, stream); } @@ -1013,7 +1159,7 @@ void Copy(pten::Place dst_place, void* dst, } #endif #ifdef PADDLE_WITH_IPU - else if (src_place.GetType() == pten::AllocationType::CPU && + else if (src_place.GetType() == pten::AllocationType::CPU && // NOLINT dst_place.GetType() == pten::AllocationType::IPU) { platform::IPUPlace place_dst(dst_place.GetDeviceId()); platform::CPUPlace place_src; @@ -1048,5 +1194,48 @@ void Copy(pten::CPUPlace dst_place, void* dst, Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num); } +#if defined(PADDLE_WITH_CUSTOM_DEVICE) && !defined(PADDLE_WITH_CUDA) && \ + !defined(PADDLE_WITH_ASCEND_CL) && !defined(PADDLE_WITH_HIP) && \ + !defined(PADDLE_WITH_MLU) + +template <> +void Copy(pten::Place dst_place, void* dst, + pten::Place src_place, const void* src, + size_t num, void* stream) { + if (src_place.GetType() == pten::AllocationType::CPU && // NOLINT + dst_place.GetType() == pten::AllocationType::CUSTOM) { + platform::CPUPlace place_src; + platform::CustomPlace place_dst(dst_place); + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT + dst_place.GetType() == pten::AllocationType::CPU) { + platform::CustomPlace place_src(src_place); + platform::CPUPlace place_dst; + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT + dst_place.GetType() == pten::AllocationType::CUSTOM) { + platform::CustomPlace place_src(src_place); + platform::CustomPlace place_dst(dst_place); + return Copy(place_dst, dst, place_src, src, num, stream); + } +} + +template <> +void Copy(pten::CPUPlace dst_place, void* dst, + pten::Place src_place, const void* src, + size_t num, void* stream) { + 
Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, stream); +} + +// NOTE: only for (CPUPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace). +template <> +void Copy(pten::Place dst_place, void* dst, + pten::CPUPlace src_place, + const void* src, size_t num, + void* stream) { + Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, stream); +} +#endif + } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/memcpy.h b/paddle/fluid/memory/memcpy.h index 31d1a50e778f8c86400163a774af6dc04dce10ed..dd861a15b5c7b03e932eff8747668268b14618ef 100644 --- a/paddle/fluid/memory/memcpy.h +++ b/paddle/fluid/memory/memcpy.h @@ -36,66 +36,25 @@ namespace memory { template void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - -/** - * \brief Copy memory from one place to another place. - * - * \param[in] DstPlace Destination allocation place (CPU or GPU). - * \param[in] dst Destination memory address. - * \param[in] SrcPlace Source allocation place (CPU or GPU). - * \param[in] src Source memory address. - * \param[in] num memory size in bytes to copy. - * \param[in] stream CUDA stream. - * - * \note For GPU memory copy, CUDA stream need to be specified - * for asynchronously memory copy. - * - */ -template -void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num, - gpuStream_t stream); -#endif - -#ifdef PADDLE_WITH_ASCEND_CL -/** - * \brief Copy memory from one place to another place. - * - * \param[in] DstPlace Destination allocation place (CPU or NPU). - * \param[in] dst Destination memory address. - * \param[in] SrcPlace Source allocation place (CPU or NPU). - * \param[in] src Source memory address. - * \param[in] num memory size in bytes to copy. - * \param[in] stream NPU stream. - * - * \note For NPU memory copy, NPU stream need to be specified - * for asynchronously memory copy. - * - */ -template -void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num, - aclrtStream stream); -#endif - -#ifdef PADDLE_WITH_MLU /** * \brief Copy memory from one place to another place. * - * \param[in] DstPlace Destination allocation place (CPU or MLU). + * \param[in] DstPlace Destination allocation place (CPU or GPU or XPU or + * CustomDevice). * \param[in] dst Destination memory address. - * \param[in] SrcPlace Source allocation place (CPU or MLU). + * \param[in] SrcPlace Source allocation place (CPU or GPU or XPU or + * CustomDevice). * \param[in] src Source memory address. * \param[in] num memory size in bytes to copy. - * \param[in] stream MLU stream. + * \param[in] stream stream for asynchronously memory copy. * - * \note For MLU memory copy, MLU stream need to be specified - * for asynchronously memory copy. + * \note For GPU/XPU/CustomDevice memory copy, stream need to be specified + * for asynchronously memory copy, and type is restored in the + * implementation. 
* */ template void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num, - mluStream stream); -#endif - + void* stream); } // namespace memory } // namespace paddle diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index b87cdf6f6df19314342a24c98032d3856f0d3779..a279c76430f1b046a4c3ca05485824d5e3b62de2 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -1,5 +1,9 @@ include(operators) +# solve "math constants not defined" problems caused by the order of inclusion +# of and the definition of macro _USE_MATH_DEFINES +add_definitions(-D_USE_MATH_DEFINES) + # clean cache and pybind_file content first when rebuild unset(GLOB_OP_LIB CACHE) unset(OP_LIBRARY CACHE) diff --git a/paddle/fluid/operators/abs_op.cc b/paddle/fluid/operators/abs_op.cc index 5a09933e0ee24889f5192a8c84e449e09bdc147e..149a87fe32da16e850d5d64fb519c9bde7afef62 100644 --- a/paddle/fluid/operators/abs_op.cc +++ b/paddle/fluid/operators/abs_op.cc @@ -12,12 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/abs_op.h" - #include #include #include #include +#include "paddle/fluid/framework/op_registry.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -108,7 +107,7 @@ class AbsDoubleGradMaker : public framework::SingleGradOpMaker { protected: void Apply(GradOpPtr op) const override { - op->SetType("abs_grad_grad"); + op->SetType("abs_double_grad"); // input1: x op->SetInput("X", this->Input("X")); // input2: ddx @@ -159,37 +158,4 @@ REGISTER_OPERATOR(abs_grad, ops::AbsGradOp, ops::AbsDoubleGradMaker, ops::AbsDoubleGradMaker); -REGISTER_OPERATOR(abs_grad_grad, ops::AbsDoubleGradOp); - -REGISTER_OP_CPU_KERNEL( - abs, ops::AbsKernel, - ops::AbsKernel, - ops::AbsKernel, - ops::AbsKernel, - ops::AbsKernel>, - ops::AbsKernel>); - -REGISTER_OP_CPU_KERNEL( - abs_grad, ops::AbsGradKernel, - ops::AbsGradKernel, - ops::AbsGradKernel, - ops::AbsGradKernel, - ops::AbsGradKernel>, - ops::AbsGradKernel>); - -REGISTER_OP_CPU_KERNEL( - abs_grad_grad, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel>, - ops::AbsDoubleGradKernel>); +REGISTER_OPERATOR(abs_double_grad, ops::AbsDoubleGradOp); diff --git a/paddle/fluid/operators/abs_op.cu b/paddle/fluid/operators/abs_op.cu deleted file mode 100644 index 882c8547a04154778389bd7cd77531b63d19915b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/abs_op.cu +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
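Note on the unified memory::Copy declaration added to paddle/fluid/memory/memcpy.h above: every device backend now goes through a single overload that takes an opaque void* stream. The short sketch below is not part of this patch; it only illustrates how a caller might use that overload, with hypothetical places and stream handle, and it assumes a CUDA-enabled build where the implementation casts the handle back to gpuStream_t.
// Hedged usage sketch (not part of this patch): the unified Copy overload
// with an opaque void* stream instead of gpuStream_t/aclrtStream/mluStream.
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/place.h"

void CopyHostToDevice0(void* dst, const void* src, size_t num, void* stream) {
  paddle::platform::CUDAPlace gpu_place(0);  // hypothetical destination: GPU 0
  paddle::platform::CPUPlace cpu_place;      // source: host memory
  // The specialization restores the concrete stream type internally, so the
  // caller only passes an opaque handle (e.g. a cudaStream_t cast to void*).
  paddle::memory::Copy(gpu_place, dst, cpu_place, src, num, stream);
}
The same call shape applies to the CustomPlace specializations added in memcpy.cc; only the place arguments differ.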
- -#include "paddle/fluid/operators/abs_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -template -struct CudaAbsFunctor; - -template -struct CudaAbsFunctor>> { - __device__ __forceinline__ math::Real operator()(const T x) const { - return abs(x); - } -}; - -template -struct CudaAbsFunctor>> { - __device__ __forceinline__ T operator()(const T x) const { - return std::abs(x); - } -}; - -template -class AbsKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); - Tensor* out = context.Output("Out"); - out->mutable_data>(context.GetPlace()); - - auto& dev_ctx = - context.template device_context(); - std::vector ins = {x}; - std::vector outs = {out}; - auto functor = CudaAbsFunctor(); - paddle::operators::LaunchSameDimsElementwiseCudaKernel>( - dev_ctx, ins, &outs, functor); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - abs, ops::AbsKernel, - ops::AbsKernel, - ops::AbsKernel, - ops::AbsKernel, - ops::AbsKernel, - ops::AbsKernel>, - ops::AbsKernel>); - -REGISTER_OP_CUDA_KERNEL( - abs_grad, ops::AbsGradKernel, - ops::AbsGradKernel, - ops::AbsGradKernel, - ops::AbsGradKernel, - ops::AbsGradKernel, - ops::AbsGradKernel>, - ops::AbsGradKernel>); - -REGISTER_OP_CUDA_KERNEL( - abs_grad_grad, ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel>, - ops::AbsDoubleGradKernel>); diff --git a/paddle/fluid/operators/abs_op.h b/paddle/fluid/operators/abs_op.h deleted file mode 100644 index c79e83314f3bd39dcf6736e66c0b12956a2b0e81..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/abs_op.h +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
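As background for the add_definitions(-D_USE_MATH_DEFINES) line added to paddle/fluid/operators/CMakeLists.txt above, the standalone snippet below (not part of the patch) illustrates the inclusion-order pitfall the CMake comment refers to: on MSVC, M_PI and the other math constants are only exposed when _USE_MATH_DEFINES is defined before the first inclusion of the math header, so defining it globally through the build system sidesteps the ordering problem.
// Standalone illustration (not part of this patch) of the MSVC math-constants
// pitfall worked around by the global -D_USE_MATH_DEFINES definition.
#define _USE_MATH_DEFINES  // must precede the first inclusion of <cmath>
#include <cmath>
#include <cstdio>

int main() {
  // Without _USE_MATH_DEFINES (or the global compile definition), MSVC
  // reports "M_PI: undeclared identifier" here.
  std::printf("pi = %.6f\n", M_PI);
  return 0;
}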
- -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/math/complex_functors.h" -#include "paddle/fluid/platform/for_range.h" - -namespace paddle { -namespace operators { -using Tensor = framework::Tensor; - -template -class AbsKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); - Tensor* out = context.Output("Out"); - - auto numel = x->numel(); - auto* x_data = x->data(); - auto* out_data = out->mutable_data>( - context.GetPlace(), size_t(x->numel() * sizeof(math::Real))); - - auto& dev_ctx = context.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - math::AbsFunctor functor(x_data, out_data, numel); - for_range(functor); - } -}; - -template -class AbsGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - const framework::Tensor* d_out = - ctx.Input(framework::GradVarName("Out")); - const framework::Tensor* x = ctx.Input("X"); - framework::Tensor* d_x = - ctx.Output(framework::GradVarName("X")); - - auto numel = d_out->numel(); - auto* dout_data = d_out->data>(); - auto* x_data = x->data(); - auto* dx_data = d_x->mutable_data( - ctx.GetPlace(), static_cast(numel * sizeof(T))); - - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - math::AbsGradFunctor functor(dout_data, x_data, dx_data, numel); - for_range(functor); - } -}; - -template -class AbsDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - const framework::Tensor* ddx = ctx.Input("DDX"); - const framework::Tensor* x = ctx.Input("X"); - framework::Tensor* ddout = ctx.Output("DDOut"); - - auto numel = ddx->numel(); - auto* ddx_data = ddx->data(); - auto* x_data = x->data(); - auto* ddout_data = ddout->mutable_data( - ctx.GetPlace(), static_cast(numel * sizeof(T))); - - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - math::AbsGradGradFunctor functor(ddx_data, x_data, ddout_data, numel); - for_range(functor); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/abs_op_npu.cc b/paddle/fluid/operators/abs_op_npu.cc index cc2b0925c21e527c6835822161f2dcfd959b1b2d..30ec22cf6d868381a4a78585dc3620a2ea78d466 100644 --- a/paddle/fluid/operators/abs_op_npu.cc +++ b/paddle/fluid/operators/abs_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the Licnse. 
*/ -#include "paddle/fluid/operators/abs_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/angle_op.h b/paddle/fluid/operators/angle_op.h index 093a04f03df95681d7837d5c44717c678589e679..1e0dc803d76123573332b040bc29ece263c11d80 100644 --- a/paddle/fluid/operators/angle_op.h +++ b/paddle/fluid/operators/angle_op.h @@ -17,7 +17,7 @@ #define _USE_MATH_DEFINES #endif #include -#include "paddle/fluid/operators/math/complex_functors.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -26,81 +26,6 @@ namespace paddle { namespace operators { -namespace math { -template -struct AngleFunctor; - -// angel function for complex -template -struct AngleFunctor>> { - AngleFunctor(const T* input, Real* output, int64_t numel) - : input_(input), output_(output), numel_(numel) {} - - HOSTDEVICE void operator()(int64_t idx) const { - output_[idx] = arg(input_[idx]); - } - - const T* input_; - Real* output_; - int64_t numel_; -}; - -// angel function for real -template -struct AngleFunctor>> { - AngleFunctor(const T* input, T* output, int64_t numel) - : input_(input), output_(output), numel_(numel) {} - - HOSTDEVICE void operator()(int64_t idx) const { - output_[idx] = input_[idx] < static_cast(0) ? M_PI : 0; - } - - const T* input_; - T* output_; - int64_t numel_; -}; - -template -struct AngleGradFunctor; - -// angle grad for complex -template -struct AngleGradFunctor>> { - AngleGradFunctor(const math::Real* dout, const T* x, T* dx, int64_t numel) - : dout_(dout), x_(x), dx_(dx), numel_(numel) {} - - HOSTDEVICE void operator()(int64_t idx) const { - if (x_[idx] == T(0)) { - dx_[idx] = T(0); - } else { - const math::Real r_square = - x_[idx].real * x_[idx].real + x_[idx].imag * x_[idx].imag; - dx_[idx] = T(-dout_[idx] * x_[idx].imag / r_square, - dout_[idx] * x_[idx].real / r_square); - } - } - - const math::Real* dout_; - const T* x_; - T* dx_; - int64_t numel_; -}; - -// angle grad for real -template -struct AngleGradFunctor>> { - AngleGradFunctor(const math::Real* dout, const T* x, T* dx, int64_t numel) - : dout_(dout), x_(x), dx_(dx), numel_(numel) {} - - HOSTDEVICE void operator()(int64_t idx) const { dx_[idx] = 0; } - - const math::Real* dout_; - const T* x_; - T* dx_; - int64_t numel_; -}; -} // namespace math - using Tensor = framework::Tensor; template class AngleKernel : public framework::OpKernel { @@ -111,12 +36,12 @@ class AngleKernel : public framework::OpKernel { auto numel = x->numel(); auto* x_data = x->data(); - auto* out_data = out->mutable_data>( - context.GetPlace(), size_t(x->numel() * sizeof(math::Real))); + auto* out_data = out->mutable_data>( + context.GetPlace(), size_t(x->numel() * sizeof(pten::funcs::Real))); auto& dev_ctx = context.template device_context(); platform::ForRange for_range(dev_ctx, numel); - math::AngleFunctor functor(x_data, out_data, numel); + pten::funcs::AngleFunctor functor(x_data, out_data, numel); for_range(functor); } }; @@ -132,14 +57,14 @@ class AngleGradKernel : public framework::OpKernel { ctx.Output(framework::GradVarName("X")); auto numel = d_out->numel(); - auto* dout_data = d_out->data>(); + auto* dout_data = d_out->data>(); auto* x_data = x->data(); auto* dx_data = d_x->mutable_data( ctx.GetPlace(), static_cast(numel * sizeof(T))); auto& dev_ctx = ctx.template device_context(); platform::ForRange for_range(dev_ctx, numel); - 
math::AngleGradFunctor functor(dout_data, x_data, dx_data, numel); + pten::funcs::AngleGradFunctor functor(dout_data, x_data, dx_data, numel); for_range(functor); } }; diff --git a/paddle/fluid/operators/center_loss_op.h b/paddle/fluid/operators/center_loss_op.h index f134bd0cd3c7a565019c92bf08ee4c565ba67ac5..565b1cee9f7852caa1d8de3d4bac67d6669b327a 100644 --- a/paddle/fluid/operators/center_loss_op.h +++ b/paddle/fluid/operators/center_loss_op.h @@ -20,8 +20,8 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/functors.h" #include "paddle/fluid/platform/transform.h" + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/cholesky_solve_op.h b/paddle/fluid/operators/cholesky_solve_op.h index 5004aad7c59bc4ad194bf961aaad6326ad03fd38..2c92969225f3bcbb8008c24c21e4a6f80dd03fd4 100644 --- a/paddle/fluid/operators/cholesky_solve_op.h +++ b/paddle/fluid/operators/cholesky_solve_op.h @@ -64,7 +64,7 @@ void cholesky_solve_fn(const paddle::framework::ExecutionContext &ctx, // calculate u's conjugate for complex framework::Tensor u_conj(u_bst.type()); platform::ForRange u_for_range(dev_ctx, u_bst.numel()); - math::ConjFunctor u_functor( + pten::funcs::ConjFunctor u_functor( u_bst.data(), u_bst.numel(), u_conj.mutable_data(u_bst.dims(), dev_ctx.GetPlace())); u_for_range(u_functor); @@ -73,7 +73,7 @@ void cholesky_solve_fn(const paddle::framework::ExecutionContext &ctx, // calculate b's conjugate for complex framework::Tensor b_conj(b_bst.type()); platform::ForRange b_for_range(dev_ctx, b_bst.numel()); - math::ConjFunctor b_functor( + pten::funcs::ConjFunctor b_functor( b_bst.data(), b_bst.numel(), b_conj.mutable_data(b_bst.dims(), dev_ctx.GetPlace())); b_for_range(b_functor); @@ -113,7 +113,7 @@ void cholesky_solve_fn(const paddle::framework::ExecutionContext &ctx, // calculate out's conjugate for complex platform::ForRange out_for_range(dev_ctx, out->numel()); - math::ConjFunctor out_functor( + pten::funcs::ConjFunctor out_functor( out->data(), out->numel(), out->mutable_data(out->dims(), dev_ctx.GetPlace())); out_for_range(out_functor); @@ -173,7 +173,7 @@ class CholeskySolveGradKernel : public framework::OpKernel { // calculate out's conjugate for complex framework::Tensor out_conj(out->type()); platform::ForRange out_for_range(dev_ctx, out->numel()); - math::ConjFunctor out_functor( + pten::funcs::ConjFunctor out_functor( out->data(), out->numel(), out_conj.mutable_data(out->dims(), dev_ctx.GetPlace())); out_for_range(out_functor); @@ -195,7 +195,7 @@ class CholeskySolveGradKernel : public framework::OpKernel { framework::Tensor commonterm_conj(commonterm.type()); platform::ForRange commonterm_for_range( dev_ctx, commonterm.numel()); - math::ConjFunctor commonterm_functor( + pten::funcs::ConjFunctor commonterm_functor( commonterm.data(), commonterm.numel(), commonterm_conj.mutable_data(commonterm.dims(), dev_ctx.GetPlace())); diff --git a/paddle/fluid/operators/cinn/CMakeLists.txt b/paddle/fluid/operators/cinn/CMakeLists.txt index ed3a7598bdab6b288d280c13af79f16ff0a84e46..b80916616a18b7521d6ae32711ca247fdfd3e403 100644 --- a/paddle/fluid/operators/cinn/CMakeLists.txt +++ b/paddle/fluid/operators/cinn/CMakeLists.txt @@ -1,8 +1,10 @@ include(operators) -register_operators(EXCLUDES cinn_launch_op) +cc_library(cinn_op_helper SRCS cinn_op_helper.cc DEPS operator device_context) cc_library(cinn_launch_context SRCS 
cinn_launch_context.cc DEPS ddim lod_tensor scope cinn) -op_library(cinn_launch_op SRCS cinn_launch_op.cc cinn_launch_op.cu.cc DEPS string_helper cinn cinn_compiler cinn_launch_context) + +SET(CINN_OP_DEPS string_helper cinn cinn_compiler cinn_op_helper cinn_launch_context) +register_operators(DEPS ${CINN_OP_DEPS}) if (WITH_TESTING) cc_test(cinn_launch_context_test SRCS cinn_launch_context_test.cc DEPS ddim lod_tensor scope cinn_launch_context) diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..edf854a9c95b088225ac0eb225f056f0c531c393 --- /dev/null +++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc @@ -0,0 +1,109 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/cinn/cinn_instruction_run_op.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" +#include "paddle/fluid/operators/cinn/cinn_launch_context.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle::operators { + +class CinnInstructionRunOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInputs(kX), "Input", kX, "CinnInstructionRun"); + OP_INOUT_CHECK(ctx->HasOutputs(kOutputs), "Output", kOutputs, + "CinnInstructionRun"); + const CinnCompiledObject& compiled_object = + CinnCompiler::GetInstance()->GetCompiledObject( + ctx->Attrs().Get(kCachedIndex)); + + details::CinnLaunchContext* launch_context = + compiled_object.launch_context.get(); + std::vector output_args = ctx->Outputs(kOutputs); + std::vector output_dims(output_args.size()); + std::transform(output_args.begin(), output_args.end(), output_dims.begin(), + [launch_context](const std::string& var_name) { + cinn_buffer_t* buffer = + launch_context->GetCinnBufferOfVar(var_name); + return framework::DDim(buffer->dims, buffer->dimensions); + }); + ctx->SetOutputsDim(kOutputs, output_dims); + } +}; + +class CinnInstructionRunOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput(kX, + "(vector)" + "which are the input arguments of this cinn instruction") + .AsDuplicable(); + AddOutput(kOutputs, + "(vector)" + "which are the output arguments of this cinn instruction") + .AsDuplicable(); + AddAttr( + kCachedIndex, + "(int64_t)" + "the stored index of the cached compilation result in CinnCompiler," + "which is used to fetch the CinnCompiledObject where this cinn " + "instruction is included"); + AddAttr( + kInstructionIndex, + "(int64_t)" + "the index of this instruction to the cinn runtime program"); + AddComment(R"DOC( +CinnInstructionRun Operator. 
+ +This operator is used to launch a +CINN(https://github.com/PaddlePaddle/CINN/blob/develop/README.md) instruction execution. + +Both the input and output of this operator are a set of variables +which are the input and output arguments of the bound cinn instruction respectively. +In addition, an attribute named 'cached_index' must be set to locate the +CinnCompiledObject where the instruction is included, and 'instruction_index' is +used to fetch the instruction object from the compiled runtime program. + +It accomplishes the execution of the instruction according to the following steps: + 0. Set the shapes of the output variables at the InferShape function with + the compilation result. + 1. Fetch the cinn instruction bound to this operator by 'cached_index' + and 'instruction_index' from CinnCompiler. + 2. Prepare the input and output variables of the instruction in Paddle and share + their buffers with CINN by setting the 'memory' field of the corresponding cinn_buffer_t. + 3. Launch CINN runtime to execute the instruction. + +)DOC"); + } +}; + +} // namespace paddle::operators + +namespace ops = paddle::operators; +using CPUDeviceContext = paddle::platform::CPUDeviceContext; +REGISTER_OPERATOR( + cinn_instruction_run, ops::CinnInstructionRunOp, + ops::CinnInstructionRunOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + cinn_instruction_run, + ops::CinnInstructionRunOpKernel, + ops::CinnInstructionRunOpKernel, + ops::CinnInstructionRunOpKernel, + ops::CinnInstructionRunOpKernel, + ops::CinnInstructionRunOpKernel); diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op.cu.cc b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..a1b00a182067b909a08fa50744bacfde39c5c830 --- /dev/null +++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cu.cc @@ -0,0 +1,26 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/cinn/cinn_instruction_run_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace ops = paddle::operators; +using CUDADeviceContext = paddle::platform::CUDADeviceContext; +REGISTER_OP_CUDA_KERNEL( + cinn_instruction_run, + ops::CinnInstructionRunOpKernel, + ops::CinnInstructionRunOpKernel, + ops::CinnInstructionRunOpKernel, + ops::CinnInstructionRunOpKernel, + ops::CinnInstructionRunOpKernel); diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op.h b/paddle/fluid/operators/cinn/cinn_instruction_run_op.h new file mode 100644 index 0000000000000000000000000000000000000000..8847faa944bef228e418c347c486fa2b42090eed --- /dev/null +++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op.h @@ -0,0 +1,76 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "cinn/hlir/framework/graph_compiler.h" +#include "cinn/hlir/framework/instruction.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" +#include "paddle/fluid/operators/cinn/cinn_launch_context.h" +#include "paddle/fluid/operators/cinn/cinn_op_helper.h" + +namespace paddle::operators { + +using CinnInstruction = ::cinn::hlir::framework::Instruction; +using CinnCompiledObject = framework::paddle2cinn::CinnCompiledObject; +using CinnCompiler = framework::paddle2cinn::CinnCompiler; + +template +class CinnInstructionRunOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // step 1: fetch the cinn instruction bound to this operator + auto cached_index = ctx.template Attr(kCachedIndex); + auto ins_index = ctx.template Attr(kInstructionIndex); + const CinnCompiledObject& compiled_object = + CinnCompiler::GetInstance()->GetCompiledObject(cached_index); + const std::vector>& instructions = + compiled_object.runtime_program->GetRunInstructions(); + PADDLE_ENFORCE_LT(ins_index, instructions.size(), + platform::errors::InvalidArgument( + "Index(%ld) > instructions.size(%ld).", ins_index, + instructions.size())); + auto&& instruction = instructions.at(ins_index); + + // step 2: prepare the input and output arguments of the instruction + details::CinnLaunchContext* launch_context = + compiled_object.launch_context.get(); + auto share_argument_buffer_fn = [launch_context, + &ctx](const std::string& var_name) { + cinn_buffer_t* buffer = launch_context->GetCinnBufferOfVar(var_name); + framework::Variable* var = ctx.scope().GetVar(var_name); + auto* tensor = var->template GetMutable(); + buffer->memory = + reinterpret_cast(tensor->mutable_data(ctx.GetPlace())); + }; + std::vector in_args = ctx.InputNames(kX); + std::for_each(in_args.begin(), in_args.end(), share_argument_buffer_fn); + std::vector out_args = ctx.OutputNames(kOutputs); + std::for_each(out_args.begin(), out_args.end(), share_argument_buffer_fn); + + // step 3: launch CINN runtime to execute the instruction + // TODO(CtfGo): simplify format of arguments package as a vector in CINN + // and update this usage call + instruction->Run(&launch_context->FinalizeArguments(), false, + details::GetStream(ctx)); + } +}; + +} // namespace paddle::operators diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.cc b/paddle/fluid/operators/cinn/cinn_launch_context.cc index fa93bf00f2ac0dcd0c3dcb778357dda7d9ce3518..282a8f69e4ec5c194bf5226132ced33ad02ac676 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context.cc @@ -24,12 +24,31 @@ CinnLaunchContext::CinnLaunchContext( const std::unordered_map& paddle2cinn_varmap, const std::shared_ptr& cinn_scope) : paddle2cinn_varmap_(paddle2cinn_varmap), cinn_scope_(cinn_scope) { + // generate all names of cinn used variables auto var_names = cinn_scope_->var_names(); 
cinn_variable_names_.reserve(var_names.size()); std::transform( var_names.begin(), var_names.end(), std::inserter(cinn_variable_names_, cinn_variable_names_.end()), [](const auto& name_view) { return std::string(name_view.data()); }); + // build the variable name map of cinn2paddle + for (const auto& x : paddle2cinn_varmap_) { + auto res = cinn2paddle_varmap_.emplace(x.second, x.first); + PADDLE_ENFORCE_EQ( + res.second, true, + platform::errors::InvalidArgument( + "Cinn variable(%s) maps to more than one paddle variable(%s,%s)", + x.second, res.first->second, x.first)); + } + // supplement the relations of the remain variables not appearing in above + // map, + // they are internal variables and here we use the name from cinn compiled. + for (const auto& var_name : cinn_variable_names_) { + if (!cinn2paddle_varmap_.count(var_name)) { + cinn2paddle_varmap_.emplace(var_name, var_name); + paddle2cinn_varmap_.emplace(var_name, var_name); + } + } } void CinnLaunchContext::UpdateCapturedEnv(const framework::Scope& scope, @@ -189,6 +208,20 @@ CinnLaunchContext::FinalizeArguments() const { return name2argument_; } +cinn_buffer_t* CinnLaunchContext::GetCinnBufferOfVar( + const std::string& paddle_var_name) { + auto res = paddle2cinn_varmap_.find(paddle_var_name); + PADDLE_ENFORCE_NE( + res, paddle2cinn_varmap_.end(), + platform::errors::InvalidArgument( + "Variable(%s) not found in compilation result", paddle_var_name)); + auto it = name2argument_.find(res->second); + PADDLE_ENFORCE_NE(it, name2argument_.end(), + platform::errors::InvalidArgument( + "Argument(%s) not be initialized", res->second)); + return static_cast(it->second); +} + } // namespace details } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.h b/paddle/fluid/operators/cinn/cinn_launch_context.h index 7b71d77d8b8860264872b88d86a5cfe7ae82be96..71ddeb35420b52c12787cb3873fbe5b7d4f7b8c1 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.h +++ b/paddle/fluid/operators/cinn/cinn_launch_context.h @@ -64,6 +64,8 @@ class CinnLaunchContext { // Finalize all execution arguments and return them const std::map& FinalizeArguments() const; + cinn_buffer_t* GetCinnBufferOfVar(const std::string& paddle_var_name); + private: // Get CinnTensor with CINN variable name CinnTensor GetCinnTensor(const std::string& var_name); @@ -84,19 +86,22 @@ class CinnLaunchContext { std::unique_ptr cached_temp_scope_ = nullptr; // a variable name map from paddle to cinn - const std::unordered_map& paddle2cinn_varmap_; + std::unordered_map paddle2cinn_varmap_; + // a variable name map from cinn to paddle + std::unordered_map cinn2paddle_varmap_; // the variable scope of cinn const std::shared_ptr cinn_scope_; - // all variables used by compiled executable program + // all names of cinn variables used by compiled executable program std::unordered_set cinn_variable_names_; // because a cinn_pod_value_t does not own the cinn_buffer_t object, // an extra stroage is necessary to keep the object and it can - // not be released until runtime program finish execution. + // not be released until the runtime program finish execution. std::vector> hold_buffers_; - // name to execution argument + // this map saves all execution arguments with their cinn names as key, + // and it is passed to the Execute interface of a cinn runtime program. 
std::map name2argument_; }; diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.cc b/paddle/fluid/operators/cinn/cinn_launch_op.cc index cd17c947228d6201b551410172246498f75f3b12..d918b7216c4d2f1e8cd0891d3a0dc0a5d2ed4339 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op.cc @@ -13,10 +13,11 @@ // limitations under the License. #include "paddle/fluid/operators/cinn/cinn_launch_op.h" - #include #include - +#include "cinn/hlir/framework/graph_compiler.h" +#include "cinn/runtime/cinn_runtime.h" +#include "cinn/runtime/flags.h" #include "paddle/fluid/string/string_helper.h" DECLARE_bool(cudnn_deterministic); diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc b/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc index ea36a19202ef0696918792210f20dd2c2818e700..9dfd53834e937e201a76d44d4a841f4625c24b19 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc @@ -13,36 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/cinn/cinn_launch_op.h" -#include -#include -#include "cinn/runtime/cinn_runtime.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/platform/device/gpu/gpu_types.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" - -#ifdef PADDLE_WITH_CUDA -#include -#endif - -namespace paddle { -namespace operators { -namespace details { - -#ifdef PADDLE_WITH_CUDA -template <> -void* GetStream( - const framework::ExecutionContext& ctx) { - const auto& dev_ctx = - ctx.template device_context(); - return dev_ctx.stream(); -} -#endif - -} // namespace details -} // namespace operators -} // namespace paddle /* see [Why use single type kernel] */ REGISTER_OP_CUDA_KERNEL(cinn_launch, diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.h b/paddle/fluid/operators/cinn/cinn_launch_op.h index 23dfa9d84c01203f3edbef6216cccbc340ffda52..bd9b30f559bdb5e6af3081125d9278ad21046cd7 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.h +++ b/paddle/fluid/operators/cinn/cinn_launch_op.h @@ -18,27 +18,18 @@ #include #include #include -#include "cinn/hlir/framework/graph_compiler.h" -#include "cinn/hlir/framework/scope.h" -#include "cinn/runtime/cinn_runtime.h" -#include "cinn/runtime/flags.h" +#include "cinn/common/target.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" #include "paddle/fluid/operators/cinn/cinn_launch_context.h" +#include "paddle/fluid/operators/cinn/cinn_op_helper.h" namespace paddle { namespace operators { -constexpr char kX[] = "X"; -constexpr char kNoNeedBufferX[] = "NoNeedBufferX"; -constexpr char kOutputs[] = "Out"; -constexpr char kCompilationKey[] = "compilation_key"; - using LoDTensor = framework::LoDTensor; -using CinnTensor = ::cinn::hlir::framework::Tensor; -using CinnScope = ::cinn::hlir::framework::Scope; using CinnCompiler = framework::paddle2cinn::CinnCompiler; using CinnCompiledObject = framework::paddle2cinn::CinnCompiledObject; @@ -57,17 +48,6 @@ void LaunchCinnExecution(const CinnCompiledObject& compiled_obj, // Set cinn FLAGS (such as FLAGS_cinn_cudnn_deterministic) with paddle's FLAGS. 
void SetCinnRuntimeFlags(); -template -void* GetStream(const framework::ExecutionContext& ctx) { - return nullptr; -} - -#ifdef PADDLE_WITH_CUDA -template <> -void* GetStream( - const framework::ExecutionContext& ctx); -#endif - } // namespace details template diff --git a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc index 849cdb715049ba235f737117e0769ec0a9105942..b4cd91ea8a4bce6f8a2bbeb01d15f03cb5053de7 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc @@ -28,7 +28,7 @@ limitations under the License. */ #include "paddle/fluid/platform/init.h" USE_OP(cinn_launch); -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/cinn/cinn_op_helper.cc b/paddle/fluid/operators/cinn/cinn_op_helper.cc new file mode 100644 index 0000000000000000000000000000000000000000..3fb9c822c77c4ddb631a31610af0cc950c7533a8 --- /dev/null +++ b/paddle/fluid/operators/cinn/cinn_op_helper.cc @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/cinn/cinn_op_helper.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle::operators::details { + +#ifdef PADDLE_WITH_CUDA +template <> +void* GetStream( + const framework::ExecutionContext& ctx) { + const auto& dev_ctx = + ctx.template device_context(); + return dev_ctx.stream(); +} +#endif + +} // namespace paddle::operators::details diff --git a/paddle/fluid/operators/cinn/cinn_op_helper.h b/paddle/fluid/operators/cinn/cinn_op_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..e542134b94689692e88382b6506a9d87d4708fa2 --- /dev/null +++ b/paddle/fluid/operators/cinn/cinn_op_helper.h @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include "paddle/fluid/framework/operator.h" + +// We define some common names or utility functions +// for operators related to cinn in this file +namespace paddle::operators { + +// input params, output params and attributes +constexpr char kX[] = "X"; +constexpr char kNoNeedBufferX[] = "NoNeedBufferX"; +constexpr char kOutputs[] = "Out"; +constexpr char kCompilationKey[] = "compilation_key"; +constexpr char kCachedIndex[] = "cached_index"; +constexpr char kInstructionIndex[] = "instruction_index"; + +// utility functions +namespace details { + +template +void* GetStream(const framework::ExecutionContext& ctx) { + return nullptr; +} + +#ifdef PADDLE_WITH_CUDA +template <> +void* GetStream( + const framework::ExecutionContext& ctx); +#endif + +} // namespace details +} // namespace paddle::operators diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc index 9d27d99b3ab35835330e629f21502d05d635103a..199e2b6bc7fc6cb3ec82c550058c8df14980fc01 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc @@ -32,7 +32,7 @@ limitations under the License. */ namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, NPU); USE_OP_DEVICE_KERNEL(c_sync_calc_stream, NPU); diff --git a/paddle/fluid/operators/complex_op.h b/paddle/fluid/operators/complex_op.h index 3dd5ea9f7e83dbfaa353378cfee10231c445c222..fb324277fb004b93718793346957b9adbb10143b 100644 --- a/paddle/fluid/operators/complex_op.h +++ b/paddle/fluid/operators/complex_op.h @@ -16,8 +16,8 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/platform/complex.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/complex_view_op.h b/paddle/fluid/operators/complex_view_op.h index 9a8d89db4020828cc9fdba90dd99ab7e5395864b..98ba732e2400421073fdbefc76ff4207fe5a9a8d 100644 --- a/paddle/fluid/operators/complex_view_op.h +++ b/paddle/fluid/operators/complex_view_op.h @@ -17,9 +17,9 @@ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.cc b/paddle/fluid/operators/controlflow/conditional_block_op.cc index 68720e70b09ad6098da4fd59c50bbb89a56c9dc7..a044506cef4bb480d30bc87f3b556560a4d61064 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.cc +++ b/paddle/fluid/operators/controlflow/conditional_block_op.cc @@ -265,6 +265,18 @@ class ConditionalBlockGradInferShape : public framework::InferShapeBase { } }; +class ConditionalBlockGradInferVarType : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + // NOTE(Aurelius84): VarType of Output is LoDTensor by default. 
In case of + // Input is {Tensor, LoDTensorArray}, we need synchronous the Input's + // VarType into Input@GRAD to avoid generating {Tensor, Tensor} as + // Input@GRAD. + ctx->SyncTypeAndDataType(ConditionalOp::kInputs, + framework::GradVarName(ConditionalOp::kInputs)); + } +}; + template class ConditionalBlockGradMaker : public framework::SingleGradOpMaker { public: @@ -300,4 +312,5 @@ REGISTER_OPERATOR(conditional_block, ops::ConditionalBlockOp, ops::ConditionalBlockOpProtoMaker, ops::ConditionalBlockGradMaker); REGISTER_OPERATOR(conditional_block_grad, ops::ConditionalBlockGradOp, - ops::ConditionalBlockGradInferShape); + ops::ConditionalBlockGradInferShape, + ops::ConditionalBlockGradInferVarType); diff --git a/paddle/fluid/operators/cumprod_op.cu b/paddle/fluid/operators/cumprod_op.cu index 2b69db7d24a128de77e8508970705ac3b9a1fb66..3a63bd99ad57d8c91235ddf50219e6b015321972 100644 --- a/paddle/fluid/operators/cumprod_op.cu +++ b/paddle/fluid/operators/cumprod_op.cu @@ -14,9 +14,9 @@ #include #include "paddle/fluid/operators/cumprod_op.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/operators/math/inclusive_scan.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" namespace paddle { namespace operators { @@ -243,12 +243,12 @@ class CumprodGradOpCUDAKernel : public framework::OpKernel { platform::ForRange for_range_x(dev_ctx, numel); - math::ConjFunctor functor_x(x_data, numel, x_data_conj); + pten::funcs::ConjFunctor functor_x(x_data, numel, x_data_conj); for_range_x(functor_x); platform::ForRange for_range_y(dev_ctx, numel); - math::ConjFunctor functor_y(y_data, numel, y_data_conj); + pten::funcs::ConjFunctor functor_y(y_data, numel, y_data_conj); for_range_y(functor_y); x_data_deal = x_data_conj; y_data_deal = y_data_conj; diff --git a/paddle/fluid/operators/cumprod_op.h b/paddle/fluid/operators/cumprod_op.h index d8c3c1febdcf3ef336bc7c68ad9636bd9989c22e..15c3d514331b671817c97b0036a00b1279263dbb 100644 --- a/paddle/fluid/operators/cumprod_op.h +++ b/paddle/fluid/operators/cumprod_op.h @@ -18,8 +18,8 @@ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" namespace paddle { namespace operators { @@ -124,12 +124,12 @@ class CumprodGradOpCPUKernel : public framework::OpKernel { platform::ForRange for_range_x(dev_ctx, numel); - math::ConjFunctor functor_x(x_data, numel, x_data_conj); + pten::funcs::ConjFunctor functor_x(x_data, numel, x_data_conj); for_range_x(functor_x); platform::ForRange for_range_out(dev_ctx, numel); - math::ConjFunctor functor_out(out_data, numel, out_data_conj); + pten::funcs::ConjFunctor functor_out(out_data, numel, out_data_conj); for_range_out(functor_out); x_data_deal = x_data_conj; diff --git a/paddle/fluid/operators/detection/prior_box_op_xpu.cc b/paddle/fluid/operators/detection/prior_box_op_xpu.cc index bab394689546e495a0f7892870c071f0fb7b3f06..c39f702a48644e529c429854125c386af2f3224d 100644 --- a/paddle/fluid/operators/detection/prior_box_op_xpu.cc +++ b/paddle/fluid/operators/detection/prior_box_op_xpu.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/operators/detection/prior_box_op.h" +#include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { namespace operators { @@ -81,21 +82,17 @@ class PriorBoxOpXPUKernel : public framework::OpKernel { dev_ctx.x_context(), boxes_data, aspect_ratios_param, min_sizes_param, max_sizes_param, feature_height, feature_width, img_height, img_width, offset, step_height, step_width, clip, min_max_aspect_ratios_order); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU gen_prior_box kernel return wrong value[%d %s]", - ret, XPUAPIErrorMsg[ret])); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "gen_prior_box"); int box_num = feature_height * feature_width * num_priors; int vlen = variances.size(); + std::vector var_cpu(vlen * box_num); for (int i = 0; i < box_num; ++i) { - ret = xpu_memcpy(vars_data + i * vlen, variances.data(), vlen * sizeof(K), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, platform::errors::External( - "XPU xpu_memcpy return wrong " - "value[%d %s] in prior_box.", - ret, XPUAPIErrorMsg[ret])); + std::copy(variances.begin(), variances.end(), var_cpu.begin() + i * vlen); } + ret = xpu_memcpy(vars_data, var_cpu.data(), var_cpu.size() * sizeof(K), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + PADDLE_ENFORCE_XPU_SUCCESS(ret); } }; diff --git a/paddle/fluid/operators/determinant_op.h b/paddle/fluid/operators/determinant_op.h index 90443e0928ba2535498122ea00df479b83acb56f..1da680fbd953a7f86cef4e9db13d1e63336f9a29 100644 --- a/paddle/fluid/operators/determinant_op.h +++ b/paddle/fluid/operators/determinant_op.h @@ -19,11 +19,11 @@ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/operators/math/matrix_inverse.h" #include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" namespace paddle { namespace operators { @@ -395,7 +395,7 @@ class SlogDeterminantGradKernel : public framework::OpKernel { size_t(numel * sizeof(T))); platform::ForRange for_range(dev_ctx, numel); - math::ConjFunctor functor(inverse_A.data(), numel, conj_data); + pten::funcs::ConjFunctor functor(inverse_A.data(), numel, conj_data); for_range(functor); VLOG(3) << "inverse(A).conj() dims: " << conj_inverse_A.dims(); diff --git a/paddle/fluid/operators/dot_op.h b/paddle/fluid/operators/dot_op.h index c5d43ef01264b097ffb9c17bc716bd3dcedf8ce0..52fc26342a1b441dff032c13e0b423504a77265e 100644 --- a/paddle/fluid/operators/dot_op.h +++ b/paddle/fluid/operators/dot_op.h @@ -16,8 +16,8 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" // only can include the headers in paddle/pten/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" diff --git a/paddle/fluid/operators/eig_op.h b/paddle/fluid/operators/eig_op.h index b4b6e2ce2fc5664e8016e7815e037933d461e80d..f822802d305e9b03ac9e00604121033c7a1faa6d 100644 --- a/paddle/fluid/operators/eig_op.h +++ b/paddle/fluid/operators/eig_op.h @@ -17,12 +17,12 @@ #include #include #include -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/operators/math/lapack_function.h" #include "paddle/fluid/operators/math/matrix_solve.h" #include 
"paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" #include "paddle/pten/kernels/funcs/math_function.h" #define EPSILON 1e-6 @@ -87,18 +87,19 @@ void LapackEig(Tensor* input, Tensor* values, Tensor* vectors, int info, int values_stride = values->dims()[values->dims().size() - 1]; Tensor rwork; - math::Real* rwork_data = nullptr; + pten::funcs::Real* rwork_data = nullptr; rwork.Resize(framework::make_ddim({lda * 2})); - rwork_data = rwork.mutable_data>(context.GetPlace()); + rwork_data = rwork.mutable_data>(context.GetPlace()); // call lapackEig once to compute the size of work; T computed_work_size; - math::lapackEig>( + math::lapackEig>( jobvl, jobvr, order, input_data, lda, values_data, lvector_data, ldvl, rvector_data, ldvr, &computed_work_size, lwork, rwork_data, &info); - lwork = std::max(1, static_cast(math::Real(computed_work_size))); + lwork = std::max( + 1, static_cast(pten::funcs::Real(computed_work_size))); Tensor work; work.Resize(framework::make_ddim({lwork})); T* work_data = work.mutable_data(context.GetPlace()); @@ -108,7 +109,7 @@ void LapackEig(Tensor* input, Tensor* values, Tensor* vectors, int info, T* current_values = &values_data[i * values_stride]; T* current_rvectors = &rvector_data[i * matrix_stride]; - math::lapackEig>( + math::lapackEig>( jobvl, jobvr, order, current_matrix, lda, current_values, lvector_data, ldvl, current_rvectors, ldvr, work_data, lwork, rwork_data, &info); PADDLE_ENFORCE_EQ( @@ -207,26 +208,27 @@ class EigKernel : public framework::OpKernel { origin_dim.push_back(last_item * 2); framework::DDim big_dim = framework::make_ddim(origin_dim); - real_values.mutable_data>(big_dim, context.GetPlace()); - real_vectors.mutable_data>(x->dims(), context.GetPlace()); + real_values.mutable_data>(big_dim, + context.GetPlace()); + real_vectors.mutable_data>(x->dims(), + context.GetPlace()); - ApplyEigKernel>(*x, &real_values, - &real_vectors, context); - auto dito = - math::DeviceIndependenceTensorOperations, - Tout>(context); + ApplyEigKernel>( + *x, &real_values, &real_vectors, context); + auto dito = math::DeviceIndependenceTensorOperations< + DeviceContext, pten::funcs::Real, Tout>(context); // 1. extract real part & imag part from real_values Tensor real_part = dito.Slice(real_values, {-1}, {0}, {order}); Tensor imag_part = dito.Slice(real_values, {-1}, {order}, {order * 2}); // 2. 
construct complex values - auto* real_part_data = real_part.data>(); - auto* imag_part_data = imag_part.data>(); + auto* real_part_data = real_part.data>(); + auto* imag_part_data = imag_part.data>(); int out_values_numel = out_values->numel(); platform::ForRange for_range( context.template device_context(), out_values_numel); - math::RealImagToComplexFunctor functor( + pten::funcs::RealImagToComplexFunctor functor( real_part_data, imag_part_data, out_values->mutable_data(context.GetPlace()), out_values_numel); for_range(functor); @@ -235,7 +237,7 @@ class EigKernel : public framework::OpKernel { Tensor real_vector_trans = dito.Transpose(real_vectors); Tensor out_vectors_trans; out_vectors_trans.mutable_data(x->dims(), context.GetPlace()); - ConstructComplexVectors, Tout>( + ConstructComplexVectors, Tout>( &out_vectors_trans, *out_values, real_vector_trans, context, batch_count, order); TransposeTwoAxis(out_vectors_trans, out_vectors, @@ -271,14 +273,14 @@ void ComputeBackwardForComplexInput( // turn diag_unsqueezed into complex auto numel = diag_unsqueezed.numel(); Tensor diag_unsqueezed_complex; - auto* data_diag_un = diag_unsqueezed.data>(); + auto* data_diag_un = diag_unsqueezed.data>(); auto* data_diag_un_com = diag_unsqueezed_complex.mutable_data( diag_unsqueezed.dims(), context.GetPlace(), static_cast(numel * sizeof(Tout))); auto& dev_ctx = context.template device_context(); platform::ForRange for_range(dev_ctx, numel); - math::RealToComplexFunctor functor(data_diag_un, data_diag_un_com, - numel); + pten::funcs::RealToComplexFunctor functor(data_diag_un, + data_diag_un_com, numel); for_range(functor); // real tensor multiply complex tensor in broadcast manner Tensor res1 = dito.RealMulComplex(V, diag_unsqueezed_complex); diff --git a/paddle/fluid/operators/eigh_op.h b/paddle/fluid/operators/eigh_op.h index ad9b0f598311b1f44ae3f7ec34bda8489e422fc7..77afaf681da939cdc089325d5e210e5960142f55 100644 --- a/paddle/fluid/operators/eigh_op.h +++ b/paddle/fluid/operators/eigh_op.h @@ -40,7 +40,7 @@ template class EighGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - using ValueType = math::Real; + using ValueType = pten::funcs::Real; auto& x_grad = *ctx.Output(framework::GradVarName("X")); x_grad.mutable_data(ctx.GetPlace()); auto& output_w = *ctx.Input("Eigenvalues"); diff --git a/paddle/fluid/operators/eigvals_op.h b/paddle/fluid/operators/eigvals_op.h index d825833b0242240cb0bdbeaf1e85057fe23dc618..a069ea164c94c5c9ec50ece0d28042347ff4777f 100644 --- a/paddle/fluid/operators/eigvals_op.h +++ b/paddle/fluid/operators/eigvals_op.h @@ -20,9 +20,9 @@ #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/allocation/allocator.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/operators/math/lapack_function.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" namespace paddle { namespace operators { @@ -48,7 +48,7 @@ struct PaddleComplex< template using PaddleCType = typename PaddleComplex::type; template -using Real = typename math::Real; +using Real = typename pten::funcs::Real; static void SpiltBatchSquareMatrix(const Tensor& input, std::vector* output) { @@ -118,7 +118,7 @@ LapackEigvals(const framework::ExecutionContext& ctx, const Tensor& input, platform::ForRange for_range( ctx.template device_context(), n_dim); - math::RealImagToComplexFunctor> functor( + 
pten::funcs::RealImagToComplexFunctor> functor( w_data, w_data + n_dim, output->template data>(), n_dim); for_range(functor); } @@ -143,7 +143,7 @@ LapackEigvals(const framework::ExecutionContext& ctx, const Tensor& input, required_work_mem, work_mem)); int64_t rwork_mem = rwork->memory_size(); - int64_t required_rwork_mem = (n_dim << 1) * sizeof(Real); + int64_t required_rwork_mem = (n_dim << 1) * sizeof(pten::funcs::Real); PADDLE_ENFORCE_GE( rwork_mem, required_rwork_mem, platform::errors::InvalidArgument( @@ -153,11 +153,11 @@ LapackEigvals(const framework::ExecutionContext& ctx, const Tensor& input, required_rwork_mem, rwork_mem)); int info = 0; - math::lapackEig>( + math::lapackEig>( 'N', 'N', static_cast(n_dim), a.template data(), static_cast(n_dim), output->template data(), NULL, 1, NULL, 1, work->template data(), static_cast(work_mem / sizeof(T)), - rwork->template data>(), &info); + rwork->template data>(), &info); std::string name = "framework::platform::dynload::cgeev_"; if (framework::TransToProtoVarType(input.dtype()) == @@ -187,10 +187,10 @@ class EigvalsKernel : public framework::OpKernel { // query workspace size T qwork; int info; - math::lapackEig>('N', 'N', static_cast(n_dim), - input_matrices[0].template data(), - static_cast(n_dim), NULL, NULL, 1, NULL, 1, - &qwork, -1, static_cast*>(NULL), &info); + math::lapackEig>( + 'N', 'N', static_cast(n_dim), input_matrices[0].template data(), + static_cast(n_dim), NULL, NULL, 1, NULL, 1, &qwork, -1, + static_cast*>(NULL), &info); int64_t lwork = static_cast(qwork); Tensor work, rwork; @@ -207,8 +207,8 @@ class EigvalsKernel : public framework::OpKernel { } if (framework::IsComplexType( framework::TransToProtoVarType(input->dtype()))) { - rwork.mutable_data>(framework::make_ddim({n_dim << 1}), - ctx.GetPlace()); + rwork.mutable_data>( + framework::make_ddim({n_dim << 1}), ctx.GetPlace()); } for (int64_t i = 0; i < n_batch; ++i) { diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc index f462c2ea0720b600f238109704e9606a2f7d627c..53037c1fa653648044e2dc0981ec5c63351e7c15 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc @@ -117,55 +117,6 @@ REGISTER_OPERATOR(elementwise_add_triple_grad, ops::ElementwiseOpTripleGrad, ops::ElementwiseTripleGradOpInplaceInferer, ops::ElementwiseTripleGradNoBufVarsInferer); -REGISTER_OP_CPU_KERNEL( - elementwise_add, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel>, - ops::ElementwiseAddKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_add_grad, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel>, - ops::ElementwiseAddGradKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_add_grad_grad, - ops::ElementwiseAddDoubleGradKernel, - ops::ElementwiseAddDoubleGradKernel, - ops::ElementwiseAddDoubleGradKernel, - ops::ElementwiseAddDoubleGradKernel, - ops::ElementwiseAddDoubleGradKernel>, - ops::ElementwiseAddDoubleGradKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_add_triple_grad, - ops::ElementwiseAddTripleGradKernel, - ops::ElementwiseAddTripleGradKernel, - ops::ElementwiseAddTripleGradKernel, - ops::ElementwiseAddTripleGradKernel, - ops::ElementwiseAddTripleGradKernel>, - ops::ElementwiseAddTripleGradKernel>); - // A specialization elementwise_add 
operator, used in gradient accumulation with // inplace addto. REGISTER_OPERATOR( diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index 2326aa561eaa05986c6e58bc1f2f2c93334cf893..b66cd01349d1ecb76307a6d6a24cf9b08d69cfb4 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -18,51 +18,6 @@ limitations under the License. */ namespace ops = paddle::operators; namespace plat = paddle::platform; -namespace paddle { -namespace operators {} // namespace operators -} // namespace paddle -REGISTER_OP_CUDA_KERNEL( - elementwise_add, ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel>, - ops::ElementwiseAddKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_add_grad, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel>, - ops::ElementwiseAddGradKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_add_grad_grad, - ops::ElementwiseAddDoubleGradKernel, - ops::ElementwiseAddDoubleGradKernel, - ops::ElementwiseAddDoubleGradKernel, - ops::ElementwiseAddDoubleGradKernel, - ops::ElementwiseAddDoubleGradKernel, - ops::ElementwiseAddDoubleGradKernel>, - ops::ElementwiseAddDoubleGradKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_add_triple_grad, - ops::ElementwiseAddTripleGradKernel, - ops::ElementwiseAddTripleGradKernel, - ops::ElementwiseAddTripleGradKernel, - ops::ElementwiseAddTripleGradKernel, - ops::ElementwiseAddTripleGradKernel, - ops::ElementwiseAddTripleGradKernel>, - ops::ElementwiseAddTripleGradKernel>); - REGISTER_OP_CUDA_KERNEL( grad_add, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index 73415d3fdb5c83cac1c0a8afb67548d7fa09b3c3..6f2a1fe87d70913f3699ead365e53923a7eaf83d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -43,73 +43,5 @@ class ElementwiseAddKernel : public framework::OpKernel { } }; -template -class ElementwiseAddGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - using Tensor = framework::Tensor; - auto *x = ctx.Input("X"); - auto *y = ctx.Input("Y"); - auto *dout = ctx.Input(framework::GradVarName("Out")); - auto *dx = ctx.Output(framework::GradVarName("X")); - auto *dy = ctx.Output(framework::GradVarName("Y")); - const auto &dev_ctx = ctx.template device_context(); - int axis = ctx.Attr("axis"); - pten::AddGradKernel( - static_cast::TYPE &>(dev_ctx), - *x, *y, *dout, axis, dx, dy); - } -}; - -template -class ElementwiseAddDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - using Tensor = framework::Tensor; - - auto *y = ctx.Input("Y"); - auto *dout = ctx.Input("DOut"); - auto *ddx = ctx.Input("DDX"); - auto *ddy = ctx.Input("DDY"); - - auto *ddout = ctx.Output("DDOut"); - const auto &dev_ctx = ctx.template device_context(); - int axis = ctx.Attr("axis"); - paddle::optional ddx_optional = paddle::none; - paddle::optional ddy_optional = paddle::none; - if (ddx != nullptr) { - ddx_optional = *ddx; - } 
- if (ddy != nullptr) { - ddy_optional = *ddy; - } - pten::AddDoubleGradKernel( - static_cast::TYPE &>(dev_ctx), - *y, ddx_optional, ddy_optional, *dout, axis, ddout); - } -}; - -template -class ElementwiseAddTripleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - using Tensor = framework::Tensor; - auto *ddx = ctx.Input("DDX"); - auto *ddy = ctx.Input("DDY"); - auto *d_ddout = ctx.Input("D_DDOut"); - auto *d_ddx = ctx.Output("D_DDX"); - auto *d_ddy = ctx.Output("D_DDY"); - - const auto &dev_ctx = ctx.template device_context(); - int axis = ctx.Attr("axis"); - pten::AddTripleGradKernel( - static_cast::TYPE &>(dev_ctx), - *ddx, *ddy, *d_ddout, axis, d_ddx, d_ddy); - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc index 63ec5bd4a2805e74b8a6552a53ac65fb55a0cdf5..4732762624a5f820698d228fb105529d845af049 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc +++ b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc @@ -31,7 +31,7 @@ limitations under the License. */ namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, NPU); USE_OP(elementwise_sub); USE_OP_DEVICE_KERNEL(elementwise_sub, NPU); diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc index 12d82654362ac125502a1b4b73c34226647ec99e..7efa1d24dcf1fe3c62d3177321e4c5e98e8f267d 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc @@ -18,7 +18,7 @@ #include "paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h" #include "paddle/fluid/platform/place.h" -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc index 706475bc82fadef0eaf864d69fe3ceccb087d6f2..e1340de2096e08bcfc8d3010a87d56be869c749e 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc @@ -22,7 +22,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/feed_forward_test.cu b/paddle/fluid/operators/feed_forward_test.cu index 551d8ee6592dfcf39e15b5d5c3b40453847fb64d..94a6ba3139b1d700cfb7f3ce2cd02424da3f63bb 100644 --- a/paddle/fluid/operators/feed_forward_test.cu +++ b/paddle/fluid/operators/feed_forward_test.cu @@ -27,7 +27,7 @@ namespace framework = paddle::framework; namespace platform = paddle::platform; USE_OP(matmul); -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); // get paddle matmul op results as baseline template diff --git a/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu b/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu old mode 100755 new mode 100644 index 0adbf0be4e28aa1d95b92a273f2a78851ca196ed..e34335e8597a75d594dc7271207f95c1599b2083 --- a/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu 
+++ b/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu @@ -20,12 +20,11 @@ limitations under the License. */ #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/fused/fused_dropout_act_bias.h" #include "paddle/fluid/operators/fused/fused_dropout_test.h" -#include "paddle/fluid/operators/math/functors.h" +#include "paddle/pten/kernels/funcs/functors.h" namespace framework = paddle::framework; namespace platform = paddle::platform; namespace details = paddle::operators::details; -namespace math = paddle::operators::math; /** * @brief the unittest of fused_dropout_act_bias @@ -283,12 +282,14 @@ static void BaseTest(const bool is_fp16 = false) { } TEST(FusedDropout, GPUFusedDorpoutActBias) { - BaseTest, math::ReluGradFunctor>(); + BaseTest, + pten::funcs::ReluGradFunctor>(); BaseTest, paddle::operators::GeluGradFunctor>(); } TEST(FusedDropout, GPUFusedDropoutActBiasDouble) { - BaseTest, math::ReluGradFunctor>(); + BaseTest, + pten::funcs::ReluGradFunctor>(); BaseTest, paddle::operators::GeluGradFunctor>(); } @@ -296,15 +297,16 @@ TEST(FusedDropout, GPUFusedDropoutActBiasDouble) { // test fp16, For inference, check_grad is not required. ref: test_dropout_op.py TEST(FusedDropout, GPUFusedDropoutActBiasFp16) { using fp16 = platform::float16; - BaseTest, math::ReluGradFunctor>(true); + BaseTest, + pten::funcs::ReluGradFunctor>(true); } TEST(FusedDropout, GPUFusedDropoutActBiasIsUpscaleInTrain) { const int rows = 16; const int cols = 16; for (auto is_upscale_in_train : {true, false}) { - TestFusedDropoutActBias, - math::ReluGradFunctor> + TestFusedDropoutActBias, + pten::funcs::ReluGradFunctor> test(rows, cols, 0, 1.0, is_upscale_in_train, false); test.Run(); test.CheckOut(static_cast(1e-5)); @@ -315,8 +317,8 @@ TEST(FusedDropout, GPUFusedDropoutActBiasIsUpscaleInTrain) { TEST(FusedDropout, GPUFusedDropoutActBiasIsTest) { const int rows = 16; const int cols = 16; - TestFusedDropoutActBias, - math::ReluGradFunctor> + TestFusedDropoutActBias, + pten::funcs::ReluGradFunctor> test(rows, cols, 0, 0.35, true, true); test.Run(); test.CheckOut(static_cast(1e-5)); @@ -326,8 +328,8 @@ TEST(FusedDropout, GPUFusedDropoutActBiasIsTest) { TEST(FusedDropout, GPUFusedDropoutActBiasSeed) { const int rows = 16; const int cols = 16; - TestFusedDropoutActBias, - math::ReluGradFunctor> + TestFusedDropoutActBias, + pten::funcs::ReluGradFunctor> test(rows, cols, 125, 0.0, false, false); test.Run(); test.CheckOut(static_cast(1e-5)); @@ -337,8 +339,8 @@ TEST(FusedDropout, GPUFusedDropoutActBiasSeed) { TEST(FusedDropout, GPUFusedDropoutActBiasLargeShape) { const int rows = 256; const int cols = 4096; - TestFusedDropoutActBias, - math::ReluGradFunctor> + TestFusedDropoutActBias, + pten::funcs::ReluGradFunctor> test(rows, cols); test.Run(); test.CheckOut(static_cast(1e-5)); diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h index eb651e4ea7b4fc7bd156f0915edec87175d44047..b21a5fb8219ba1b0bf4a8d3e6bef6ecda6e9a653 100644 --- a/paddle/fluid/operators/fused/fused_dropout_common.h +++ b/paddle/fluid/operators/fused/fused_dropout_common.h @@ -21,12 +21,12 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" -#include "paddle/fluid/operators/math/functors.h" #include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/pten/kernels/funcs/functors.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fused/fused_dropout_helper.h b/paddle/fluid/operators/fused/fused_dropout_helper.h index 782c5d70ee07728b2a97730ef07f3e563b19ee4d..286f37f4496371501afe7296ef3aa4e492809ae8 100644 --- a/paddle/fluid/operators/fused/fused_dropout_helper.h +++ b/paddle/fluid/operators/fused/fused_dropout_helper.h @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/fused_dropout_act_bias.h" #include "paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h" #include "paddle/fluid/operators/fused/fused_residual_dropout_bias.h" -#include "paddle/fluid/operators/math/functors.h" +#include "paddle/pten/kernels/funcs/functors.h" namespace paddle { namespace operators { @@ -167,8 +167,8 @@ class FusedDropoutHelper { dropout_param_.dropout_prob, dropout_param_.is_upscale_in_train, dropout_param_.is_test, src, bias, out, mask, ctx); } else if (act_method == "relu") { - math::ReluFunctor relu; - LaunchDropoutActBias>( + pten::funcs::ReluFunctor relu; + LaunchDropoutActBias>( relu, dropout_param_.seed, rows_, cols_, increment, dropout_param_.dropout_prob, dropout_param_.is_upscale_in_train, dropout_param_.is_test, src, bias, out, mask, ctx); @@ -187,8 +187,8 @@ class FusedDropoutHelper { gelu_grad, dout, mask, src, bias, dropout_param_.dropout_prob, dropout_param_.is_upscale_in_train, rows_, cols_, d_src, d_bias, ctx); } else if (act_method == "relu") { - math::ReluGradFunctor relu_grad; - LaunchDropoutActBiasGrad>( + pten::funcs::ReluGradFunctor relu_grad; + LaunchDropoutActBiasGrad>( relu_grad, dout, mask, src, bias, dropout_param_.dropout_prob, dropout_param_.is_upscale_in_train, rows_, cols_, d_src, d_bias, ctx); } else { diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.h b/paddle/fluid/operators/fused/fused_elemwise_activation_op.h index b7dd89a8a28adffc09b75a1845a79fb66c0b67c8..792069652cde8cc1d67bfe8146cb58bbb9297106 100644 --- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.h +++ b/paddle/fluid/operators/fused/fused_elemwise_activation_op.h @@ -19,8 +19,9 @@ limitations under the License. */ #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/operators/math/compound_functors.h" -#include "paddle/fluid/operators/math/functors.h" +#include "paddle/pten/kernels/funcs/compound_functors.h" +#include "paddle/pten/kernels/funcs/elementwise_functor.h" +#include "paddle/pten/kernels/funcs/functors.h" namespace paddle { namespace operators { @@ -53,22 +54,22 @@ static void RunBinaryCompoundFunctor( // intermediate_out = Unary(Y) // out = Binary(X, Unary(Y)) // In this case, the shape of intermediate_out and out are different. 
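The comment above states the composition this file relies on: the unary functor is applied to Y first, and its result feeds the binary functor, so intermediate_out and out may have different shapes. Below is a small standalone sketch of that composition for the elementwise_add,scale case; the toy functor types are illustrative only and are not the actual pten::funcs implementations.

#include <cassert>

// out = Binary(x, Unary(y)); here Binary is addition and Unary is scaling.
template <typename T>
struct AddFunctorSketch {
  T operator()(T x, T y) const { return x + y; }
};

template <typename T>
struct ScaleFunctorSketch {
  explicit ScaleFunctorSketch(T scale) : scale_(scale) {}
  T operator()(T y) const { return scale_ * y; }
  T scale_;
};

template <typename T, typename BinaryFunctor, typename UnaryFunctor>
struct BinaryCompoundSketch {
  BinaryCompoundSketch(BinaryFunctor binary, UnaryFunctor unary)
      : binary_(binary), unary_(unary) {}
  // intermediate_out = Unary(y), then out = Binary(x, intermediate_out).
  T operator()(T x, T y) const { return binary_(x, unary_(y)); }
  BinaryFunctor binary_;
  UnaryFunctor unary_;
};

int main() {
  BinaryCompoundSketch<float, AddFunctorSketch<float>, ScaleFunctorSketch<float>>
      add_scale(AddFunctorSketch<float>(), ScaleFunctorSketch<float>(0.5f));
  // out = x + 0.5 * y, so add_scale(2, 4) == 4.
  assert(add_scale(2.0f, 4.0f) == 4.0f);
  return 0;
}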
- paddle::operators::math::BinaryCompoundFunctor + pten::funcs::BinaryCompoundFunctor compound_func(binary_functor, unary_functor); int axis = ctx.Attr("axis"); if (ctx.Attr("save_intermediate_out")) { - FusedElemwiseAndActComputeEx, - true /*KeepIntermediateValue*/, - false /*SameShapeOfIntermediateOutAndOut*/>( + FusedElemwiseAndActComputeEx< + DeviceContext, T, + pten::funcs::BinaryCompoundFunctor, + true /*KeepIntermediateValue*/, + false /*SameShapeOfIntermediateOutAndOut*/>( ctx, in_x, in_y, axis, compound_func, (*outputs)[0], (*outputs)[1]); } else { - FusedElemwiseAndActComputeEx, - false /*KeepIntermediateValue*/, - false /*SameShapeOfIntermediateOutAndOut*/>( + FusedElemwiseAndActComputeEx< + DeviceContext, T, + pten::funcs::BinaryCompoundFunctor, + false /*KeepIntermediateValue*/, + false /*SameShapeOfIntermediateOutAndOut*/>( ctx, in_x, in_y, axis, compound_func, (*outputs)[0], (*outputs)[1]); } } @@ -85,22 +86,22 @@ static void RunUnaryCompoundFunctors( // In this case, the shape of intermediate_out and out are the same. int axis = ctx.Attr("axis"); - paddle::operators::math::UnaryCompoundFunctor + pten::funcs::UnaryCompoundFunctor compound_func(unary_functor, binary_functor); if (ctx.Attr("save_intermediate_out")) { - FusedElemwiseAndActComputeEx, - true /*KeepIntermediateValue*/, - true /*SameShapeOfIntermediateOutAndOut*/>( + FusedElemwiseAndActComputeEx< + DeviceContext, T, + pten::funcs::UnaryCompoundFunctor, + true /*KeepIntermediateValue*/, + true /*SameShapeOfIntermediateOutAndOut*/>( ctx, in_x, in_y, axis, compound_func, (*outputs)[0], (*outputs)[1]); } else { - FusedElemwiseAndActComputeEx, - false /*KeepIntermediateValue*/, - true /*SameShapeOfIntermediateOutAndOut*/>( + FusedElemwiseAndActComputeEx< + DeviceContext, T, + pten::funcs::UnaryCompoundFunctor, + false /*KeepIntermediateValue*/, + true /*SameShapeOfIntermediateOutAndOut*/>( ctx, in_x, in_y, axis, compound_func, (*outputs)[0], (*outputs)[1]); } } @@ -120,13 +121,12 @@ static void RunBinaryCompoundGradFunctors( int axis = ctx.Attr("axis"); using BinaryCompoundDxFunctor = - paddle::operators::math::BinaryCompoundGradDxFunctor; - using BinaryCompoundDyFunctor = - paddle::operators::math::BinaryCompoundGradDyFunctor< - T, BinaryGradFunctor, UnaryFunctor, UnaryGradFunctor, InPlace>; + pten::funcs::BinaryCompoundGradDxFunctor; + using BinaryCompoundDyFunctor = pten::funcs::BinaryCompoundGradDyFunctor< + T, BinaryGradFunctor, UnaryFunctor, UnaryGradFunctor, InPlace>; using BinaryCompoundDIntermedaiteOutFunctor = - paddle::operators::math::BinaryCompoundGradDIntermedaiteOutFunctor< + pten::funcs::BinaryCompoundGradDIntermedaiteOutFunctor< T, BinaryGradFunctor, UnaryFunctor>; if (in_intermediate_out) { @@ -170,14 +170,12 @@ static void RunUnaryCompoundGradFunctors( // Z = Unary(Binary(X, Y)) int axis = ctx.Attr("axis"); - using UnaryCompoundDxFunctor = - paddle::operators::math::UnaryCompoundGradDxFunctor< - T, UnaryGradFunctor, BinaryFunctor, BinaryGradFunctor, InPlace>; - using UnaryCompoundDyFunctor = - paddle::operators::math::UnaryCompoundGradDyFunctor< - T, UnaryGradFunctor, BinaryFunctor, BinaryGradFunctor, InPlace>; + using UnaryCompoundDxFunctor = pten::funcs::UnaryCompoundGradDxFunctor< + T, UnaryGradFunctor, BinaryFunctor, BinaryGradFunctor, InPlace>; + using UnaryCompoundDyFunctor = pten::funcs::UnaryCompoundGradDyFunctor< + T, UnaryGradFunctor, BinaryFunctor, BinaryGradFunctor, InPlace>; using UnaryCompoundDIntermediateFunctor = - paddle::operators::math::UnaryCompoundGradDIntermediateFunctor< + 
pten::funcs::UnaryCompoundGradDIntermediateFunctor< T, UnaryGradFunctor, BinaryFunctor, InPlace>; if (in_intermediate_out) { @@ -219,69 +217,60 @@ static void RunFunctors(const framework::ExecutionContext &ctx, if (funcs_str == "elementwise_add,scale") { // Z = Binary(X, Unary(Y)) T scale = static_cast(ctx.Attr("scale")); - RunBinaryCompoundFunctor, - paddle::operators::math::ScaleFunctor>( - ctx, paddle::operators::math::AddFunctor(), - paddle::operators::math::ScaleFunctor(scale), in_x, in_y, outputs); + RunBinaryCompoundFunctor, + pten::funcs::ScaleFunctor>( + ctx, pten::funcs::AddFunctor(), pten::funcs::ScaleFunctor(scale), + in_x, in_y, outputs); } else if (funcs_str == "scale,elementwise_add") { // Z = Unary(Binary(X, Y)) T scale = static_cast(ctx.Attr("scale")); - RunUnaryCompoundFunctors, - paddle::operators::math::AddFunctor>( - ctx, paddle::operators::math::ScaleFunctor(scale), - paddle::operators::math::AddFunctor(), in_x, in_y, outputs); + RunUnaryCompoundFunctors, + pten::funcs::AddFunctor>( + ctx, pten::funcs::ScaleFunctor(scale), pten::funcs::AddFunctor(), + in_x, in_y, outputs); } else if (funcs_str == "elementwise_add,relu") { // Z = Binary(X, Unary(Y)) - RunBinaryCompoundFunctor, - paddle::operators::math::ReluFunctor>( - ctx, paddle::operators::math::AddFunctor(), - paddle::operators::math::ReluFunctor(), in_x, in_y, outputs); + RunBinaryCompoundFunctor, + pten::funcs::ReluFunctor>( + ctx, pten::funcs::AddFunctor(), pten::funcs::ReluFunctor(), in_x, + in_y, outputs); } else if (funcs_str == "relu,elementwise_add") { // Z = Unary(Binary(X, Y)) - RunUnaryCompoundFunctors, - paddle::operators::math::AddFunctor>( - ctx, paddle::operators::math::ReluFunctor(), - paddle::operators::math::AddFunctor(), in_x, in_y, outputs); + RunUnaryCompoundFunctors, + pten::funcs::AddFunctor>( + ctx, pten::funcs::ReluFunctor(), pten::funcs::AddFunctor(), in_x, + in_y, outputs); } else if (funcs_str == "elementwise_mul,scale") { // Z = Binary(X, Unary(Y)) T scale = static_cast(ctx.Attr("scale")); - RunBinaryCompoundFunctor, - paddle::operators::math::ScaleFunctor>( - ctx, paddle::operators::math::MulFunctor(), - paddle::operators::math::ScaleFunctor(scale), in_x, in_y, outputs); + RunBinaryCompoundFunctor, + pten::funcs::ScaleFunctor>( + ctx, pten::funcs::MultiplyFunctor(), + pten::funcs::ScaleFunctor(scale), in_x, in_y, outputs); } else if (funcs_str == "tanh,elementwise_add") { // Z = Unary(Binary(X, Y)) - RunUnaryCompoundFunctors, - paddle::operators::math::AddFunctor>( - ctx, paddle::operators::math::TanhFunctor(), - paddle::operators::math::AddFunctor(), in_x, in_y, outputs); + RunUnaryCompoundFunctors, + pten::funcs::AddFunctor>( + ctx, pten::funcs::TanhFunctor(), pten::funcs::AddFunctor(), in_x, + in_y, outputs); } else if (funcs_str == "elementwise_mul,tanh") { // Z = Binary(X, Unary(Y)) - RunBinaryCompoundFunctor, - paddle::operators::math::TanhFunctor>( - ctx, paddle::operators::math::MulFunctor(), - paddle::operators::math::TanhFunctor(), in_x, in_y, outputs); + RunBinaryCompoundFunctor, + pten::funcs::TanhFunctor>( + ctx, pten::funcs::MultiplyFunctor(), pten::funcs::TanhFunctor(), + in_x, in_y, outputs); } else if (funcs_str == "elementwise_mul,sigmoid") { // Z = Binary(X, Unary(Y)) - RunBinaryCompoundFunctor, - paddle::operators::math::SigmoidFunctor>( - ctx, paddle::operators::math::MulFunctor(), - paddle::operators::math::SigmoidFunctor(), in_x, in_y, outputs); + RunBinaryCompoundFunctor, + pten::funcs::SigmoidFunctor>( + ctx, pten::funcs::MultiplyFunctor(), + 
pten::funcs::SigmoidFunctor(), in_x, in_y, outputs); } else if (funcs_str == "gelu,elementwise_add") { // Z = Unary(Binary(X, Y)) - RunUnaryCompoundFunctors, - paddle::operators::math::AddFunctor>( - ctx, paddle::operators::math::GeluFunctor(), - paddle::operators::math::AddFunctor(), in_x, in_y, outputs); + RunUnaryCompoundFunctors, + pten::funcs::AddFunctor>( + ctx, pten::funcs::GeluFunctor(), pten::funcs::AddFunctor(), in_x, + in_y, outputs); } else { PADDLE_THROW(platform::errors::InvalidArgument( "%s has not been implemented.", funcs_str)); @@ -301,95 +290,83 @@ static void RunGradFunctors( if (funcs_str == "elementwise_add_grad,scale_grad") { // The backward of Z = Binary(X, Unary(Y)) T scale = static_cast(ctx.Attr("scale")); - RunBinaryCompoundGradFunctors< - DeviceContext, T, paddle::operators::math::AddGradFunctor, - paddle::operators::math::ScaleFunctor, - paddle::operators::math::ScaleGradFunctor, InPlace>( - ctx, paddle::operators::math::AddGradFunctor(), - paddle::operators::math::ScaleFunctor(scale), - paddle::operators::math::ScaleGradFunctor(scale), in_x, in_y, in_out, + RunBinaryCompoundGradFunctors, + pten::funcs::ScaleFunctor, + pten::funcs::ScaleGradFunctor, InPlace>( + ctx, pten::funcs::AddGradFunctor(), + pten::funcs::ScaleFunctor(scale), + pten::funcs::ScaleGradFunctor(scale), in_x, in_y, in_out, in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out); } else if (funcs_str == "scale_grad,elementwise_add_grad") { // The backward of Z = Unary(Binary(X, Y)) T scale = static_cast(ctx.Attr("scale")); RunUnaryCompoundGradFunctors< - DeviceContext, T, paddle::operators::math::ScaleGradFunctor, - paddle::operators::math::AddFunctor, - paddle::operators::math::AddGradFunctor, InPlace>( - ctx, paddle::operators::math::ScaleGradFunctor(scale), - paddle::operators::math::AddFunctor(), - paddle::operators::math::AddGradFunctor(), in_x, in_y, in_out, - in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out); + DeviceContext, T, pten::funcs::ScaleGradFunctor, + pten::funcs::AddFunctor, pten::funcs::AddGradFunctor, InPlace>( + ctx, pten::funcs::ScaleGradFunctor(scale), + pten::funcs::AddFunctor(), pten::funcs::AddGradFunctor(), in_x, + in_y, in_out, in_intermediate_out, in_out_grad, x_grad, y_grad, + d_intermediate_out); } else if (funcs_str == "elementwise_add_grad,relu_grad") { // The backward of Z = Binary(X, Unary(Y)) RunBinaryCompoundGradFunctors< - DeviceContext, T, paddle::operators::math::AddGradFunctor, - paddle::operators::math::ReluFunctor, - paddle::operators::math::ReluGradFunctor, InPlace>( - ctx, paddle::operators::math::AddGradFunctor(), - paddle::operators::math::ReluFunctor(), - paddle::operators::math::ReluGradFunctor(), in_x, in_y, in_out, + DeviceContext, T, pten::funcs::AddGradFunctor, + pten::funcs::ReluFunctor, pten::funcs::ReluGradFunctor, InPlace>( + ctx, pten::funcs::AddGradFunctor(), pten::funcs::ReluFunctor(), + pten::funcs::ReluGradFunctor(), in_x, in_y, in_out, in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out); } else if (funcs_str == "relu_grad,elementwise_add_grad") { // The backward of Z = Unary(Binary(X, Y)) RunUnaryCompoundGradFunctors< - DeviceContext, T, paddle::operators::math::ReluGradFunctor, - paddle::operators::math::AddFunctor, - paddle::operators::math::AddGradFunctor, InPlace>( - ctx, paddle::operators::math::ReluGradFunctor(), - paddle::operators::math::AddFunctor(), - paddle::operators::math::AddGradFunctor(), in_x, in_y, in_out, + DeviceContext, T, pten::funcs::ReluGradFunctor, + 
pten::funcs::AddFunctor, pten::funcs::AddGradFunctor, InPlace>( + ctx, pten::funcs::ReluGradFunctor(), pten::funcs::AddFunctor(), + pten::funcs::AddGradFunctor(), in_x, in_y, in_out, in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out); } else if (funcs_str == "elementwise_mul_grad,scale_grad") { // The backward of Z = Binary(X, Unary(Y)) T scale = static_cast(ctx.Attr("scale")); - RunBinaryCompoundGradFunctors< - DeviceContext, T, paddle::operators::math::MulGradFunctor, - paddle::operators::math::ScaleFunctor, - paddle::operators::math::ScaleGradFunctor, InPlace>( - ctx, paddle::operators::math::MulGradFunctor(), - paddle::operators::math::ScaleFunctor(scale), - paddle::operators::math::ScaleGradFunctor(scale), in_x, in_y, in_out, + RunBinaryCompoundGradFunctors, + pten::funcs::ScaleFunctor, + pten::funcs::ScaleGradFunctor, InPlace>( + ctx, pten::funcs::MulGradFunctor(), + pten::funcs::ScaleFunctor(scale), + pten::funcs::ScaleGradFunctor(scale), in_x, in_y, in_out, in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out); } else if (funcs_str == "tanh_grad,elementwise_add_grad") { // The backward of Z = Unary(Binary(X, Y)) RunUnaryCompoundGradFunctors< - DeviceContext, T, paddle::operators::math::TanhGradFunctor, - paddle::operators::math::AddFunctor, - paddle::operators::math::AddGradFunctor, InPlace>( - ctx, paddle::operators::math::TanhGradFunctor(), - paddle::operators::math::AddFunctor(), - paddle::operators::math::AddGradFunctor(), in_x, in_y, in_out, + DeviceContext, T, pten::funcs::TanhGradFunctor, + pten::funcs::AddFunctor, pten::funcs::AddGradFunctor, InPlace>( + ctx, pten::funcs::TanhGradFunctor(), pten::funcs::AddFunctor(), + pten::funcs::AddGradFunctor(), in_x, in_y, in_out, in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out); } else if (funcs_str == "elementwise_mul_grad,tanh_grad") { // The backward of Z = Binary(X, Unary(Y)) RunBinaryCompoundGradFunctors< - DeviceContext, T, paddle::operators::math::MulGradFunctor, - paddle::operators::math::TanhFunctor, - paddle::operators::math::TanhGradFunctor, InPlace>( - ctx, paddle::operators::math::MulGradFunctor(), - paddle::operators::math::TanhFunctor(), - paddle::operators::math::TanhGradFunctor(), in_x, in_y, in_out, + DeviceContext, T, pten::funcs::MulGradFunctor, + pten::funcs::TanhFunctor, pten::funcs::TanhGradFunctor, InPlace>( + ctx, pten::funcs::MulGradFunctor(), pten::funcs::TanhFunctor(), + pten::funcs::TanhGradFunctor(), in_x, in_y, in_out, in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out); } else if (funcs_str == "elementwise_mul_grad,sigmoid_grad") { // The backward of Z = Binary(X, Unary(Y)) - RunBinaryCompoundGradFunctors< - DeviceContext, T, paddle::operators::math::MulGradFunctor, - paddle::operators::math::SigmoidFunctor, - paddle::operators::math::SigmoidGradFunctor, InPlace>( - ctx, paddle::operators::math::MulGradFunctor(), - paddle::operators::math::SigmoidFunctor(), - paddle::operators::math::SigmoidGradFunctor(), in_x, in_y, in_out, + RunBinaryCompoundGradFunctors, + pten::funcs::SigmoidFunctor, + pten::funcs::SigmoidGradFunctor, InPlace>( + ctx, pten::funcs::MulGradFunctor(), pten::funcs::SigmoidFunctor(), + pten::funcs::SigmoidGradFunctor(), in_x, in_y, in_out, in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out); } else if (funcs_str == "gelu_grad,elementwise_add_grad") { // The backward of Z = Unary(Binary(X, Y)) RunUnaryCompoundGradFunctors< - DeviceContext, T, paddle::operators::math::GeluGradFunctor, - 
paddle::operators::math::AddFunctor, - paddle::operators::math::AddGradFunctor, InPlace>( - ctx, paddle::operators::math::GeluGradFunctor(), - paddle::operators::math::AddFunctor(), - paddle::operators::math::AddGradFunctor(), in_x, in_y, in_out, + DeviceContext, T, pten::funcs::GeluGradFunctor, + pten::funcs::AddFunctor, pten::funcs::AddGradFunctor, InPlace>( + ctx, pten::funcs::GeluGradFunctor(), pten::funcs::AddFunctor(), + pten::funcs::AddGradFunctor(), in_x, in_y, in_out, in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out); } else { PADDLE_THROW(platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h index 911c2cda57504793059da160831411180bf6524e..ef61b78d6828170e0a6c0ce98fea4d7f467323f9 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -122,12 +122,12 @@ __global__ void FusedLayernormResidualDropoutBias( __shared__ U shared_mean[32]; __shared__ U shared_var[32]; - math::ReluFunctor relu; + pten::funcs::ReluFunctor relu; U mean_val = 0; U var_val = 0; for (int i = col_id * VecSize; i < cols; i += blockDim.x * VecSize) { FusedResidualDropoutBiasOneThread>( + pten::funcs::ReluFunctor>( row_id, i, cols, &state, dropout_prob, factor, src, residual, bias, dst, mask, is_test, &mean_val, &var_val, relu); } diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h index 2f5ec839fc2c73984cdec00f246c24d777321044..264e2e5f22d671318d0e73bed419717c1a024ced 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h @@ -115,12 +115,12 @@ __global__ void FusedResidualDropoutBias( curandStatePhilox4_32_10_t state; curand_init(seed, idx, increment, &state); const T factor = GetFactor(dropout_prob, is_upscale_in_train, is_test); - math::ReluFunctor relu; + pten::funcs::ReluFunctor relu; for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) { for (int i = col_id * VecSize; i < cols; i += blockDim.x * gridDim.x * VecSize) { FusedResidualDropoutBiasOneThread>( + pten::funcs::ReluFunctor>( r, i, cols, &state, dropout_prob, factor, src, residual, bias, dst, mask, is_test, nullptr, nullptr, relu); } diff --git a/paddle/fluid/operators/histogram_op.cc b/paddle/fluid/operators/histogram_op.cc index 32cc38ef1953364266181598f44ccd54e9dc631c..2df6b539ff68aa4934dc2562792a55a58b670417 100644 --- a/paddle/fluid/operators/histogram_op.cc +++ b/paddle/fluid/operators/histogram_op.cc @@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/histogram_op.h" - #include #include #include +#include "paddle/fluid/framework/op_registry.h" + namespace paddle { namespace operators { @@ -85,8 +85,3 @@ REGISTER_OPERATOR( histogram, ops::HistogramOp, ops::HistogramOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - histogram, ops::HistogramKernel, - ops::HistogramKernel, - ops::HistogramKernel, - ops::HistogramKernel); diff --git a/paddle/fluid/operators/histogram_op.cu b/paddle/fluid/operators/histogram_op.cu deleted file mode 100644 index 48a637e6c37b1cf37e5653397ded01775eb54551..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/histogram_op.cu +++ /dev/null @@ -1,156 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/histogram_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/pten/core/hostdevice.h" - -namespace paddle { -namespace operators { - -using IndexType = int64_t; -using Tensor = framework::Tensor; -using platform::PADDLE_CUDA_NUM_THREADS; - -inline int GET_BLOCKS(const int N) { - return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; -} - -template -__device__ static IndexType GetBin(T input_value, T min_value, T max_value, - int64_t nbins) { - IndexType bin = static_cast((input_value - min_value) * nbins / - (max_value - min_value)); - IndexType output_index = bin < nbins - 1 ? 
bin : nbins - 1; - return output_index; -} - -template -__global__ void KernelHistogram(const T* input, const int total_elements, - const int64_t nbins, const T min_value, - const T max_value, int64_t* output) { - extern __shared__ int64_t buf_hist[]; - for (int i = threadIdx.x; i < nbins; i += blockDim.x) { - buf_hist[i] = 0; - } - __syncthreads(); - - CUDA_KERNEL_LOOP(input_index, total_elements) { - // const IndexType input_index = threadIdx.x + blockIdx.x * blockDim.x; - const auto input_value = input[input_index]; - if (input_value >= min_value && input_value <= max_value) { - const IndexType output_index = - GetBin(input_value, min_value, max_value, nbins); - paddle::platform::CudaAtomicAdd(&buf_hist[output_index], 1); - } - } - __syncthreads(); - - for (int i = threadIdx.x; i < nbins; i += blockDim.x) { - paddle::platform::CudaAtomicAdd(&output[i], buf_hist[i]); - } -} - -template -class HistogramCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(context.GetPlace()), true, - platform::errors::InvalidArgument("It must use CUDAPlace.")); - - const Tensor* input = context.Input("X"); - Tensor* output = context.Output("Out"); - auto& nbins = context.Attr("bins"); - auto& minval = context.Attr("min"); - auto& maxval = context.Attr("max"); - - const T* input_data = input->data(); - const int input_numel = input->numel(); - - int64_t* out_data = output->mutable_data(context.GetPlace()); - pten::funcs::SetConstant()( - context.template device_context(), output, - static_cast(0)); - - if (input_data == nullptr) return; - - T output_min = static_cast(minval); - T output_max = static_cast(maxval); - - if (output_min == output_max) { - auto input_x = framework::EigenVector::Flatten(*input); - - framework::Tensor input_min_t, input_max_t; - auto* input_min_data = - input_min_t.mutable_data({1}, context.GetPlace()); - auto* input_max_data = - input_max_t.mutable_data({1}, context.GetPlace()); - auto input_min_scala = framework::EigenScalar::From(input_min_t); - auto input_max_scala = framework::EigenScalar::From(input_max_t); - - auto* place = - context.template device_context().eigen_device(); - input_min_scala.device(*place) = input_x.minimum(); - input_max_scala.device(*place) = input_x.maximum(); - - Tensor input_min_cpu, input_max_cpu; - paddle::framework::TensorCopySync(input_min_t, platform::CPUPlace(), - &input_min_cpu); - paddle::framework::TensorCopySync(input_max_t, platform::CPUPlace(), - &input_max_cpu); - - output_min = input_min_cpu.data()[0]; - output_max = input_max_cpu.data()[0]; - } - if (output_min == output_max) { - output_min = output_min - 1; - output_max = output_max + 1; - } - - PADDLE_ENFORCE_EQ( - (std::isinf(static_cast(output_min)) || - std::isnan(static_cast(output_max)) || - std::isinf(static_cast(output_min)) || - std::isnan(static_cast(output_max))), - false, platform::errors::OutOfRange("range of min, max is not finite")); - PADDLE_ENFORCE_GE( - output_max, output_min, - platform::errors::InvalidArgument( - "max must be larger or equal to min. If min and max are both zero, " - "the minimum and maximum values of the data are used. 
" - "But received max is %d, min is %d", - maxval, minval)); - - auto stream = - context.template device_context().stream(); - KernelHistogram< - T, IndexType><<>>( - input_data, input_numel, nbins, output_min, output_max, out_data); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - histogram, - ops::HistogramCUDAKernel, - ops::HistogramCUDAKernel, - ops::HistogramCUDAKernel, - ops::HistogramCUDAKernel); diff --git a/paddle/fluid/operators/histogram_op.h b/paddle/fluid/operators/histogram_op.h deleted file mode 100644 index 9e280336e492af97d0107062f2d2a5ef22191133..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/histogram_op.h +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/pten/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class HistogramKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("X"); - Tensor* output = context.Output("Out"); - auto& nbins = context.Attr("bins"); - auto& minval = context.Attr("min"); - auto& maxval = context.Attr("max"); - - const T* input_data = input->data(); - auto input_numel = input->numel(); - - int64_t* out_data = output->mutable_data(context.GetPlace()); - pten::funcs::SetConstant()( - context.template device_context(), output, - static_cast(0)); - - if (input_data == nullptr) return; - - T output_min = static_cast(minval); - T output_max = static_cast(maxval); - if (output_min == output_max) { - output_min = *std::min_element(input_data, input_data + input_numel); - output_max = *std::max_element(input_data, input_data + input_numel); - } - if (output_min == output_max) { - output_min = output_min - 1; - output_max = output_max + 1; - } - - PADDLE_ENFORCE_EQ( - (std::isinf(static_cast(output_min)) || - std::isnan(static_cast(output_max)) || - std::isinf(static_cast(output_min)) || - std::isnan(static_cast(output_max))), - false, platform::errors::OutOfRange("range of min, max is not finite")); - PADDLE_ENFORCE_GE( - output_max, output_min, - platform::errors::InvalidArgument( - "max must be larger or equal to min. If min and max are both zero, " - "the minimum and maximum values of the data are used. 
" - "But received max is %d, min is %d", - maxval, minval)); - - for (int64_t i = 0; i < input_numel; i++) { - if (input_data[i] >= output_min && input_data[i] <= output_max) { - const int64_t bin = (int64_t)((input_data[i] - output_min) * nbins / - (output_max - output_min)); - out_data[std::min(bin, nbins - 1)] += 1; - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/imag_op.h b/paddle/fluid/operators/imag_op.h index 562a8dffa90623ed44c51ff1048c25550f5a7ce7..02682cfc954be57dd7900326dd98dae507fadeaa 100644 --- a/paddle/fluid/operators/imag_op.h +++ b/paddle/fluid/operators/imag_op.h @@ -16,8 +16,8 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" namespace paddle { namespace operators { @@ -31,12 +31,13 @@ class ImagKernel : public framework::OpKernel { auto numel = x->numel(); auto* x_data = x->data(); - auto* out_data = out->mutable_data>( - ctx.GetPlace(), static_cast(numel * sizeof(math::Real))); + auto* out_data = out->mutable_data>( + ctx.GetPlace(), + static_cast(numel * sizeof(pten::funcs::Real))); auto& dev_ctx = ctx.template device_context(); platform::ForRange for_range(dev_ctx, numel); - math::ImagFunctor functor(x_data, out_data, numel); + pten::funcs::ImagFunctor functor(x_data, out_data, numel); for_range(functor); } }; @@ -51,13 +52,13 @@ class ImagGradKernel : public framework::OpKernel { ctx.Output(framework::GradVarName("X")); auto numel = d_out->numel(); - auto* dout_data = d_out->data>(); + auto* dout_data = d_out->data>(); auto* dx_data = d_x->mutable_data( ctx.GetPlace(), static_cast(numel * sizeof(T))); auto& dev_ctx = ctx.template device_context(); platform::ForRange for_range(dev_ctx, numel); - math::ImagToComplexFunctor functor(dout_data, dx_data, numel); + pten::funcs::ImagToComplexFunctor functor(dout_data, dx_data, numel); for_range(functor); } }; diff --git a/paddle/fluid/operators/lerp_op.cc b/paddle/fluid/operators/lerp_op.cc index b94182e9db73a5590ffa404508d2edda84983198..b5e2b0d776984327fa682efa2da9d961185c6433 100644 --- a/paddle/fluid/operators/lerp_op.cc +++ b/paddle/fluid/operators/lerp_op.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/lerp_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -132,15 +132,3 @@ REGISTER_OPERATOR( paddle::operators::LerpInplaceInferer); REGISTER_OPERATOR(lerp_grad, paddle::operators::LerpGradOp); - -REGISTER_OP_CPU_KERNEL( - lerp, - paddle::operators::LerpKernel, - paddle::operators::LerpKernel); - -REGISTER_OP_CPU_KERNEL( - lerp_grad, - paddle::operators::LerpGradKernel, - paddle::operators::LerpGradKernel); diff --git a/paddle/fluid/operators/lerp_op.h b/paddle/fluid/operators/lerp_op.h deleted file mode 100644 index 380a8ccffd8af97b1072d0fa2083e7a60980030d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/lerp_op.h +++ /dev/null @@ -1,217 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -#ifdef _WIN32 -#ifndef NOMINMAX -#define NOMINMAX // msvc max/min macro conflict with std::min/max -#endif -#endif - -namespace paddle { -namespace operators { - -static framework::DDim ExtendDims2Rank(const framework::DDim& in_dims, - int rank) { - if (in_dims.size() == rank) { - return in_dims; - } - std::vector shapes(rank, 1); - for (int i = in_dims.size() - 1, j = rank - 1; i >= 0; --i, --j) { - shapes[j] = in_dims[i]; - } - return framework::make_ddim(shapes); -} - -template -static void GetBroadcastDims(const framework::DDim& in_dims, - const framework::DDim& out_dims, - Eigen::DSizes* bcast_dims) { - for (size_t i = 0; i < D; ++i) { - if (in_dims[i] == out_dims[i]) { - (*bcast_dims)[i] = 1; - } else { - (*bcast_dims)[i] = std::max(in_dims[i], out_dims[i]); - } - } -} - -template -static void LerpFunction(const framework::ExecutionContext& ctx) { - auto x = ctx.Input("X"); - auto y = ctx.Input("Y"); - auto w = ctx.Input("Weight"); - auto out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - auto out_dims = out->dims(); - auto x_dims = ExtendDims2Rank(x->dims(), D); - auto y_dims = ExtendDims2Rank(y->dims(), D); - auto w_dims = ExtendDims2Rank(w->dims(), D); - Eigen::DSizes x_bcast_dims; - Eigen::DSizes y_bcast_dims; - Eigen::DSizes w_bcast_dims; - GetBroadcastDims(x_dims, out_dims, &x_bcast_dims); - GetBroadcastDims(y_dims, out_dims, &y_bcast_dims); - GetBroadcastDims(w_dims, out_dims, &w_bcast_dims); - - auto eigen_x = framework::EigenTensor::From(*x, x_dims); - auto eigen_y = framework::EigenTensor::From(*y, y_dims); - auto eigen_w = framework::EigenTensor::From(*w, w_dims); - auto eigen_out = framework::EigenTensor::From(*out); - - auto& place = *ctx.template device_context().eigen_device(); - eigen_out.device(place) = - eigen_x.broadcast(x_bcast_dims) + - eigen_w.broadcast(w_bcast_dims) * - (eigen_y.broadcast(y_bcast_dims) - eigen_x.broadcast(x_bcast_dims)); -} - -template -static void LerpGradFunction(const framework::ExecutionContext& ctx) { - auto w = ctx.Input("Weight"); - auto dout = ctx.Input(framework::GradVarName("Out")); - auto dx = ctx.Output(framework::GradVarName("X")); - auto dy = ctx.Output(framework::GradVarName("Y")); - - auto dout_dims = dout->dims(); - auto dx_dims = ExtendDims2Rank(dx->dims(), D); - auto dy_dims = ExtendDims2Rank(dy->dims(), D); - auto w_dims = ExtendDims2Rank(w->dims(), D); - Eigen::DSizes dx_bcast_dims; - Eigen::DSizes dy_bcast_dims; - Eigen::DSizes w_bcast_dims; - GetBroadcastDims(dx_dims, dout_dims, &dx_bcast_dims); - GetBroadcastDims(dy_dims, dout_dims, &dy_bcast_dims); - GetBroadcastDims(w_dims, dout_dims, &w_bcast_dims); - - auto eigen_w = framework::EigenTensor::From(*w, w_dims); - auto eigen_dout = framework::EigenTensor::From(*dout); - - Eigen::DSizes dx_reshape_dims; - Eigen::DSizes dy_reshape_dims; - Eigen::DSizes reduce_dims; - for (int i = 0; i < dout_dims.size(); ++i) { - dx_reshape_dims[2 * i] = dx_bcast_dims[i]; - dx_reshape_dims[2 * i + 1] = dx_dims[i]; - dy_reshape_dims[2 * i] = 
dy_bcast_dims[i]; - dy_reshape_dims[2 * i + 1] = dy_dims[i]; - reduce_dims[i] = 2 * i; - } - - auto& place = *ctx.template device_context().eigen_device(); - - if (dx) { - dx->mutable_data(ctx.GetPlace()); - auto eigen_dx = framework::EigenTensor::From(*dx, dx_dims); - auto eigen_expr = (1 - eigen_w.broadcast(w_bcast_dims)) * eigen_dout; - eigen_dx.device(place) = eigen_expr.reshape(dx_reshape_dims) - .sum(reduce_dims) - .reshape(eigen_dx.dimensions()); - } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - auto eigen_dy = framework::EigenTensor::From(*dy, dy_dims); - auto eigen_expr = eigen_w.broadcast(w_bcast_dims) * eigen_dout; - eigen_dy.device(place) = eigen_expr.reshape(dy_reshape_dims) - .sum(reduce_dims) - .reshape(eigen_dy.dimensions()); - } -} - -template -class LerpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - int rank = ctx.Output("Out")->dims().size(); - PADDLE_ENFORCE_GE( - rank, 1, - platform::errors::InvalidArgument( - "The number of dimensions for LerpOp must be " - "greater than or equal to 1, but the value received is %d.", - rank)); - PADDLE_ENFORCE_LE( - rank, 6, platform::errors::InvalidArgument( - "The number of dimensions for LerpOp must be " - "less than or equal to 6, but the value received is %d.", - rank)); - switch (rank) { - case 1: - LerpFunction(ctx); - break; - case 2: - LerpFunction(ctx); - break; - case 3: - LerpFunction(ctx); - break; - case 4: - LerpFunction(ctx); - break; - case 5: - LerpFunction(ctx); - break; - case 6: - LerpFunction(ctx); - break; - } - } -}; - -template -class LerpGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - int rank = ctx.Input(framework::GradVarName("Out")) - ->dims() - .size(); - PADDLE_ENFORCE_GE( - rank, 1, - platform::errors::InvalidArgument( - "The number of dimensions for LerpGradOp must be " - "greater than or equal to 1, but the value received is %d.", - rank)); - PADDLE_ENFORCE_LE( - rank, 6, platform::errors::InvalidArgument( - "The number of dimensions for LerpGradOp must be " - "less than or equal to 6, but the value received is %d.", - rank)); - switch (rank) { - case 1: - LerpGradFunction(ctx); - break; - case 2: - LerpGradFunction(ctx); - break; - case 3: - LerpGradFunction(ctx); - break; - case 4: - LerpGradFunction(ctx); - break; - case 5: - LerpGradFunction(ctx); - break; - case 6: - LerpGradFunction(ctx); - break; - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/log_softmax_op.cu b/paddle/fluid/operators/log_softmax_op.cu index 6676cde1cafcabfcaee325bafe3be3703fe1a0a2..c677b4978eb3e3c03a3ae42a434ff0df3d55fe83 100644 --- a/paddle/fluid/operators/log_softmax_op.cu +++ b/paddle/fluid/operators/log_softmax_op.cu @@ -15,8 +15,9 @@ #include #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/log_softmax_op.h" -#include "paddle/fluid/operators/math/functors.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/pten/kernels/funcs/elementwise_functor.h" +#include "paddle/pten/kernels/funcs/functors.h" namespace paddle { namespace operators { @@ -213,15 +214,15 @@ __global__ void LogSoftmaxForwardCUDAKernelNotLastAxis( for (int d = threadIdx.x; d < dim_size; d += blockDim.x) { const AccT value = static_cast(input[data_offset + d * dim_stride]); - max_value = math::MaxFunctor()(max_value, value); + max_value = pten::funcs::MaxFunctor()(max_value, 
value); } // If there are more than 1 threads along block x, reduce all max_values // and get the global max_value, which is the max value along "axis". // If there is only one thread along block x, no need to reduce, as the // 'max_value' is the global max_value. if (blockDim.x > 1) { - max_value = - BlockReduceAlongDimX(sdata, max_value); + max_value = BlockReduceAlongDimX( + sdata, max_value); } // 2. reduce sum @@ -232,7 +233,7 @@ __global__ void LogSoftmaxForwardCUDAKernelNotLastAxis( max_value); } if (blockDim.x > 1) { - sum = BlockReduceAlongDimX(sdata, sum); + sum = BlockReduceAlongDimX(sdata, sum); } // 3. input-max-log_sum and write to output diff --git a/paddle/fluid/operators/lstsq_op.h b/paddle/fluid/operators/lstsq_op.h index 4819bd725183248d8711b94c546e11e6d30026ab..f39d65d681f2f8f0e18c2fae13154d76b8b2f76c 100644 --- a/paddle/fluid/operators/lstsq_op.h +++ b/paddle/fluid/operators/lstsq_op.h @@ -18,7 +18,6 @@ #include #include #include "paddle/fluid/operators/eig_op.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/operators/math/eigen_values_vectors.h" #include "paddle/fluid/operators/math/lapack_function.h" #include "paddle/fluid/operators/math/matrix_solve.h" @@ -26,6 +25,7 @@ #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/operators/triangular_solve_op.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" #include "paddle/pten/kernels/funcs/math_function.h" #define EPSILON 1e-6 @@ -46,7 +46,7 @@ template class LstsqCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - using ValueType = math::Real; + using ValueType = pten::funcs::Real; const Tensor& x = *context.Input("X"); auto y = context.Input("Y"); @@ -169,7 +169,7 @@ class LstsqCPUKernel : public framework::OpKernel { &rwkopt, &info); } - lwork = std::max(1, static_cast(math::Real(wkopt))); + lwork = std::max(1, static_cast(pten::funcs::Real(wkopt))); Tensor work; work.Resize(framework::make_ddim({lwork})); T* work_data = work.mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/lu_op.h b/paddle/fluid/operators/lu_op.h index 11174540cb0cd1f6e971c9fb85338b2eeb8bbfa0..0d05d766e67fb16c75d5fb0f9c798c7048a1c7f9 100644 --- a/paddle/fluid/operators/lu_op.h +++ b/paddle/fluid/operators/lu_op.h @@ -211,8 +211,9 @@ void Tensor_Conj(const DeviceContext& dev_ctx, const framework::Tensor& tensor, framework::Tensor* out) { out->Resize(tensor.dims()); platform::ForRange out_for_range(dev_ctx, tensor.numel()); - math::ConjFunctor out_functor(tensor.data(), tensor.numel(), - out->mutable_data(dev_ctx.GetPlace())); + pten::funcs::ConjFunctor out_functor( + tensor.data(), tensor.numel(), + out->mutable_data(dev_ctx.GetPlace())); out_for_range(out_functor); } diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index f9a4e963c0c478e2d4e4bb35b2ddf63e0ac7e8b8..0e6b63be90ef695801c8dc820985d3562ab429ae 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -813,6 +813,102 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 8000 } +template <> +template <> +inline void Blas::GEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 *A, + const platform::bfloat16 *B, platform::bfloat16 beta, + platform::bfloat16 *C) const { +#if CUDA_VERSION >= 11000 + 
// Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 80, + platform::errors::InvalidArgument( + "cublas fp16 gemm requires GPU compute capability >= 80," + "but received %d", + context_.GetComputeCapability())); + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + + cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; + bool use_tensor_op_math = context_.tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx( + handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_16BF, ldb, A, + CUDA_R_16BF, lda, &h_beta, C, CUDA_R_16BF, N, CUDA_R_32F, algo)); + }); +#else + // raise error + PADDLE_THROW(platform::errors::Unimplemented( + "cublasGemmEx with bfloat16 is not supported on cuda <= 11")); + +#endif // CUDA_VERSION >= 11000 +} + +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, platform::bfloat16 alpha, + const platform::bfloat16 *A, + const platform::bfloat16 *B, + platform::bfloat16 beta, + platform::bfloat16 *C) const { +#if CUDA_VERSION >= 11000 + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 80, + platform::errors::InvalidArgument( + "cublas bf16 gemm requires GPU compute capability >= 80," + "but received %d", + context_.GetComputeCapability())); + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + + cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; + bool use_tensor_op_math = context_.tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? 
"True" : "False"); + + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx( + handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_16BF, ldb, A, + CUDA_R_16BF, lda, &h_beta, C, CUDA_R_16BF, N, CUDA_R_32F, algo)); + }); +#else + // raise error + PADDLE_THROW(platform::errors::Unimplemented( + "cublasGemmEx with bfloat16 is not supported on cuda <= 11")); + +#endif // CUDA_VERSION >= 11000 +} + template <> template <> inline void Blas::GEMM( @@ -1208,6 +1304,42 @@ inline void Blas::GEMV(bool trans_a, int M, int N, } } +template <> +template <> +inline void Blas::GEMV( + bool trans_a, int M, int N, platform::bfloat16 alpha, + const platform::bfloat16 *A, const platform::bfloat16 *B, + platform::bfloat16 beta, platform::bfloat16 *C) const { + // Because cublas doesn't support bfloat gemv, we use cublasHgemm to achieve + // it. + if (trans_a) { + this->template GEMM(CblasNoTrans, CblasNoTrans, 1, N, M, + alpha, B, A, beta, C); + } else { + this->template GEMM(CblasNoTrans, CblasNoTrans, M, 1, N, + alpha, A, B, beta, C); + } +} + +template <> +template <> +inline void Blas::GEMV(bool trans_a, int M, int N, + platform::bfloat16 alpha, + const platform::bfloat16 *A, + const platform::bfloat16 *B, + platform::bfloat16 beta, + platform::bfloat16 *C) const { + // Because cublas doesn't support bfloat gemv, we use cublasHgemm to achieve + // it. + if (trans_a) { + this->template GEMM(CblasNoTrans, CblasNoTrans, 1, N, M, + alpha, B, A, beta, C); + } else { + this->template GEMM(CblasNoTrans, CblasNoTrans, M, 1, N, + alpha, A, B, beta, C); + } +} + template <> template void Blas::BatchedGEMM( @@ -1306,6 +1438,91 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 9010 } +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 *A, + const platform::bfloat16 *B, platform::bfloat16 beta, platform::bfloat16 *C, + int batchCount, int64_t strideA, int64_t strideB) const { +#if CUDA_VERSION >= 11000 + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + const int64_t strideC = M * N; + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + bool use_tensor_op_math = context_.tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? 
"True" : "False"); + + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmStridedBatchedEx( + handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_16BF, ldb, + strideB, A, CUDA_R_16BF, lda, strideA, &h_beta, C, CUDA_R_16BF, ldc, + strideC, batchCount, CUBLAS_COMPUTE_32F, algo)); + }); +#else + // raise error + PADDLE_THROW(platform::errors::Unimplemented( + "cublasGemmStridedBatchedEx with bfloat16 is not supported on cuda <= " + "11")); +#endif // CUDA_VERSION >= 11000 +} + +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 *A, + const platform::bfloat16 *B, platform::bfloat16 beta, platform::bfloat16 *C, + int batchCount, int64_t strideA, int64_t strideB) const { +#if CUDA_VERSION >= 11000 + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + const int64_t strideC = M * N; + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + bool use_tensor_op_math = context_.tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); + + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmStridedBatchedEx( + handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_16BF, ldb, + strideB, A, CUDA_R_16BF, lda, strideA, &h_beta, C, CUDA_R_16BF, ldc, + strideC, batchCount, CUBLAS_COMPUTE_32F, algo)); + }); +#else + // raise error + PADDLE_THROW(platform::errors::Unimplemented( + "cublasGemmStridedBatchedEx with bfloat16 is not supported on cuda <= " + "11")); +#endif // CUDA_VERSION >= 11000 +} + template <> template void Blas::BatchedGEMM( @@ -1356,6 +1573,32 @@ inline void Blas::BatchedGEMM( } } +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 **A, + const platform::bfloat16 **B, platform::bfloat16 beta, + platform::bfloat16 **C, int batchCount) const { + for (int k = 0; k < batchCount; ++k) { + this->template GEMM(transA, transB, M, N, K, alpha, + A[k], B[k], beta, C[k]); + } +} + +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 **A, + const platform::bfloat16 **B, platform::bfloat16 beta, + platform::bfloat16 **C, int batchCount) const { + for (int k = 0; k < batchCount; ++k) { + this->template GEMM(transA, transB, M, N, K, alpha, + A[k], B[k], beta, C[k]); + } +} + template <> template void Blas::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, diff --git a/paddle/fluid/operators/math/blas_impl.hip.h b/paddle/fluid/operators/math/blas_impl.hip.h index 980caa9cfe68c64a1afd21a82d366b5228f8f026..9518da89edeb01a1dc35c2a6544ff2e55297a697 100644 --- a/paddle/fluid/operators/math/blas_impl.hip.h +++ b/paddle/fluid/operators/math/blas_impl.hip.h @@ -550,6 +550,84 @@ 
inline void Blas::GEMM(CBLAS_TRANSPOSE transA, rocblas_datatype_f16_r, N, rocblas_datatype_f32_r); } +template <> +template <> +inline void Blas::GEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 *A, + const platform::bfloat16 *B, platform::bfloat16 beta, + platform::bfloat16 *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + // TODO(zhiqiu): 80 has the same meaning for rocm and cuda? + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 80, + platform::errors::InvalidArgument( + "rocblas fp16 gemm requires GPU compute capability >= 80," + "but received %d", + context_.GetComputeCapability())); + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + rocblas_gemm_algo algo = rocblas_gemm_algo_standard; + + context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex( + handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, + rocblas_datatype_bf16_r, ldb, A, rocblas_datatype_bf16_r, lda, &h_beta, + C, rocblas_datatype_bf16_r, N, C, rocblas_datatype_bf16_r, N, + rocblas_datatype_f32_r, algo, 0, 0)); + }); +} + +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, platform::bfloat16 alpha, + const platform::bfloat16 *A, + const platform::bfloat16 *B, + platform::bfloat16 beta, + platform::bfloat16 *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + // TODO(zhiqiu): 80 has the same meaning for rocm and cuda? + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 80, + platform::errors::InvalidArgument( + "rocblas fp16 gemm requires GPU compute capability >= 80," + "but received %d", + context_.GetComputeCapability())); + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + rocblas_gemm_algo algo = rocblas_gemm_algo_standard; + + context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex( + handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, + rocblas_datatype_bf16_r, ldb, A, rocblas_datatype_bf16_r, lda, &h_beta, + C, rocblas_datatype_bf16_r, N, C, rocblas_datatype_bf16_r, N, + rocblas_datatype_f32_r, algo, 0, 0)); + }); +} + template <> template <> inline void Blas::GEMM( @@ -874,6 +952,39 @@ inline void Blas::GEMV(bool trans_a, int M, int N, } } +template <> +template <> +inline void Blas::GEMV( + bool trans_a, int M, int N, platform::bfloat16 alpha, + const platform::bfloat16 *A, const platform::bfloat16 *B, + platform::bfloat16 beta, platform::bfloat16 *C) const { + // Because rocblas doesn't support bfloat16 gemv, we use gemmex to achieve it. 
+ if (trans_a) { + this->template GEMM(CblasNoTrans, CblasNoTrans, 1, N, M, + alpha, B, A, beta, C); + } else { + this->template GEMM(CblasNoTrans, CblasNoTrans, M, 1, N, + alpha, A, B, beta, C); + } +} +template <> +template <> +inline void Blas::GEMV(bool trans_a, int M, int N, + platform::bfloat16 alpha, + const platform::bfloat16 *A, + const platform::bfloat16 *B, + platform::bfloat16 beta, + platform::bfloat16 *C) const { + // Because rocblas doesn't support bfloat16 gemv, we use gemmex to achieve it. + if (trans_a) { + this->template GEMM(CblasNoTrans, CblasNoTrans, 1, N, M, + alpha, B, A, beta, C); + } else { + this->template GEMM(CblasNoTrans, CblasNoTrans, M, 1, N, + alpha, A, B, beta, C); + } +} + template <> template void Blas::BatchedGEMM( @@ -898,6 +1009,7 @@ void Blas::BatchedGEMM( ldc, strideC, batchCount); }); } + template <> template void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, @@ -925,6 +1037,70 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, }); } +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 *A, + const platform::bfloat16 *B, platform::bfloat16 beta, platform::bfloat16 *C, + int batchCount, int64_t strideA, int64_t strideB) const { + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + const int64_t strideC = M * N; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + rocblas_gemm_algo algo = rocblas_gemm_algo_standard; + + context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::rocblas_gemm_strided_batched_ex( + handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, + rocblas_datatype_bf16_r, ldb, strideB, A, rocblas_datatype_bf16_r, + lda, strideA, &h_beta, C, rocblas_datatype_bf16_r, ldc, strideC, C, + rocblas_datatype_bf16_r, ldc, strideC, batchCount, + rocblas_datatype_f32_r, algo, 0, 0)); + }); +} + +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 *A, + const platform::bfloat16 *B, platform::bfloat16 beta, platform::bfloat16 *C, + int batchCount, int64_t strideA, int64_t strideB) const { + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + const int64_t strideC = M * N; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? 
rocblas_operation_none + : rocblas_operation_transpose; + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + rocblas_gemm_algo algo = rocblas_gemm_algo_standard; + + context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::rocblas_gemm_strided_batched_ex( + handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, + rocblas_datatype_bf16_r, ldb, strideB, A, rocblas_datatype_bf16_r, + lda, strideA, &h_beta, C, rocblas_datatype_bf16_r, ldc, strideC, C, + rocblas_datatype_bf16_r, ldc, strideC, batchCount, + rocblas_datatype_f32_r, algo, 0, 0)); + }); +} + template <> template void Blas::BatchedGEMM( @@ -935,6 +1111,7 @@ void Blas::BatchedGEMM( C[k]); } } + template <> template void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, @@ -973,6 +1150,32 @@ inline void Blas::BatchedGEMM( } } +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 **A, + const platform::bfloat16 **B, platform::bfloat16 beta, + platform::bfloat16 **C, int batchCount) const { + for (int k = 0; k < batchCount; ++k) { + this->template GEMM(transA, transB, M, N, K, alpha, + A[k], B[k], beta, C[k]); + } +} + +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 **A, + const platform::bfloat16 **B, platform::bfloat16 beta, + platform::bfloat16 **C, int batchCount) const { + for (int k = 0; k < batchCount; ++k) { + this->template GEMM(transA, transB, M, N, K, alpha, + A[k], B[k], beta, C[k]); + } +} + template <> template void Blas::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, diff --git a/paddle/fluid/operators/math/eigen_values_vectors.h b/paddle/fluid/operators/math/eigen_values_vectors.h index 9ce615c949ffcb0e7ef300dfdc4f45b87604ad0c..b946d4d072ba2e276df632e5fea6960fbbe17975 100644 --- a/paddle/fluid/operators/math/eigen_values_vectors.h +++ b/paddle/fluid/operators/math/eigen_values_vectors.h @@ -63,7 +63,7 @@ struct MatrixEighFunctor { void operator()(const framework::ExecutionContext &ctx, const Tensor &input, Tensor *eigen_values, Tensor *eigen_vectors, bool is_lower, bool has_vectors) { - using ValueType = math::Real; + using ValueType = pten::funcs::Real; auto *out_value = eigen_values->mutable_data(ctx.GetPlace()); auto dito = @@ -123,9 +123,9 @@ struct MatrixEighFunctor { for (auto i = 0; i < batch_size; i++) { auto *value_data = out_value + i * values_stride; auto *input_data = input_vector + i * vector_stride; - math::lapackEigh>(jobz, uplo, n, input_data, lda, value_data, - work_data, lwork, rwork_data, lrwork, - iwork_data, liwork, &info); + math::lapackEigh>( + jobz, uplo, n, input_data, lda, value_data, work_data, lwork, + rwork_data, lrwork, iwork_data, liwork, &info); CheckEighResult(i, info); } if (has_vectors) { @@ -151,7 +151,7 @@ struct MatrixEighFunctor { void operator()(const framework::ExecutionContext &ctx, const Tensor &input, Tensor *eigen_values, Tensor *eigen_vectors, bool is_lower, bool has_vectors) { - using ValueType = math::Real; + using ValueType = pten::funcs::Real; auto *out_value = eigen_values->mutable_data(ctx.GetPlace()); auto &dev_ctx = ctx.template device_context(); @@ -233,7 +233,7 @@ struct MatrixEighFunctor { } } - using ValueType = math::Real; + using ValueType = pten::funcs::Real; inline void EvdBuffer(cusolverDnHandle_t handle, cusolverEigMode_t jobz, 
cublasFillMode_t uplo, int n, const T *A, int lda, const ValueType *W, int *lwork) const; diff --git a/paddle/fluid/operators/math/inclusive_scan.h b/paddle/fluid/operators/math/inclusive_scan.h index 5fdc2889a88858769c4bdf445367dc60265d6cbf..1c750fcb832c1ca0fae51c6c5f818fe82923897e 100644 --- a/paddle/fluid/operators/math/inclusive_scan.h +++ b/paddle/fluid/operators/math/inclusive_scan.h @@ -26,9 +26,9 @@ namespace cub = hipcub; #include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" namespace paddle { namespace operators { @@ -115,7 +115,7 @@ static __global__ void InclusiveScanInnerDimCUDAKernel(const T *x, T *y, size_t num_rows, size_t row_size, T init, BinaryOp op) { - using RealT = math::Real; + using RealT = pten::funcs::Real; constexpr auto kSharedBufferSize = framework::IsComplex::value ? 4 * kThreadNumX : 2 * kThreadNumX; __shared__ RealT sbuf[kThreadNumY][kSharedBufferSize]; diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc new file mode 100644 index 0000000000000000000000000000000000000000..506b57186965de8fff758a958cc0e87b374e64bc --- /dev/null +++ b/paddle/fluid/operators/math/math_function.cc @@ -0,0 +1,313 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/math_function.h" + +#ifdef PADDLE_WITH_MKLML +#include "paddle/fluid/platform/dynload/mklml.h" +#endif + +#ifdef PADDLE_USE_OPENBLAS +#include +#endif + +#include +#include +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/operators/math/math_function_impl.h" +#include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/kernels/funcs/eigen/common.h" +#include "unsupported/Eigen/CXX11/Tensor" + +namespace paddle { +namespace operators { +namespace math { + +using float16 = paddle::platform::float16; + +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; + +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; + +#ifdef PADDLE_WITH_XPU +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; +#endif + +#define DEFINE_CPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose, RANK>; \ + template struct Transpose, RANK>; + +DEFINE_CPU_TRANS(1); +DEFINE_CPU_TRANS(2); +DEFINE_CPU_TRANS(3); +DEFINE_CPU_TRANS(4); +DEFINE_CPU_TRANS(5); +DEFINE_CPU_TRANS(6); + +template +struct TransposeNormal { + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& in, framework::Tensor* out, + const std::vector& axis) { + const int rank = axis.size(); + auto in_stride = framework::stride(in.dims()); + auto out_stride = framework::stride(out->dims()); + const T* in_ptr = in.data(); + T* out_ptr = out->data(); + + auto transpose_helper = [&](int64_t beg, int64_t end) { + for (int64_t out_idx = beg; out_idx < end; ++out_idx) { + int64_t in_idx = 0; + int64_t tmp_idx = out_idx; + // calculate the input index + for (int i = 0; i < rank; ++i) { + const int64_t coordinate = tmp_idx / out_stride[i]; + tmp_idx -= coordinate * out_stride[i]; + in_idx += coordinate * in_stride[axis[i]]; + } + out_ptr[out_idx] = in_ptr[in_idx]; + } + }; + transpose_helper(0, out->numel()); + } +}; + +// define transpose normal +#define DEFINE_CPU_TRANS_NORMAL(TYPE) \ + template struct TransposeNormal + +DEFINE_CPU_TRANS_NORMAL(platform::float16); +DEFINE_CPU_TRANS_NORMAL(platform::bfloat16); +DEFINE_CPU_TRANS_NORMAL(float); +DEFINE_CPU_TRANS_NORMAL(double); +DEFINE_CPU_TRANS_NORMAL(int); +DEFINE_CPU_TRANS_NORMAL(int64_t); +DEFINE_CPU_TRANS_NORMAL(bool); +DEFINE_CPU_TRANS_NORMAL(int16_t); +DEFINE_CPU_TRANS_NORMAL(uint8_t); 
+DEFINE_CPU_TRANS_NORMAL(int8_t); +DEFINE_CPU_TRANS_NORMAL(platform::complex); +DEFINE_CPU_TRANS_NORMAL(platform::complex); + +struct TensorSetConstantCPU { + TensorSetConstantCPU(framework::Tensor* tensor, float value) + : tensor_(tensor), value_(value) {} + template + void apply() const { + auto cpu = platform::CPUPlace(); + auto* begin = tensor_->mutable_data(cpu); + std::fill(begin, begin + tensor_->numel(), static_cast(value_)); + } + framework::Tensor* tensor_; + float value_; +}; + +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported")); +} + +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + PADDLE_THROW(platform::errors::Unimplemented("NPUPlace is not supported")); +} + +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + PADDLE_THROW( + platform::errors::Unimplemented("NPUPinnedPlace is not supported")); +} + +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + PADDLE_THROW(platform::errors::Unimplemented("IPUPlace is not supported")); +} + +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value)); +} + +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + PADDLE_THROW(platform::errors::Unimplemented("MLUPlace is not supported")); +} + +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + PADDLE_THROW(platform::errors::Unimplemented("CustomPlace is not supported")); +} + +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value)); +} + +struct TensorSetConstantWithPlace : public boost::static_visitor { + TensorSetConstantWithPlace(const platform::DeviceContext& context, + framework::Tensor* tensor, float value) + : context_(context), tensor_(tensor), value_(value) {} + + template + void operator()(Place place) const { + set_constant_with_place(context_, tensor_, value_); + } + + const platform::DeviceContext& context_; + framework::Tensor* tensor_; + float value_; +}; + +void set_constant(const platform::DeviceContext& context, + framework::Tensor* tensor, float value) { + TensorSetConstantWithPlace func(context, tensor, value); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // tensor->place().apply_visitor(func); + paddle::platform::VisitPlace(tensor->place(), func); +#else + func(platform::CPUPlace()); +#endif +} + +template +struct RowwiseAdd { + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& vector, framework::Tensor* output) { + auto in_dims = input.dims(); + auto out_dims = output->dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ( + vector.numel(), size, + platform::errors::InvalidArgument( + "The input vector size" + " should be equal to the size of each row of input tensor." 
+ " Expected vector size=%d, but received %d", + size, vector.numel())); + const char* in_dims_cstr = in_dims.to_str().c_str(); + const char* out_dims_cstr = out_dims.to_str().c_str(); + PADDLE_ENFORCE_EQ(out_dims, in_dims, + platform::errors::InvalidArgument( + "The output tensor shape should be same as the input" + " tensor shape. Expected output tensor shape: %s," + " but received %s", + in_dims_cstr, out_dims_cstr)); + + auto in = framework::EigenMatrix::From(input); + auto vec = framework::EigenVector::Flatten(vector); + auto out = framework::EigenMatrix::From(*output); + + for (int64_t i = 0; i < in_dims[0]; ++i) { + out.chip(i, 0) = in.chip(i, 0) + vec; + } + } +}; + +template struct RowwiseAdd; +template struct RowwiseAdd; + +template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; + +template struct RowwiseSum; +template struct RowwiseSum; + +template struct RowwiseMean; +template struct RowwiseMean; + +template +struct ElementwiseAddTo { + void operator()(platform::CPUDeviceContext* ctx, const framework::Tensor& src, + framework::Tensor* dst) { + auto in = framework::EigenVector::Flatten(src); + auto out = framework::EigenVector::Flatten(*dst); + auto& place = *(ctx->eigen_device()); + out.device(place) = out + in; + } +}; + +template struct ElementwiseAddTo; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc index eaec4b78f4fc0401c907fe0481d9b9e1da1b8ff4..40f2b625f65006061f24779c0aee2b92ec297890 100644 --- a/paddle/fluid/operators/matmul_v2_op.cc +++ b/paddle/fluid/operators/matmul_v2_op.cc @@ -527,7 +527,7 @@ REGISTER_OPERATOR(matmul_v2, ops::MatMulV2Op, ops::MatMulV2OpMaker, ops::MatMulV2GradOpMaker); DELCARE_INFER_SHAPE_FUNCTOR(matmul_v2_grad, MatMulV2GradInferShapeFunctor, - PT_INFER_META(pten::MatmulGradInferMeta)); + PT_INFER_META(pten::GeneralBinaryGradInferMeta)); REGISTER_OPERATOR(matmul_v2_grad, ops::MatMulV2OpGrad, ops::MatMulV2OpDoubleGradMaker, ops::MatMulV2OpDoubleGradMaker, diff --git a/paddle/fluid/operators/matmul_v2_op.h b/paddle/fluid/operators/matmul_v2_op.h index 0e1c6b82e41922cb1a7fd8404ffae1135e7872a0..6fac2d1038334528b87c056ae0d14a366432d5bc 100644 --- a/paddle/fluid/operators/matmul_v2_op.h +++ b/paddle/fluid/operators/matmul_v2_op.h @@ -22,8 +22,8 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/dot_op.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" // only can include the headers in paddle/pten/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" diff --git a/paddle/fluid/operators/matrix_rank_op.cu b/paddle/fluid/operators/matrix_rank_op.cu index d974d7c1b78f15bb5e0f050b4e415af453e4349f..2df794fb794430910e71c0980154f682c3f4920d 100644 --- a/paddle/fluid/operators/matrix_rank_op.cu +++ b/paddle/fluid/operators/matrix_rank_op.cu @@ -18,11 +18,11 @@ limitations under the License. 
*/ #include #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/operators/matrix_rank_op.h" #include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/platform/dynload/cusolver.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" #include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { @@ -93,8 +93,8 @@ class MatrixRankGPUKernel : public framework::OpKernel { info_ptr); platform::ForRange for_range( dev_ctx, eigenvalue_tensor.numel()); - math::AbsFunctor functor(eigenvalue_data, eigenvalue_data, - eigenvalue_tensor.numel()); + pten::funcs::AbsFunctor functor(eigenvalue_data, eigenvalue_data, + eigenvalue_tensor.numel()); for_range(functor); } else { Tensor U, VH; diff --git a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc index 8efd2b226cad6f27c12036d863dba5a60ebf586f..2c84218c48e0bcc2d22d032bf5b3e949424aec3a 100644 --- a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc @@ -71,9 +71,6 @@ static Tensor FoldFirstAndLastDims(const MKLDNNDeviceContext& dev_ctx, auto reorder_p = reorder_handler.AcquireReorder(reorder_src_memory_p, reorder_dst_memory_p); - paddle::platform::RecordEvent record_reorder( - "int_reorder", paddle::platform::EventRole::kUniqueOp); - auto& astream = MKLDNNDeviceContext::tls().get_stream(); reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); astream.wait(); diff --git a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc index c44f22dd02face48fe344ea2ee91ead4e9836837..deb8c735c8b0260d80c016439b8f1ae6765b56c5 100644 --- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc @@ -106,12 +106,8 @@ class QuantOpKernel : public framework::OpKernel { reorder_p = std::shared_ptr(new reorder(*reorder_pd)); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - { - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, *src_memory, *dst_memory); - astream.wait(); - } + reorder_p->execute(astream, *src_memory, *dst_memory); + astream.wait(); output->set_layout(DataLayout::kMKLDNN); output->set_format(GetMKLDNNFormat(*dst_memory)); diff --git a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc index 1b1bd69aec2f4d88a65d66fd9a59d9ea9c78ee66..963f10441f9bdf2ac3369a770c43ca92ac21a7bf 100644 --- a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc @@ -138,12 +138,9 @@ class ReQuantOpKernel : public framework::OpKernel { } auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - { - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, *src_memory, *dst_memory); - astream.wait(); - } + + reorder_p->execute(astream, *src_memory, *dst_memory); + astream.wait(); output->set_layout(framework::DataLayout::kMKLDNN); output->set_format(platform::GetMKLDNNFormat(*dst_memory)); diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc index 
5df2a546812adec055573a0d9c2c5c373fbed928..9c63afff13c22c7ad4ec283f2b25c2bc4535e6d1 100644 --- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc @@ -174,12 +174,9 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { output, in_out.format(), ctx.GetPlace()); auto reorder_p = reorder_handler.AcquireReorder(target_mem, dst_mem); - { - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, *dst_mem, *target_mem); - astream.wait(); - } + + reorder_p->execute(astream, *dst_mem, *target_mem); + astream.wait(); } output->set_layout(framework::DataLayout::kMKLDNN); output->set_format(platform::GetMKLDNNFormat(*dst_mem)); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc index 7251653793f89900efa5382db74201a1fc232574..7bd2eb5c5eba6733c2c52f745b28fa4230d12b64 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc @@ -25,7 +25,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP(elementwise_mul); USE_OP_DEVICE_KERNEL(elementwise_mul, MKLDNN); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc index 0612417c46ce30a73ce0cbc582be740023ff0ab6..6be0e703e564ceb397ea90c810f4018388b2838e 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc @@ -25,7 +25,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index b1001b4e5684be02df4784711ad459cd2005affb..82ea75943dee41c1c52b2f6e6f1bb9a71fa4a8f3 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/operator.h" @@ -1150,6 +1151,18 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { output_desc, output, workspace_ptr, workspace_size)); } +/* static */ void MLUCnnl::AdaptivePoolingForward( + const ExecutionContext& ctx, cnnlPoolingMode_t pool_mode, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output, + const cnnlTensorDescriptor_t index_desc, void* index) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlAdaptivePoolingForward(handle, input_desc, input, pool_mode, + output_desc, output, index_desc, index)); +} + /* static */ void MLUCnnl::Pool3D( const ExecutionContext& ctx, cnnlPoolingMode_t pool_mode, const std::vector& output_shape, @@ -1801,6 +1814,17 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { y, diff_y_desc, diff_y, x_desc, x, beta, diff_x_desc, diff_x)); } +/* static */ void MLUCnnl::AdaptivePoolingBackward( + const ExecutionContext& ctx, const cnnlPoolingMode_t pool_mode, + const cnnlTensorDescriptor_t y_desc, const void* y, + const cnnlTensorDescriptor_t index_desc, const void* index, + const cnnlTensorDescriptor_t diff_x_desc, void* diff_x) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlAdaptivePoolingBackward( + handle, y_desc, y, index_desc, index, pool_mode, diff_x_desc, diff_x)); +} + /* static */ void MLUCnnl::NonMaxSuppression( const ExecutionContext& ctx, const cnnlNmsDescriptor_t nms_desc, const cnnlTensorDescriptor_t boxes_desc, const void* boxes, diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index ad912c034683f491ef782e1494a96e2442865385..91eddaf792e8aed0097aecba2c8295ed65262b50 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -649,6 +649,12 @@ class MLUCnnl { const void* input, const void* beta, const void* extra_input_ptr, const cnnlTensorDescriptor_t output_desc, void* output); + static void AdaptivePoolingForward( + const ExecutionContext& ctx, cnnlPoolingMode_t pool_mode, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output, + const cnnlTensorDescriptor_t index_desc, void* index); + static void Pool3D(const ExecutionContext& ctx, cnnlPoolingMode_t pool_mode, const std::vector& output_shape, cnnlPoolingDescriptor_t pooling_desc, const void* alpha, @@ -958,6 +964,12 @@ class MLUCnnl { const cnnlTensorDescriptor_t x_desc, const void* x, const void* beta, const cnnlTensorDescriptor_t diff_x_desc, void* diff_x); + static void AdaptivePoolingBackward( + const ExecutionContext& ctx, const cnnlPoolingMode_t pool_mode, + const cnnlTensorDescriptor_t y_desc, const void* y, + const cnnlTensorDescriptor_t index_desc, const void* index, + const cnnlTensorDescriptor_t diff_x_desc, void* diff_x); + static void PoolingIndex(const ExecutionContext& ctx, const cnnlPoolingDescriptor_t pooling_desc, const cnnlTensorDescriptor_t x_desc, const void* x, diff --git a/paddle/fluid/operators/op_debug_string_test.cc b/paddle/fluid/operators/op_debug_string_test.cc index 7c1cf9109c566625743f69de8cf3213855600c69..b96fcaa486cce8099cf1d03c7d948ea74c1923ad 100644 --- a/paddle/fluid/operators/op_debug_string_test.cc +++ b/paddle/fluid/operators/op_debug_string_test.cc 
@@ -18,7 +18,7 @@ #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" -USE_OP(elementwise_add_grad); +USE_OP_ITSELF(elementwise_add_grad); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/pool_op_mlu.cc b/paddle/fluid/operators/pool_op_mlu.cc index a64a9c274ed7dcf96abc43e34e9a21a4dbe7a6be..1bbd671323e6d9b189844556d1071d55e7fba57c 100644 --- a/paddle/fluid/operators/pool_op_mlu.cc +++ b/paddle/fluid/operators/pool_op_mlu.cc @@ -21,12 +21,12 @@ namespace operators { namespace { cnnlPoolingMode_t ToCnnlPoolingMode(const std::string &pooling_type, - bool exclusive) { + bool exclusive, bool adaptive) { cnnlPoolingMode_t pooling_mode; if (pooling_type == "max") { pooling_mode = CNNL_POOLING_MAX; } else if (pooling_type == "avg") { - if (exclusive) { + if (exclusive && !adaptive) { pooling_mode = CNNL_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; } else { pooling_mode = CNNL_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; @@ -64,10 +64,7 @@ class MLUPoolOpKernel : public framework::OpKernel { platform::errors::InvalidArgument( "Only support 4-dims for mlu pool2d kernel.")); - PADDLE_ENFORCE_EQ(adaptive, false, - platform::errors::InvalidArgument( - "Not support adaptive for mlu pool2d kernel.")); - + const bool channel_last = data_format == "NHWC"; // default cnnlTensorLayout_t cnnl_layout = CNNL_LAYOUT_NCHW; auto out_dims = out->dims(); @@ -77,7 +74,6 @@ class MLUPoolOpKernel : public framework::OpKernel { framework::DDim data_dims = framework::slice_ddim(in_x_dims, 2, in_x_dims.size()); - const bool channel_last = data_format == "NHWC"; if (channel_last) { cnnl_layout = CNNL_LAYOUT_NHWC; out_h = out_dims[1]; @@ -94,42 +90,74 @@ class MLUPoolOpKernel : public framework::OpKernel { MLUCnnlTensorDesc in_x_desc(*in_x, cnnl_layout, ToCnnlDataType()); MLUCnnlTensorDesc out_desc(*out, cnnl_layout, ToCnnlDataType()); - cnnlPoolingMode_t pool_mode = ToCnnlPoolingMode(pooling_type, exclusive); - MLUCnnlPoolingDesc pool_desc( - pool_mode, CNNL_NOT_PROPAGATE_NAN, ksize[0], ksize[1], paddings[0], - paddings[1], paddings[2], paddings[3], strides[0], strides[1], - 1 /*row_dilation*/, 1 /*col_dilation*/, ceil_mode); + cnnlPoolingMode_t pool_mode = + ToCnnlPoolingMode(pooling_type, exclusive, adaptive); + + if (!adaptive) { + MLUCnnlPoolingDesc pool_desc( + pool_mode, CNNL_NOT_PROPAGATE_NAN, ksize[0], ksize[1], paddings[0], + paddings[1], paddings[2], paddings[3], strides[0], strides[1], + 1 /*row_dilation*/, 1 /*col_dilation*/, ceil_mode); + + size_t extra_input_size = 0; + cnnlHandle_t handle = + ctx.template device_context().cnnl_handle(); + cnnlGetPoolingExtraInputSize(handle, pool_mode, out_w, out_h, + &extra_input_size); - size_t extra_input_size = 0; - cnnlHandle_t handle = - ctx.template device_context().cnnl_handle(); - cnnlGetPoolingExtraInputSize(handle, pool_mode, out_w, out_h, - &extra_input_size); - - if (extra_input_size > 0) { - paddle::platform::CPUDeviceContext cpu_ctx; - framework::Tensor extra_host_tensor = - ctx.AllocateTmpTensor( - {static_cast(extra_input_size)}, cpu_ctx); - cnnlInitPoolingExtraInput(handle, pool_desc.get(), in_x_desc.get(), - out_desc.get(), GetBasePtr(&extra_host_tensor)); - framework::Tensor extra_device_tensor = - ctx.AllocateTmpTensor( - {static_cast(extra_input_size)}, dev_ctx); - // TODO(fwg): use Async copy, and add a callback to stream that free host - // memory. 
- framework::TensorCopySync(extra_host_tensor, ctx.GetPlace(), - &extra_device_tensor); - MLUCnnl::PoolingForward( - ctx, pool_mode, out_h, out_w, pool_desc.get(), nullptr /*alpha*/, - in_x_desc.get(), GetBasePtr(in_x), nullptr /*beta*/, - GetBasePtr(&extra_device_tensor) /*params_shape_ptr*/, out_desc.get(), - GetBasePtr(out)); + if (extra_input_size > 0) { + paddle::platform::CPUDeviceContext cpu_ctx; + framework::Tensor extra_host_tensor = + ctx.AllocateTmpTensor( + {static_cast(extra_input_size)}, cpu_ctx); + cnnlInitPoolingExtraInput(handle, pool_desc.get(), in_x_desc.get(), + out_desc.get(), + GetBasePtr(&extra_host_tensor)); + framework::Tensor extra_device_tensor = + ctx.AllocateTmpTensor( + {static_cast(extra_input_size)}, dev_ctx); + // TODO(fwg): use Async copy, and add a callback to stream that free + // host + // memory. + framework::TensorCopySync(extra_host_tensor, ctx.GetPlace(), + &extra_device_tensor); + MLUCnnl::PoolingForward( + ctx, pool_mode, out_h, out_w, pool_desc.get(), nullptr /*alpha*/, + in_x_desc.get(), GetBasePtr(in_x), nullptr /*beta*/, + GetBasePtr(&extra_device_tensor) /*params_shape_ptr*/, + out_desc.get(), GetBasePtr(out)); + } else { + MLUCnnl::PoolingForward( + ctx, pool_mode, out_h, out_w, pool_desc.get(), nullptr /*alpha*/, + in_x_desc.get(), GetBasePtr(in_x), nullptr /*beta*/, + nullptr /*params_shape_ptr*/, out_desc.get(), GetBasePtr(out)); + } } else { - MLUCnnl::PoolingForward( - ctx, pool_mode, out_h, out_w, pool_desc.get(), nullptr /*alpha*/, - in_x_desc.get(), GetBasePtr(in_x), nullptr /*beta*/, - nullptr /*params_shape_ptr*/, out_desc.get(), GetBasePtr(out)); + // cnnl Adaptive pooling only support NHWC layout + framework::Tensor trans_in_x; + framework::Tensor trans_out; + if (channel_last) { + trans_in_x = *in_x; + trans_out = *out; + } else { + std::vector perm{0, 2, 3, 1}; + TransposeFromMLUTensor(ctx, perm, in_x, &trans_in_x, + true /*need_reshape_or_alloc*/); + trans_out = ctx.AllocateTmpTensor( + {out_dims[0], out_dims[2], out_dims[3], out_dims[1]}, dev_ctx); + } + MLUCnnlTensorDesc trans_in_x_desc(trans_in_x, CNNL_LAYOUT_NHWC, + ToCnnlDataType()); + MLUCnnlTensorDesc trans_out_desc(trans_out, CNNL_LAYOUT_NHWC, + ToCnnlDataType()); + MLUCnnl::AdaptivePoolingForward( + ctx, pool_mode, trans_in_x_desc.get(), GetBasePtr(&trans_in_x), + trans_out_desc.get(), GetBasePtr(&trans_out), nullptr, nullptr); + if (!channel_last) { + std::vector perm{0, 3, 1, 2}; + TransposeFromMLUTensor(ctx, perm, &trans_out, out, + false /*need_reshape_or_alloc*/); + } } } }; @@ -204,7 +232,8 @@ class MLUPoolGradOpKernel : public framework::OpKernel { MLUCnnlTensorDesc trans_in_x_grad_desc(trans_in_x_grad, CNNL_LAYOUT_NHWC, ToCnnlDataType()); - cnnlPoolingMode_t pool_mode = ToCnnlPoolingMode(pooling_type, exclusive); + cnnlPoolingMode_t pool_mode = + ToCnnlPoolingMode(pooling_type, exclusive, adaptive); MLUCnnlPoolingDesc pool_desc( pool_mode, CNNL_NOT_PROPAGATE_NAN, ksize[0], ksize[1], paddings[0], paddings[1], paddings[2], paddings[3], strides[0], strides[1], @@ -219,18 +248,34 @@ class MLUPoolGradOpKernel : public framework::OpKernel { MLUCnnl::PoolingIndex(ctx, pool_desc.get(), trans_in_x_desc.get(), GetBasePtr(&trans_in_x), index_tensor_desc.get(), GetBasePtr(&index_tensor)); - MLUCnnl::PoolingBackward( - ctx, pool_desc.get(), nullptr /*alpha*/, index_tensor_desc.get(), - GetBasePtr(&index_tensor), trans_out_grad_desc.get(), - GetBasePtr(&trans_out_grad), trans_in_x_desc.get(), - GetBasePtr(&trans_in_x), nullptr /*beta*/, trans_in_x_grad_desc.get(), - 
GetBasePtr(&trans_in_x_grad)); + if (adaptive) { + MLUCnnl::AdaptivePoolingBackward( + ctx, pool_mode, trans_out_grad_desc.get(), + GetBasePtr(&trans_out_grad), index_tensor_desc.get(), + GetBasePtr(&index_tensor), trans_in_x_grad_desc.get(), + GetBasePtr(&trans_in_x_grad)); + } else { + MLUCnnl::PoolingBackward( + ctx, pool_desc.get(), nullptr /*alpha*/, index_tensor_desc.get(), + GetBasePtr(&index_tensor), trans_out_grad_desc.get(), + GetBasePtr(&trans_out_grad), trans_in_x_desc.get(), + GetBasePtr(&trans_in_x), nullptr /*beta*/, + trans_in_x_grad_desc.get(), GetBasePtr(&trans_in_x_grad)); + } } else { - MLUCnnl::PoolingBackward(ctx, pool_desc.get(), nullptr /*alpha*/, nullptr, - nullptr, trans_out_grad_desc.get(), - GetBasePtr(&trans_out_grad), nullptr, nullptr, - nullptr /*beta*/, trans_in_x_grad_desc.get(), - GetBasePtr(&trans_in_x_grad)); + if (adaptive) { + MLUCnnl::AdaptivePoolingBackward( + ctx, pool_mode, trans_out_grad_desc.get(), + GetBasePtr(&trans_out_grad), nullptr /*index_tensor_desc.get()*/, + nullptr /*GetBasePtr(&index_tensor)*/, trans_in_x_grad_desc.get(), + GetBasePtr(&trans_in_x_grad)); + } else { + MLUCnnl::PoolingBackward(ctx, pool_desc.get(), nullptr /*alpha*/, + nullptr, nullptr, trans_out_grad_desc.get(), + GetBasePtr(&trans_out_grad), nullptr, nullptr, + nullptr /*beta*/, trans_in_x_grad_desc.get(), + GetBasePtr(&trans_in_x_grad)); + } } if (!channel_last) { std::vector perm{0, 3, 1, 2}; diff --git a/paddle/fluid/operators/qr_op.cu b/paddle/fluid/operators/qr_op.cu index c8b6404830cdac0427f99b2e1d2c642fe8aa0f38..dfeec15d9b887aa55b81004b728a7c31fc8b4be7 100644 --- a/paddle/fluid/operators/qr_op.cu +++ b/paddle/fluid/operators/qr_op.cu @@ -56,12 +56,13 @@ class QrGPUKernel : public framework::OpKernel { int tau_stride = min_mn; if (compute_q) { - q.mutable_data>( + q.mutable_data>( context.GetPlace(), - size_t(batch_size * m * k * sizeof(math::Real))); + size_t(batch_size * m * k * sizeof(pten::funcs::Real))); } - r.mutable_data>( - context.GetPlace(), size_t(batch_size * k * n * sizeof(math::Real))); + r.mutable_data>( + context.GetPlace(), + size_t(batch_size * k * n * sizeof(pten::funcs::Real))); auto dito = math::DeviceIndependenceTensorOperations { // Note: allocate temporary tensors because of lacking in-place operatios. 
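// Editorial note (standalone sketch, not part of the patch): the pool_op_mlu.cc
// hunks above enable adaptive pooling on MLU by routing it through cnnl's
// NHWC-only adaptive kernels -- NCHW tensors are transposed with perm {0, 2, 3, 1}
// before the call and back with {0, 3, 1, 2} afterwards. The helper below only
// illustrates what that NCHW -> NHWC permutation does to a flat buffer; the
// function name and types are illustrative and are not Paddle APIs.
#include <cstddef>
#include <vector>

template <typename T>
std::vector<T> TransposeNCHWToNHWC(const std::vector<T>& src, size_t n,
                                   size_t c, size_t h, size_t w) {
  std::vector<T> dst(src.size());
  for (size_t bn = 0; bn < n; ++bn) {
    for (size_t ic = 0; ic < c; ++ic) {
      for (size_t ih = 0; ih < h; ++ih) {
        for (size_t iw = 0; iw < w; ++iw) {
          // flat NCHW offset -> flat NHWC offset
          dst[((bn * h + ih) * w + iw) * c + ic] =
              src[((bn * c + ic) * h + ih) * w + iw];
        }
      }
    }
  }
  return dst;
}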
// Prepare qr Tensor qr; - qr.mutable_data>( - context.GetPlace(), size_t(batch_size * m * n * sizeof(math::Real))); + qr.mutable_data>( + context.GetPlace(), + size_t(batch_size * m * n * sizeof(pten::funcs::Real))); // BatchedGeqrf performs computation in-place and 'qr' must be a copy of // input paddle::framework::TensorCopy(x, context.GetPlace(), &qr); @@ -124,7 +126,8 @@ class QrGPUKernel : public framework::OpKernel { for (int i = 0; i < batch_size; ++i) { memory::Copy(dev_ctx.GetPlace(), (new_qr_data + i * new_qr_stride), dev_ctx.GetPlace(), (qr_data + i * qr_stride), - qr_stride * sizeof(math::Real), dev_ctx.stream()); + qr_stride * sizeof(pten::funcs::Real), + dev_ctx.stream()); } BatchedOrgqr( dev_ctx, batch_size, m, m, min_mn, new_qr_data, m, tau_data, diff --git a/paddle/fluid/operators/qr_op.h b/paddle/fluid/operators/qr_op.h index c55619a4f76e7f316c6c7bcb689e2a101e5908eb..b8308b29106be39bacbf05028809e7206ea63cec 100644 --- a/paddle/fluid/operators/qr_op.h +++ b/paddle/fluid/operators/qr_op.h @@ -18,9 +18,9 @@ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" namespace paddle { namespace operators { @@ -74,17 +74,20 @@ class QrCPUKernel : public framework::OpKernel { int q_stride = m * k; int r_stride = k * n; - auto* x_data = x.data>(); + auto* x_data = x.data>(); T* q_data = nullptr; if (compute_q) { - q_data = q.mutable_data>( + q_data = q.mutable_data>( context.GetPlace(), - size_t(batch_size * m * k * sizeof(math::Real))); - memset(q_data, 0, size_t(batch_size * m * k * sizeof(math::Real))); + size_t(batch_size * m * k * sizeof(pten::funcs::Real))); + memset(q_data, 0, + size_t(batch_size * m * k * sizeof(pten::funcs::Real))); } - auto* r_data = r.mutable_data>( - context.GetPlace(), size_t(batch_size * k * n * sizeof(math::Real))); - memset(r_data, 0, size_t(batch_size * k * n * sizeof(math::Real))); + auto* r_data = r.mutable_data>( + context.GetPlace(), + size_t(batch_size * k * n * sizeof(pten::funcs::Real))); + memset(r_data, 0, + size_t(batch_size * k * n * sizeof(pten::funcs::Real))); // Implement QR by calling Eigen for (int i = 0; i < batch_size; ++i) { @@ -140,7 +143,7 @@ class QrGradKernel : public framework::OpKernel { // Use a different name dA instead of dX framework::Tensor& dA = *ctx.Output(framework::GradVarName("X")); - dA.mutable_data>(ctx.GetPlace()); + dA.mutable_data>(ctx.GetPlace()); auto& dev_ctx = ctx.template device_context(); pten::funcs::SetConstant()(dev_ctx, &dA, T(0)); @@ -222,7 +225,7 @@ class QrGradKernel : public framework::OpKernel { } else { // If m < n for input matrices A, we partition A = [X|Y] and R = [U|V] // Calculate dX and dY individually and concatenate them to get dA - dA.mutable_data>(ctx.GetPlace()); + dA.mutable_data>(ctx.GetPlace()); auto Y = dito.Slice(A, {-1}, {m}, {n}); auto U = dito.Slice(R, {-1}, {0}, {m}); diff --git a/paddle/fluid/operators/real_op.h b/paddle/fluid/operators/real_op.h index 6cc9065269c62716b54c329d46711ff96f83f015..41549393f578ff6109b629a6036cbbef108b398c 100644 --- a/paddle/fluid/operators/real_op.h +++ b/paddle/fluid/operators/real_op.h @@ -16,8 +16,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" namespace paddle { namespace operators { @@ -31,12 +31,13 @@ class RealKernel : public framework::OpKernel { auto numel = x->numel(); auto* x_data = x->data(); - auto* out_data = out->mutable_data>( - ctx.GetPlace(), static_cast(numel * sizeof(math::Real))); + auto* out_data = out->mutable_data>( + ctx.GetPlace(), + static_cast(numel * sizeof(pten::funcs::Real))); auto& dev_ctx = ctx.template device_context(); platform::ForRange for_range(dev_ctx, numel); - math::RealFunctor functor(x_data, out_data, numel); + pten::funcs::RealFunctor functor(x_data, out_data, numel); for_range(functor); } }; @@ -51,13 +52,13 @@ class RealGradKernel : public framework::OpKernel { ctx.Output(framework::GradVarName("X")); auto numel = d_out->numel(); - auto* dout_data = d_out->data>(); + auto* dout_data = d_out->data>(); auto* dx_data = d_x->mutable_data( ctx.GetPlace(), static_cast(numel * sizeof(T))); auto& dev_ctx = ctx.template device_context(); platform::ForRange for_range(dev_ctx, numel); - math::RealToComplexFunctor functor(dout_data, dx_data, numel); + pten::funcs::RealToComplexFunctor functor(dout_data, dx_data, numel); for_range(functor); } }; diff --git a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h index 5dcb81c75407f7c4b4a2f787d04e3085f366b348..a27b6ae90f29a14af5e0a119fb3f5d0182dafa7c 100644 --- a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h +++ b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h @@ -85,9 +85,6 @@ class ReduceMKLDNNKernel : public framework::OpKernel { auto reorder_p = reorder_handler.AcquireReorder(reorder_src_memory_p, reorder_dst_memory_p); - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); astream.wait(); diff --git a/paddle/fluid/operators/renorm_op.h b/paddle/fluid/operators/renorm_op.h index 461f383ad25639fe2db9b64eb490ad1e7a769a4a..753ed9e27ac0918b7f36cd347b190b80714ccde5 100644 --- a/paddle/fluid/operators/renorm_op.h +++ b/paddle/fluid/operators/renorm_op.h @@ -17,8 +17,8 @@ #include "math.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 77c4a2005e3bf71c46b24e75d8c929507d2ca8a0..74095d2ce4e657f247f49818d9280295c68d5247 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -657,30 +657,6 @@ REGISTER_OPERATOR(reshape2_grad_grad, ops::Reshape2DoubleGradOp, ops::ReshapeDoubleGradInplaceInferer, ops::ReshapeDoubleGradOpNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL_FUNCTOR( - reshape2, float, ops::ReshapeKernel, double, ops::ReshapeKernel, int8_t, - ops::ReshapeKernel, uint8_t, ops::ReshapeKernel, int, ops::ReshapeKernel, - int64_t, ops::ReshapeKernel, bool, ops::ReshapeKernel, - paddle::platform::bfloat16, ops::ReshapeKernel, - paddle::platform::complex, ops::ReshapeKernel, - 
paddle::platform::complex, ops::ReshapeKernel); - -REGISTER_OP_CPU_KERNEL_FUNCTOR( - reshape2_grad, float, ops::ReshapeGradKernel, double, - ops::ReshapeGradKernel, int, ops::ReshapeGradKernel, uint8_t, - ops::ReshapeGradKernel, int64_t, ops::ReshapeGradKernel, bool, - ops::ReshapeGradKernel, paddle::platform::bfloat16, ops::ReshapeGradKernel, - paddle::platform::complex, ops::ReshapeGradKernel, - paddle::platform::complex, ops::ReshapeGradKernel); -REGISTER_OP_CPU_KERNEL_FUNCTOR( - reshape2_grad_grad, float, ops::ReshapeDoubleGradKernel, double, - ops::ReshapeDoubleGradKernel, int, ops::ReshapeDoubleGradKernel, uint8_t, - ops::ReshapeDoubleGradKernel, int64_t, ops::ReshapeDoubleGradKernel, bool, - ops::ReshapeDoubleGradKernel, paddle::platform::bfloat16, - ops::ReshapeDoubleGradKernel, paddle::platform::complex, - ops::ReshapeDoubleGradKernel, paddle::platform::complex, - ops::ReshapeDoubleGradKernel); - #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double, ops::ReshapeKernel, int, ops::ReshapeKernel, @@ -695,45 +671,4 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel, ops::ReshapeGradKernel, plat::float16, ops::ReshapeGradKernel, plat::bfloat16, ops::ReshapeGradKernel); -REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double, - ops::ReshapeKernel, int, ops::ReshapeKernel, - uint8_t, ops::ReshapeKernel, int64_t, - ops::ReshapeKernel, plat::float16, - ops::ReshapeKernel, bool, ops::ReshapeKernel, - plat::complex, ops::ReshapeKernel, - plat::complex, ops::ReshapeKernel, - plat::bfloat16, ops::ReshapeKernel); -REGISTER_OP_CUDA_KERNEL_FUNCTOR( - reshape2_grad, float, ops::ReshapeGradKernel, double, - ops::ReshapeGradKernel, int, ops::ReshapeGradKernel, uint8_t, - ops::ReshapeGradKernel, int64_t, ops::ReshapeGradKernel, plat::float16, - ops::ReshapeGradKernel, bool, ops::ReshapeGradKernel, plat::complex, - ops::ReshapeGradKernel, plat::complex, ops::ReshapeGradKernel, - plat::bfloat16, ops::ReshapeGradKernel); - -REGISTER_OP_CUDA_KERNEL_FUNCTOR( - reshape2_grad_grad, float, ops::ReshapeDoubleGradKernel, double, - ops::ReshapeDoubleGradKernel, int, ops::ReshapeDoubleGradKernel, uint8_t, - ops::ReshapeDoubleGradKernel, int64_t, ops::ReshapeDoubleGradKernel, - plat::float16, ops::ReshapeDoubleGradKernel, bool, - ops::ReshapeDoubleGradKernel, plat::complex, - ops::ReshapeDoubleGradKernel, plat::complex, - ops::ReshapeDoubleGradKernel, plat::bfloat16, ops::ReshapeDoubleGradKernel); -#endif - -#ifdef PADDLE_WITH_XPU -REGISTER_OP_XPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double, - ops::ReshapeKernel, int, ops::ReshapeKernel, - int64_t, ops::ReshapeKernel, plat::float16, - ops::ReshapeKernel, bool, ops::ReshapeKernel, - plat::complex, ops::ReshapeKernel, - plat::complex, ops::ReshapeKernel); -REGISTER_OP_XPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel, - double, ops::ReshapeGradKernel, int, - ops::ReshapeGradKernel, int64_t, - ops::ReshapeGradKernel, plat::float16, - ops::ReshapeGradKernel, bool, - ops::ReshapeGradKernel, plat::complex, - ops::ReshapeGradKernel, plat::complex, - ops::ReshapeGradKernel); #endif diff --git a/paddle/fluid/operators/spectral_op.cu b/paddle/fluid/operators/spectral_op.cu index c932834db39b30c50746aeee80fcd32b5090f58f..77703637db5cd7d34c865083bd765e1122b7fefb 100644 --- a/paddle/fluid/operators/spectral_op.cu +++ b/paddle/fluid/operators/spectral_op.cu @@ -20,11 +20,11 @@ #include #include 
"paddle/fluid/operators/conj_op.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/operators/spectral_helper.h" #include "paddle/fluid/operators/spectral_op.h" #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" namespace paddle { namespace operators { @@ -115,8 +115,8 @@ void exec_cufft_plan(const DeviceContext& ctx, const FFTConfig& config, framework::Tensor input_conj(input->type()); input_conj.mutable_data(input->dims(), ctx.GetPlace()); platform::ForRange for_range(ctx, input->numel()); - math::ConjFunctor functor(input->data(), input->numel(), - input_conj.data()); + pten::funcs::ConjFunctor functor(input->data(), input->numel(), + input_conj.data()); for_range(functor); exec_cufft_plan_raw(config, input_conj.data(), output->data(), forward); } else if (fft_type == FFTTransformType::R2C && !forward) { @@ -126,8 +126,8 @@ void exec_cufft_plan(const DeviceContext& ctx, const FFTConfig& config, exec_cufft_plan_raw(config, input->data(), out_conj.data(), forward); platform::ForRange for_range(ctx, output->numel()); - math::ConjFunctor functor(out_conj.data(), output->numel(), - output->data()); + pten::funcs::ConjFunctor functor(out_conj.data(), output->numel(), + output->data()); for_range(functor); } else { exec_cufft_plan_raw(config, input->data(), output->data(), forward); @@ -227,8 +227,8 @@ void exec_hipfft_plan(const DeviceContext& ctx, const FFTConfig& config, framework::Tensor input_conj(input->type()); input_conj.mutable_data(input->dims(), ctx.GetPlace()); platform::ForRange for_range(ctx, input->numel()); - math::ConjFunctor functor(input->data(), input->numel(), - input_conj.data()); + pten::funcs::ConjFunctor functor(input->data(), input->numel(), + input_conj.data()); for_range(functor); exec_hipfft_plan_raw(config, input_conj.data(), output->data(), forward); } else if (fft_type == FFTTransformType::R2C && !forward) { @@ -238,8 +238,8 @@ void exec_hipfft_plan(const DeviceContext& ctx, const FFTConfig& config, exec_hipfft_plan_raw(config, input->data(), out_conj.data(), forward); platform::ForRange for_range(ctx, output->numel()); - math::ConjFunctor functor(out_conj.data(), output->numel(), - output->data()); + pten::funcs::ConjFunctor functor(out_conj.data(), output->numel(), + output->data()); for_range(functor); } else { exec_hipfft_plan_raw(config, input->data(), output->data(), forward); diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h index 3a57a7b3e54cc3313654d20256b888efdb4baf5a..4384e7152fa4e56554a3effd7e82b56b03a1c585 100644 --- a/paddle/fluid/operators/svd_helper.h +++ b/paddle/fluid/operators/svd_helper.h @@ -25,9 +25,9 @@ #include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" #include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { @@ -105,7 +105,8 @@ struct RealMulComplexFunctor { "The image part of y must to be 0" "but got [%d]", y.imag)); - return platform::complex>(x.real * y.real, x.imag * y.real); + return platform::complex>(x.real * y.real, + x.imag * y.real); } }; @@ -390,11 +391,11 @@ struct DeviceIndependenceTensorOperations { // batch_diag for 
CPU only Tensor BatchDiag(const Tensor& x, int batch) { Tensor out; - auto* x_data = x.data>(); + auto* x_data = x.data>(); auto numel = x.numel(); - auto* out_data = out.mutable_data>( + auto* out_data = out.mutable_data>( x.dims(), context.GetPlace(), - static_cast(numel * sizeof(math::Real))); + static_cast(numel * sizeof(pten::funcs::Real))); auto x_dims = x.dims(); int num_dims = x_dims.size(); @@ -654,7 +655,7 @@ struct DeviceIndependenceTensorOperations { auto* out_data = out.mutable_data(x.dims(), context.GetPlace()); auto* x_data = x.data(); auto for_range = GetForRange(x.numel()); - math::ConjFunctor functor(x_data, x.numel(), out_data); + pten::funcs::ConjFunctor functor(x_data, x.numel(), out_data); for_range(functor); return out; } @@ -662,12 +663,12 @@ struct DeviceIndependenceTensorOperations { Tensor Real(const Tensor& x) { Tensor out; auto numel = x.numel(); - auto* out_data = out.mutable_data>( + auto* out_data = out.mutable_data>( x.dims(), context.GetPlace(), - static_cast(numel * sizeof(math::Real))); + static_cast(numel * sizeof(pten::funcs::Real))); auto* x_data = x.data(); auto for_range = GetForRange(numel); - math::RealFunctor functor(x_data, out_data, numel); + pten::funcs::RealFunctor functor(x_data, out_data, numel); for_range(functor); return out; } diff --git a/paddle/fluid/operators/svd_op.h b/paddle/fluid/operators/svd_op.h index f387dca7b7f9b2c4e741d8f495a58b05a46c6c6f..4042fcccf33090e11f14ec0effc1e5b9ddd95258 100644 --- a/paddle/fluid/operators/svd_op.h +++ b/paddle/fluid/operators/svd_op.h @@ -17,9 +17,9 @@ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" namespace paddle { namespace operators { @@ -46,14 +46,14 @@ class SvdCPUKernel : public framework::OpKernel { int col_u = full ? rows : k; int col_v = full ? cols : k; int batches = numel / (rows * cols); - auto* U_out = U->mutable_data>( + auto* U_out = U->mutable_data>( context.GetPlace(), - size_t(batches * rows * col_u * sizeof(math::Real))); - auto* VH_out = VH->mutable_data>( + size_t(batches * rows * col_u * sizeof(pten::funcs::Real))); + auto* VH_out = VH->mutable_data>( context.GetPlace(), - size_t(batches * col_v * cols * sizeof(math::Real))); - auto* S_out = S->mutable_data>( - context.GetPlace(), size_t(batches * k * sizeof(math::Real))); + size_t(batches * col_v * cols * sizeof(pten::funcs::Real))); + auto* S_out = S->mutable_data>( + context.GetPlace(), size_t(batches * k * sizeof(pten::funcs::Real))); /*SVD Use the Eigen Library*/ math::BatchSvd(x_data, U_out, VH_out, S_out, rows, cols, batches, full); } diff --git a/paddle/fluid/operators/test_common_infer_shape_functions.cc b/paddle/fluid/operators/test_common_infer_shape_functions.cc index 60eeb66ae7d1eca6e093432bfdc4e5f12f47f2e9..29ba5bcc1b5bb27528ee01bbf85208978cb4f97c 100644 --- a/paddle/fluid/operators/test_common_infer_shape_functions.cc +++ b/paddle/fluid/operators/test_common_infer_shape_functions.cc @@ -21,7 +21,7 @@ limitations under the License. 
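// Editorial note (generic sketch, not Paddle's definitions): the qr/svd/real/
// spectral hunks above only change where the complex functors live
// (paddle/fluid/operators/math -> paddle/pten/kernels/funcs), so math::Real,
// math::RealFunctor and math::ConjFunctor become pten::funcs::*. Roughly,
// Real<T> maps a complex element type to its underlying real type and
// RealFunctor copies real parts element-wise; a minimal standalone analogue
// using std::complex:
#include <complex>
#include <cstdint>

template <typename T>
struct RealTrait { using type = T; };                    // real types map to themselves
template <typename T>
struct RealTrait<std::complex<T>> { using type = T; };   // complex<T> -> T
template <typename T>
using Real = typename RealTrait<T>::type;

template <typename T>
struct RealFunctor {
  RealFunctor(const T* in, Real<T>* out, int64_t numel)
      : in_(in), out_(out), numel_(numel) {}
  // intended to be driven by a ForRange-style loop over [0, numel)
  void operator()(int64_t idx) const { out_[idx] = std::real(in_[idx]); }
  const T* in_;
  Real<T>* out_;
  int64_t numel_;
};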
*/ #include "paddle/fluid/operators/common_infer_shape_functions.h" USE_OP(relu); -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); USE_OP(softmax); namespace paddle { diff --git a/paddle/fluid/operators/triangular_solve_op.h b/paddle/fluid/operators/triangular_solve_op.h index f64b016366e39b2260f4f8aebbb2e371ee2a8a7a..e892d258f3b126c0f6532f215e411837a415ee27 100644 --- a/paddle/fluid/operators/triangular_solve_op.h +++ b/paddle/fluid/operators/triangular_solve_op.h @@ -19,10 +19,10 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.h" #include "paddle/fluid/operators/solve_op.h" #include "paddle/fluid/operators/tril_triu_op.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" namespace paddle { namespace operators { @@ -152,7 +152,7 @@ class TriangularSolveGradKernel : public framework::OpKernel { // calculate x's conjugate for complex Tensor x_conj(x->type()); platform::ForRange x_for_range(dev_ctx, x->numel()); - math::ConjFunctor x_functor( + pten::funcs::ConjFunctor x_functor( x->data(), x->numel(), x_conj.mutable_data(x->dims(), dev_ctx.GetPlace())); x_for_range(x_functor); @@ -179,7 +179,7 @@ class TriangularSolveGradKernel : public framework::OpKernel { // calculate out's conjugate for complex Tensor out_conj(out->type()); platform::ForRange out_for_range(dev_ctx, out->numel()); - math::ConjFunctor out_functor( + pten::funcs::ConjFunctor out_functor( out->data(), out->numel(), out_conj.mutable_data(out->dims(), dev_ctx.GetPlace())); out_for_range(out_functor); diff --git a/paddle/fluid/platform/device/CMakeLists.txt b/paddle/fluid/platform/device/CMakeLists.txt index 2cd068badf51e8a3176de4ec80700ce7057862d1..ecad5340d71c1ae32339ab1c79bf37d947402747 100644 --- a/paddle/fluid/platform/device/CMakeLists.txt +++ b/paddle/fluid/platform/device/CMakeLists.txt @@ -1,3 +1,18 @@ +IF(WITH_CUSTOM_DEVICE) +cc_library(callback_manager SRCS callback_manager.cc DEPS enforce place) + +cc_library(device_guard SRCS device_guard.cc DEPS enforce place) + +cc_library(stream SRCS stream.cc DEPS callback_manager) + +cc_library(event SRCS event.cc DEPS enforce place) + +cc_library(device_base SRCS device_base.cc DEPS stream event callback_manager device_guard device_context flags) + +ENDIF() + +set(DEV_LIBS custom_device) + # GPU IF(WITH_GPU OR WITH_ROCM) add_subdirectory(gpu) @@ -22,3 +37,11 @@ ENDIF() IF(WITH_MLU) add_subdirectory(mlu) ENDIF() + +# CUSTOM +IF(WITH_CUSTOM_DEVICE) + add_subdirectory(custom) + + cc_library(device_manager SRCS device_manager.cc DEPS custom_device) + set(GLOB_DEV_LIB device_manager custom_device CACHE INTERNAL "Global DEV library") +ENDIF() diff --git a/paddle/fluid/platform/device/callback_manager.cc b/paddle/fluid/platform/device/callback_manager.cc new file mode 100644 index 0000000000000000000000000000000000000000..c677bc0262f0cfba0a5995afbde9e04f4bb0337e --- /dev/null +++ b/paddle/fluid/platform/device/callback_manager.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/device/callback_manager.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { + +CallbackManager::CallbackManager(stream::Stream *stream) + : stream_(stream), thread_pool_(1) {} + +void CallbackManager::AddCallback(std::function callback) const { + auto *callback_func = new std::function(std::move(callback)); + auto *func = new std::function([this, callback_func] { + std::lock_guard lock(mtx_); + last_future_ = thread_pool_.enqueue([callback_func] { + std::unique_ptr> releaser(callback_func); + (*callback_func)(); + }); + }); + + platform::DeviceManager::GetDeviceWithPlace(stream_->GetPlace()) + ->AddCallback(stream_, func); +} + +void CallbackManager::Wait() const { + platform::DeviceManager::GetDeviceWithPlace(stream_->GetPlace()) + ->SynchronizeStream(stream_); + + { + std::lock_guard lock(mtx_); + if (last_future_.valid()) { + last_future_.wait(); + } + } +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/callback_manager.h b/paddle/fluid/platform/device/callback_manager.h new file mode 100644 index 0000000000000000000000000000000000000000..0edc694c94bb7846ac6081bccc0dc7fecd61adcb --- /dev/null +++ b/paddle/fluid/platform/device/callback_manager.h @@ -0,0 +1,62 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
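// Editorial note (pattern sketch, independent of Paddle's ThreadPool): the
// CallbackManager implementation above keeps ordering by pushing every host
// callback onto a single worker thread and remembering the future of the most
// recently enqueued task; Wait() synchronizes the stream first and then blocks
// on that last future. A minimal standalone version of the "serialize and wait
// for the tail" idea, with illustrative names:
#include <condition_variable>
#include <functional>
#include <future>
#include <memory>
#include <mutex>
#include <queue>
#include <thread>

class SerialCallbackQueue {
 public:
  SerialCallbackQueue() : worker_([this] { Run(); }) {}

  ~SerialCallbackQueue() {
    {
      std::lock_guard<std::mutex> lock(mtx_);
      stop_ = true;
    }
    cv_.notify_one();
    worker_.join();
  }

  // Enqueue a callback; it runs after every previously enqueued callback.
  std::shared_future<void> Add(std::function<void()> fn) {
    auto task = std::make_shared<std::packaged_task<void()>>(std::move(fn));
    std::shared_future<void> done = task->get_future().share();
    {
      std::lock_guard<std::mutex> lock(mtx_);
      tasks_.push([task] { (*task)(); });
      last_ = done;  // remember the tail of the queue
    }
    cv_.notify_one();
    return done;
  }

  // Block until every callback enqueued so far has finished.
  void WaitAll() {
    std::shared_future<void> tail;
    {
      std::lock_guard<std::mutex> lock(mtx_);
      tail = last_;
    }
    if (tail.valid()) tail.wait();
  }

 private:
  void Run() {
    for (;;) {
      std::function<void()> task;
      {
        std::unique_lock<std::mutex> lock(mtx_);
        cv_.wait(lock, [this] { return stop_ || !tasks_.empty(); });
        if (stop_ && tasks_.empty()) return;
        task = std::move(tasks_.front());
        tasks_.pop();
      }
      task();
    }
  }

  std::mutex mtx_;
  std::condition_variable cv_;
  std::queue<std::function<void()>> tasks_;
  std::shared_future<void> last_;
  bool stop_ = false;
  std::thread worker_;  // declared last so it starts after the other members
};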
+ +#pragma once + +#include + +#ifdef PADDLE_WITH_CUDA +#include +#include +#endif + +#ifdef PADDLE_WITH_HIP +#include +#endif + +#include +#include // NOLINT +#include +#include // NOLINT + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { + +namespace stream { +class Stream; +} // namespace stream + +// NOTE(zjl): clean CallbackManager to make compilation faster +// Make CallbackManager thread-safe +class CallbackManager { + public: + explicit CallbackManager(stream::Stream* stream); + + ~CallbackManager() = default; + + void AddCallback(std::function callback) const; + + void Wait() const; + + private: + stream::Stream* stream_; + mutable ::ThreadPool thread_pool_; + mutable std::mutex mtx_; + mutable std::future last_future_; +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/custom/CMakeLists.txt b/paddle/fluid/platform/device/custom/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..f39c60c0c68edcdaca4bd4a0b25a9ec07453280e --- /dev/null +++ b/paddle/fluid/platform/device/custom/CMakeLists.txt @@ -0,0 +1,4 @@ +IF(WITH_CUSTOM_DEVICE) +cc_library(custom_device SRCS custom_device.cc DEPS device_base device_context) +cc_test(custom_device_test SRCS custom_device_test.cc DEPS device_manager device_context ) +ENDIF() diff --git a/paddle/fluid/platform/device/custom/custom_device.cc b/paddle/fluid/platform/device/custom/custom_device.cc new file mode 100644 index 0000000000000000000000000000000000000000..c5b98d3e2289588144e864bcbaed98f345bfad3c --- /dev/null +++ b/paddle/fluid/platform/device/custom/custom_device.cc @@ -0,0 +1,672 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
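// Editorial note (design sketch with illustrative types): custom_device.cc,
// which begins below, adapts the plugin's plain C function-pointer table
// (C_DeviceInterface) to the C++ DeviceInterface virtual API; calls check
// whether the plugin supplied an optional entry point and otherwise fall back
// or report the feature as unavailable. The adapter pattern in miniature --
// CFunctionTable and DeviceAdapter are placeholders, not Paddle types:
#include <memory>
#include <stdexcept>
#include <utility>

struct CFunctionTable {                          // stand-in for the plugin's C table
  int (*get_device_count)(int* count);           // required entry point
  int (*synchronize_device)(int device_id);      // optional, may be null
};

class DeviceAdapter {                            // stand-in for the C++ interface
 public:
  explicit DeviceAdapter(std::unique_ptr<CFunctionTable> table)
      : table_(std::move(table)) {}

  int GetDeviceCount() const {
    int count = 0;
    if (table_->get_device_count(&count) != 0) count = 0;
    return count;
  }

  void SynchronizeDevice(int device_id) const {
    if (table_->synchronize_device == nullptr) {
      throw std::runtime_error("synchronize_device not provided by plugin");
    }
    if (table_->synchronize_device(device_id) != 0) {
      throw std::runtime_error("synchronize_device failed");
    }
  }

 private:
  std::unique_ptr<CFunctionTable> table_;
};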
+ +#include "paddle/fluid/platform/device/device_base.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/device/event.h" +#include "paddle/fluid/platform/device/stream.h" +#include "paddle/fluid/platform/device_context.h" + +static bool operator==(const C_Device_st& d1, const C_Device_st& d2) { + return d1.id == d2.id; +} + +namespace paddle { +namespace platform { + +class CustomDevice : public DeviceInterface { + public: + CustomDevice(const std::string& type, int priority, bool is_custom, + std::unique_ptr pimpl, void* dso_handle) + : DeviceInterface(type, priority, is_custom), + pimpl_(std::move(pimpl)), + dso_handle_(dso_handle) { + Initialize(); + } + + ~CustomDevice() override { Finalize(); } + + size_t GetDeviceCount() override { + size_t count; + if (pimpl_->get_device_count(&count) != C_SUCCESS) { + count = 0; + } + return count; + } + + std::vector GetDeviceList() override { + size_t count = GetDeviceCount(); + std::vector devices(count); + pimpl_->get_device_list(devices.data()); + return devices; + } + + C_DeviceInterface* Impl() { return pimpl_.get(); } + + void SynchronizeDevice(size_t dev_id) override { + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->synchronize_device(device)); + } + + void Initialize() override { + if (pimpl_->initialize && pimpl_->initialize() != C_SUCCESS) { + LOG(ERROR) << "Initialize " << Type() << " Failed\n"; + exit(-1); + } + auto devices = GetDeviceList(); + for (auto dev_id : devices) { + C_Device_st device; + device.id = dev_id; + devices_pool[dev_id] = device; + InitDevice(dev_id); + } + } + + void Finalize() override { + auto devices = GetDeviceList(); + for (auto dev_id : devices) { + // SetDevice(dev_id); + // SynchronizeDevice(dev_id); + DeInitDevice(dev_id); + } + + bool ok = true; + if (pimpl_->finalize && pimpl_->finalize() != C_SUCCESS) { + LOG(ERROR) << "Finalize " << Type() << " Failed\n"; + ok = false; + } + if (dso_handle_) { + dlclose(dso_handle_); + dso_handle_ = nullptr; + } + if (!ok) { + exit(1); + } + } + + void InitDevice(size_t dev_id) override { + if (pimpl_->init_device) { + // Core set logical id, and Plugin replace it with physical id + const auto device = &devices_pool[dev_id]; + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->init_device(device)); + } + } + + void DeInitDevice(size_t dev_id) override { + if (pimpl_->deinit_device) { + const auto device = &devices_pool[dev_id]; + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->deinit_device(device)); + } + } + + void SetDevice(size_t dev_id) override { + const auto device = &devices_pool[dev_id]; + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->set_device(device)); + } + + int GetDevice() override { + C_Device_st device; + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->get_device(&device)); + return device.id; + } + + void CreateStream(size_t dev_id, stream::Stream* stream, + const stream::Stream::Priority& priority = + stream::Stream::Priority::kNormal, + const stream::Stream::Flag& flag = + stream::Stream::Flag::kDefaultFlag) override { + if (priority != stream::Stream::Priority::kNormal || + flag != stream::Stream::Flag::kDefaultFlag) { + PADDLE_THROW(platform::errors::Unavailable( + "priority != stream::Stream::Priority::kNormal || flag != " + "stream::Stream::Flag::kDefaultFlag is not allowed on " + "CustomDevice.")); + } + const auto device = &devices_pool[dev_id]; + C_Stream c_stream; + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->create_stream(device, &c_stream)); + 
stream->set_stream(c_stream); + } + + void DestroyStream(size_t dev_id, stream::Stream* stream) override { + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->destroy_stream( + device, reinterpret_cast(stream->raw_stream()))); + } + + void SynchronizeStream(size_t dev_id, const stream::Stream* stream) override { + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->synchronize_stream( + device, reinterpret_cast(stream->raw_stream()))); + } + + bool QueryStream(size_t dev_id, const stream::Stream* stream) override { + const auto device = &devices_pool[dev_id]; + + if (!pimpl_->query_stream) { + SynchronizeStream(dev_id, stream); + return true; + } + if (pimpl_->query_stream(device, reinterpret_cast( + stream->raw_stream())) == C_SUCCESS) { + return true; + } + return false; + } + + void AddCallback(size_t dev_id, stream::Stream* stream, + stream::Stream::Callback* callback) override { + if (!pimpl_->stream_add_callback) { + PADDLE_THROW(platform::errors::Unavailable( + "AddCallback is not supported on %s.", Type())); + } else { + const auto device = &devices_pool[dev_id]; + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->stream_add_callback( + device, reinterpret_cast(stream->raw_stream()), + [](C_Device device, C_Stream stream, void* user_data, + C_Status* status) { + std::unique_ptr> func( + reinterpret_cast*>(user_data)); + (*func)(); + }, + callback)); + } + } + + void CreateEvent(size_t dev_id, event::Event* event, + event::Event::Flag flags) override { + const auto device = &devices_pool[dev_id]; + C_Event c_event; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->create_event(device, &c_event)); + event->set_event(c_event); + } + + void DestroyEvent(size_t dev_id, event::Event* event) override { + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->destroy_event( + device, reinterpret_cast(event->raw_event()))); + } + + void RecordEvent(size_t dev_id, const event::Event* event, + const stream::Stream* stream) override { + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->record_event( + device, reinterpret_cast(stream->raw_stream()), + reinterpret_cast(event->raw_event()))); + } + + void SynchronizeEvent(size_t dev_id, const event::Event* event) override { + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->synchronize_event( + device, reinterpret_cast(event->raw_event()))); + } + + bool QueryEvent(size_t dev_id, const event::Event* event) override { + const auto device = &devices_pool[dev_id]; + + if (!pimpl_->query_event) { + SynchronizeEvent(dev_id, event); + return true; + } + if (pimpl_->query_event(device, reinterpret_cast( + event->raw_event())) == C_SUCCESS) { + return true; + } + return false; + } + + void StreamWaitEvent(size_t dev_id, const stream::Stream* stream, + const event::Event* event) override { + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->stream_wait_event( + device, reinterpret_cast(stream->raw_stream()), + reinterpret_cast(event->raw_event()))); + } + + void MemoryCopyH2D(size_t dev_id, void* dst, const void* src, size_t size, + const stream::Stream* stream = nullptr) override { + const auto device = &devices_pool[dev_id]; + auto place = platform::CustomPlace(Type(), dev_id); + + if (stream && stream->raw_stream() && pimpl_->async_memory_copy_h2d) { + C_Stream c_stream = reinterpret_cast(stream->raw_stream()); + 
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->async_memory_copy_h2d(device, c_stream, dst, src, size)); + } else { + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + pool.Get(place)->Wait(); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->memory_copy_h2d(device, dst, src, size)); + } + } + + void MemoryCopyD2H(size_t dev_id, void* dst, const void* src, size_t size, + const stream::Stream* stream = nullptr) override { + const auto device = &devices_pool[dev_id]; + auto place = platform::CustomPlace(Type(), dev_id); + + if (stream && stream->raw_stream() && pimpl_->async_memory_copy_d2h) { + C_Stream c_stream = reinterpret_cast(stream->raw_stream()); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->async_memory_copy_d2h(device, c_stream, dst, src, size)); + } else { + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + pool.Get(place)->Wait(); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->memory_copy_d2h(device, dst, src, size)); + } + } + + void MemoryCopyD2D(size_t dev_id, void* dst, const void* src, size_t size, + const stream::Stream* stream = nullptr) override { + const auto device = &devices_pool[dev_id]; + auto place = platform::CustomPlace(Type(), dev_id); + + if (stream && stream->raw_stream() && pimpl_->async_memory_copy_d2d) { + C_Stream c_stream = reinterpret_cast(stream->raw_stream()); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->async_memory_copy_d2d(device, c_stream, dst, src, size)); + } else { + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + pool.Get(place)->Wait(); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->memory_copy_d2d(device, dst, src, size)); + } + } + + void MemoryCopyP2P(const Place& dst_place, void* dst, size_t src_dev_id, + const void* src, size_t size, + const stream::Stream* stream = nullptr) override { + int dst_dev_id = PlaceToId(dst_place); + auto dst_device = &devices_pool[dst_dev_id]; + auto src_device = &devices_pool[src_dev_id]; + + if (stream && stream->raw_stream()) { + if (!pimpl_->async_memory_copy_p2p) { + MemoryCopyP2P(dst_place, dst, src_dev_id, src, size); + } else { + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->async_memory_copy_p2p( + dst_device, src_device, + reinterpret_cast(stream->raw_stream()), dst, src, size)); + } + } else { + if (!pimpl_->memory_copy_p2p) { + std::unique_ptr tmp(new uint8_t[size]); + MemoryCopyD2H(src_dev_id, tmp.get(), src, size); + MemoryCopyH2D(dst_dev_id, dst, tmp.get(), size); + } else { + auto src_place = platform::CustomPlace(Type(), src_dev_id); + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + pool.Get(src_place)->Wait(); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->memory_copy_p2p(dst_device, src_device, dst, src, size)); + } + } + } + + void* MemoryAllocate(size_t dev_id, size_t size) override { + void* ptr = nullptr; + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->device_memory_allocate(device, &ptr, size)); + return ptr; + } + + void MemoryDeallocate(size_t dev_id, void* ptr, size_t size) override { + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->device_memory_deallocate(device, ptr, size)); + } + + void* MemoryAllocateHost(size_t dev_id, size_t size) override { + void* ptr = nullptr; + const auto device = &devices_pool[dev_id]; + + if (!pimpl_->unified_memory_allocate) { + PADDLE_THROW(platform::errors::Unavailable( + "MemoryAllocKind::Host is not supported 
on %s.", Type())); + } else { + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->host_memory_allocate(device, &ptr, size)); + } + return ptr; + } + + void MemoryDeallocateHost(size_t dev_id, void* ptr, size_t size) override { + const auto device = &devices_pool[dev_id]; + + if (!pimpl_->host_memory_deallocate) { + PADDLE_THROW(platform::errors::Unavailable( + "MemoryAllocKind::Host is not supported on %s.", Type())); + } else { + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->host_memory_deallocate(device, ptr, size)); + } + } + + void* MemoryAllocateUnified(size_t dev_id, size_t size) override { + void* ptr = nullptr; + const auto device = &devices_pool[dev_id]; + + if (!pimpl_->unified_memory_allocate) { + PADDLE_THROW(platform::errors::Unavailable( + "MemoryAllocKind::Unified is not supported on %s.", Type())); + } else { + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->unified_memory_allocate(device, &ptr, size)); + } + return ptr; + } + + void MemoryDeallocateUnified(size_t dev_id, void* ptr, size_t size) override { + const auto device = &devices_pool[dev_id]; + + if (!pimpl_->unified_memory_deallocate) { + PADDLE_THROW(platform::errors::Unavailable( + "MemoryAllocKind::Host is not supported on %s.", Type())); + } else { + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->unified_memory_deallocate(device, ptr, size)); + } + } + + void MemorySet(size_t dev_id, void* ptr, uint8_t value, + size_t size) override { + const auto device = &devices_pool[dev_id]; + + if (pimpl_->device_memory_set) { + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->device_memory_set(device, ptr, value, size)); + } else { + std::unique_ptr tmp(new uint8_t[size]); + memset(tmp.get(), value, size); + MemoryCopyH2D(dev_id, ptr, tmp.get(), size); + } + } + + void MemoryStats(size_t dev_id, size_t* total, size_t* free) override { + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->device_memory_stats(device, total, free)); + + size_t used = *total - *free; + VLOG(10) << Type() << " memory usage " << (used >> 20) << "M/" + << (*total >> 20) << "M, " << (*free >> 20) + << "M available to allocate"; + } + + size_t GetMinChunkSize(size_t dev_id) override { + const auto device = &devices_pool[dev_id]; + + size_t size = 0; + pimpl_->device_min_chunk_size(device, &size); + VLOG(10) << Type() << " min chunk size " << size << "B"; + return size; + } + + size_t GetMaxChunkSize(size_t dev_id) override { + const auto device = &devices_pool[dev_id]; + + size_t size = 0; + if (pimpl_->device_max_chunk_size) { + pimpl_->device_max_chunk_size(device, &size); + VLOG(10) << Type() << " max chunk size " << size << "B"; + } else { + return DeviceInterface::GetMaxChunkSize(dev_id); + } + return size; + } + + size_t GetMaxAllocSize(size_t dev_id) override { + const auto device = &devices_pool[dev_id]; + + size_t size = 0; + if (pimpl_->device_max_alloc_size) { + pimpl_->device_max_alloc_size(device, &size); + VLOG(10) << Type() << " max alloc size " << (size >> 20) << "M"; + } else { + return DeviceInterface::GetMaxAllocSize(dev_id); + } + return size; + } + + size_t GetInitAllocSize(size_t dev_id) override { + const auto device = &devices_pool[dev_id]; + size_t size = 0; + if (pimpl_->device_init_alloc_size) { + pimpl_->device_init_alloc_size(device, &size); + VLOG(10) << Type() << " init alloc size " << (size >> 20) << "M"; + } else { + return DeviceInterface::GetInitAllocSize(dev_id); + } + return size; + } + + size_t GetReallocSize(size_t dev_id) override { + const auto device = 
&devices_pool[dev_id]; + size_t size = 0; + if (pimpl_->device_realloc_size) { + pimpl_->device_realloc_size(device, &size); + VLOG(10) << Type() << " realloc size " << (size >> 20) << "M"; + } else { + return DeviceInterface::GetReallocSize(dev_id); + } + return size; + } + + size_t GetExtraPaddingSize(size_t dev_id) override { + const auto device = &devices_pool[dev_id]; + + size_t padding_size = 0; + if (pimpl_->device_extra_padding_size) { + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->device_extra_padding_size(device, &padding_size)); + VLOG(10) << Type() << " extra padding size " << (padding_size >> 20) + << "M"; + } else { + return DeviceInterface::GetExtraPaddingSize(dev_id); + } + return 0; + } + + size_t GetComputeCapability() override { + size_t compute_capability = 0; + if (pimpl_->get_compute_capability) { + pimpl_->get_compute_capability(&compute_capability); + } + VLOG(10) << Type() << " get compute capability " << compute_capability; + return compute_capability; + } + + size_t GetRuntimeVersion() override { + size_t version = 0; + if (pimpl_->get_runtime_version) { + pimpl_->get_runtime_version(&version); + } + VLOG(10) << Type() << " get runtime version " << version; + return version; + } + + size_t GetDriverVersion() override { + size_t version = 0; + if (pimpl_->get_driver_version) { + pimpl_->get_driver_version(&version); + } + VLOG(10) << Type() << " get driver version " << version; + return version; + } + + private: + inline int PlaceToIdNoCheck(const Place& place) { + int dev_id = place.GetDeviceId(); + return dev_id; + } + + inline int PlaceToId(const Place& place) { + int dev_id = PlaceToIdNoCheck(place); + PADDLE_ENFORCE_NE(devices_pool.find(dev_id), devices_pool.end(), + platform::errors::NotFound( + "Cannot found %s %d, please check visible devices", + Type(), dev_id)); + return dev_id; + } + + std::unique_ptr pimpl_; + void* dso_handle_; + std::unordered_map devices_pool; +}; + +bool ValidCustomCustomRuntimeParams(const CustomRuntimeParams* params) { +#define CHECK_PTR(ptr, required) \ + if (params->interface->ptr == nullptr && required) { \ + LOG(WARNING) << "CustomRuntime [type: " << params->device_type \ + << "] pointer: " << #ptr << " is not set."; \ + return false; \ + } + + int version = params->version.major * 10000 + params->version.minor * 100 + + params->version.patch; + const int runtime_version = PADDLE_CUSTOM_RUNTIME_MAJOR_VERSION * 10000 + + PADDLE_CUSTOM_RUNTIME_MINOR_VERSION * 100 + + PADDLE_CUSTOM_RUNTIME_PATCH_VERSION; + + if (version < runtime_version) { + LOG(WARNING) << "CustomRuntime [type: " << params->device_type + << "] version: " << version + << " < PADDLE_CUSTOM_RUNTIME_VERSION " << runtime_version; + return false; + } + + CHECK_PTR(initialize, false); + CHECK_PTR(finalize, false) + + CHECK_PTR(init_device, false); + CHECK_PTR(set_device, true); + CHECK_PTR(get_device, true); + CHECK_PTR(deinit_device, false); + + CHECK_PTR(create_stream, true); + CHECK_PTR(destroy_stream, true); + CHECK_PTR(query_stream, false); + CHECK_PTR(stream_add_callback, false); + + CHECK_PTR(create_event, true); + CHECK_PTR(record_event, true); + CHECK_PTR(destroy_event, true); + CHECK_PTR(query_event, false); + + CHECK_PTR(synchronize_device, false); + CHECK_PTR(synchronize_stream, true); + CHECK_PTR(synchronize_event, true); + CHECK_PTR(stream_wait_event, true); + + CHECK_PTR(device_memory_allocate, true); + CHECK_PTR(device_memory_deallocate, true); + CHECK_PTR(host_memory_allocate, false); + CHECK_PTR(host_memory_deallocate, false); + 
CHECK_PTR(unified_memory_allocate, false); + CHECK_PTR(unified_memory_deallocate, false); + CHECK_PTR(memory_copy_h2d, true); + CHECK_PTR(memory_copy_d2h, true); + CHECK_PTR(memory_copy_d2d, true); + CHECK_PTR(memory_copy_p2p, false); + CHECK_PTR(async_memory_copy_h2d, false); + CHECK_PTR(async_memory_copy_d2h, false); + CHECK_PTR(async_memory_copy_d2d, false); + CHECK_PTR(async_memory_copy_p2p, false); + + CHECK_PTR(get_device_count, true); + CHECK_PTR(get_device_list, true); + CHECK_PTR(device_memory_stats, true); + + CHECK_PTR(device_min_chunk_size, true); + CHECK_PTR(device_max_chunk_size, false); + CHECK_PTR(device_max_alloc_size, false); + CHECK_PTR(device_extra_padding_size, false); + CHECK_PTR(get_compute_capability, false); + CHECK_PTR(get_runtime_version, false); + CHECK_PTR(get_driver_version, false); + + return true; +#undef CHECK_PTR +} + +typedef bool (*RegisterDevicePluginFn)(CustomRuntimeParams* runtime_params); + +bool LoadCustomRuntimeLib(const CustomRuntimeParams& runtime_params, + std::unique_ptr device_interface, + void* dso_handle) { + if (ValidCustomCustomRuntimeParams(&runtime_params)) { + auto device = + std::make_unique(runtime_params.device_type, 255, true, + std::move(device_interface), dso_handle); + if (false == DeviceManager::Register(std::move(device))) { + LOG(WARNING) << "Skip this library. Register failed!!! there may be a " + "Custom Runtime with the same name."; + return false; + } + } else { + LOG(WARNING) + << "Skip this library. Wrong parameters!!! please check the version " + "compatibility between PaddlePaddle and Custom Runtime."; + return false; + } + return true; +} + +bool LoadCustomRuntimeLib(void* dso_handle) { + CustomRuntimeParams runtime_params; + std::memset(&runtime_params, 0, sizeof(CustomRuntimeParams)); + runtime_params.size = sizeof(CustomRuntimeParams); + auto device_interface = std::make_unique(); + runtime_params.interface = device_interface.get(); + std::memset(runtime_params.interface, 0, sizeof(C_DeviceInterface)); + runtime_params.interface->size = sizeof(C_DeviceInterface); + + RegisterDevicePluginFn init_plugin_fn = + reinterpret_cast(dlsym(dso_handle, "InitPlugin")); + if (!init_plugin_fn) { + LOG(WARNING) << "Skip this library. InitPlugin symbol not found."; + return false; + } + init_plugin_fn(&runtime_params); + if (runtime_params.device_type == nullptr) { + LOG(WARNING) + << "Skip this library. InitPlugin failed!!! please check the version " + "compatibility between PaddlePaddle and Custom Runtime."; + return false; + } + return LoadCustomRuntimeLib(runtime_params, std::move(device_interface), + dso_handle); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/custom/custom_device_test.cc b/paddle/fluid/platform/device/custom/custom_device_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6a874ea221228ef016ad3bff60620f949582cf9e --- /dev/null +++ b/paddle/fluid/platform/device/custom/custom_device_test.cc @@ -0,0 +1,193 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
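// Editorial note (generic sketch of the loading pattern, with illustrative
// names): LoadCustomRuntimeLib above assumes the runtime library has already
// been dlopen'ed, resolves its "InitPlugin" entry point with dlsym, lets the
// plugin fill the parameter/function table, and registers the device only when
// validation succeeds. The standalone snippet below shows that dlopen/dlsym
// handshake in isolation; "libmy_runtime.so", PluginParams and PluginInitFn
// are placeholders, not Paddle symbols.
#include <dlfcn.h>
#include <cstdio>

struct PluginParams {                 // stand-in for the real parameter struct
  const char* device_type = nullptr;  // left null if the plugin refuses to init
};
using PluginInitFn = void (*)(PluginParams*);

bool LoadPlugin(const char* path) {
  void* handle = dlopen(path, RTLD_NOW | RTLD_LOCAL);
  if (handle == nullptr) {
    std::fprintf(stderr, "dlopen failed: %s\n", dlerror());
    return false;
  }
  auto init = reinterpret_cast<PluginInitFn>(dlsym(handle, "InitPlugin"));
  if (init == nullptr) {
    std::fprintf(stderr, "InitPlugin not found, skipping %s\n", path);
    dlclose(handle);
    return false;
  }
  PluginParams params;
  init(&params);
  if (params.device_type == nullptr) {
    dlclose(handle);
    return false;
  }
  std::printf("registered custom runtime: %s\n", params.device_type);
  return true;  // keep `handle` open for the lifetime of the runtime
}
// Usage would be e.g. LoadPlugin("libmy_runtime.so"); the path is illustrative.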
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/device/custom/fake_cpu_device.h" +#include "paddle/fluid/platform/device/device_manager.h" +#include "paddle/fluid/platform/device_context.h" + +void RegisterDevice() { + CustomRuntimeParams runtime_params; + runtime_params.size = sizeof(CustomRuntimeParams); + auto device_interface = std::make_unique(); + runtime_params.interface = device_interface.get(); + std::memset(runtime_params.interface, 0, sizeof(C_DeviceInterface)); + runtime_params.interface->size = sizeof(C_DeviceInterface); + + InitFakeCPUDevice(&runtime_params); + EXPECT_TRUE(paddle::platform::LoadCustomRuntimeLib( + runtime_params, std::move(device_interface), nullptr)); +} + +void InitDevice() { + RegisterDevice(); + EXPECT_GT(static_cast( + paddle::platform::DeviceManager::GetAllDeviceTypes().size()), + 0); + auto place = paddle::platform::CustomPlace(DEVICE_TYPE, 0); + auto device = paddle::platform::DeviceManager::GetDeviceWithPlace(place); + EXPECT_NE(device, nullptr); + + std::vector places; + auto device_types = paddle::platform::DeviceManager::GetAllDeviceTypes(); + for (auto dev_type : device_types) { + auto devices = paddle::platform::DeviceManager::GetDeviceList(dev_type); + for (auto dev_id : devices) { + places.push_back( + paddle::platform::PlaceHelper::CreatePlace(dev_type, dev_id)); + } + } + EXPECT_GT(static_cast(places.size()), 0); + + paddle::platform::DeviceContextPool::Init(places); +} + +void TestDeviceInterface(const paddle::platform::Place& place) { + std::cout << "TestDeviceInterface on " << place << std::endl; + if (paddle::platform::is_custom_place(place)) { + auto device = paddle::platform::DeviceManager::GetDeviceWithPlace(place); + auto dev_type = paddle::platform::PlaceHelper::GetDeviceType(place); + auto p1 = device->MemoryAllocate( + paddle::platform::DeviceManager::GetMinChunkSize(place)); + EXPECT_NE(p1, nullptr); + + paddle::platform::DeviceManager::SetDevice(place); + auto dev_id = paddle::platform::DeviceManager::GetDevice(dev_type); + EXPECT_EQ(dev_id, place.GetDeviceId()); + } +} + +void TestTensorMutableData(const paddle::platform::Place& place) { + std::cout << "TestTensorInitialization on " << place << std::endl; + paddle::framework::Tensor src_tensor; + float* p1 = nullptr; + float* p2 = nullptr; + // initialization + p1 = src_tensor.mutable_data(paddle::framework::make_ddim({1, 2, 3}), + place); + auto p1_holder = src_tensor.Holder(); + EXPECT_NE(p1, nullptr); + // set src_tensor a new dim with large size + // momery is supposed to be re-allocated + p2 = src_tensor.mutable_data(paddle::framework::make_ddim({3, 1024}), + place); + auto p2_holder = src_tensor.Holder(); + EXPECT_NE(p2, nullptr); + EXPECT_NE(p1_holder.get(), p2_holder.get()); + // set src_tensor a new dim with same size + // momery block is supposed to be unchanged + p1 = src_tensor.mutable_data(paddle::framework::make_ddim({2, 2, 3}), + place); + EXPECT_EQ(p1, p2); + // set src_tensor a new dim with smaller size + // momery block is supposed to be unchanged + p2 = src_tensor.mutable_data(paddle::framework::make_ddim({2, 2}), + place); + EXPECT_EQ(p1, p2); +} + +void TestTensorShareDataWith(const paddle::platform::Place& place) { + std::cout << "TestTensorShareDataWith on " << place << std::endl; + paddle::framework::Tensor src_tensor; + paddle::framework::Tensor 
dst_tensor; + src_tensor.mutable_data(paddle::framework::make_ddim({2, 3, 4}), place); + dst_tensor.ShareDataWith(src_tensor); + ASSERT_EQ(src_tensor.data(), dst_tensor.data()); +} + +void TestTensorUtils(const paddle::platform::Place& place) { + if (paddle::platform::is_custom_place(place) == false) { + return; + } + paddle::framework::Tensor src_tensor; + paddle::framework::Tensor gpu_tensor; + paddle::framework::Tensor dst_tensor; + + int* src_ptr = src_tensor.mutable_data( + paddle::framework::make_ddim({3, 3}), paddle::platform::CPUPlace()); + + int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + memcpy(src_ptr, arr, 9 * sizeof(int)); + + // CPU Tensor to GPU Tensor + paddle::platform::CustomDeviceContext gpu_ctx(place); + paddle::framework::TensorCopy(src_tensor, place, gpu_ctx, &gpu_tensor); +#if 0 + // GPU Tensor to CPU Tensor + auto cpu_place = new paddle::platform::CPUPlace(); + paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + + // Sync before Compare Tensors + gpu_ctx.Wait(); + const int* dst_ptr = dst_tensor.data(); + EXPECT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + // Copy the same tensor + paddle::framework::TensorCopy(gpu_tensor, place, gpu_ctx, &gpu_tensor); + gpu_ctx.Wait(); + const int* dst_ptr_tmp = dst_tensor.data(); + EXPECT_NE(src_ptr, dst_ptr_tmp); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr_tmp[i]); + } + + paddle::framework::Tensor slice_tensor = src_tensor.Slice(1, 2); + + // CPU Slice Tensor to GPU Tensor + paddle::framework::TensorCopy(slice_tensor, place, gpu_ctx, &gpu_tensor); + + // GPU Tensor to CPU Tensor + paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + + // Sync before Compare Slice Tensors + gpu_ctx.Wait(); + const int* slice_ptr = slice_tensor.data(); + dst_ptr = dst_tensor.data(); + EXPECT_NE(dst_ptr, slice_ptr); + for (size_t i = 0; i < 3; ++i) { + EXPECT_EQ(dst_ptr[i], slice_ptr[i]); + } + + EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); +#endif +} + +TEST(CustomDevice, Tensor) { + InitDevice(); + auto dev_types = paddle::platform::DeviceManager::GetAllDeviceTypes(); + for (const auto& dev_type : dev_types) { + std::cout << "Test on " << dev_type << std::endl; + EXPECT_GT(static_cast( + paddle::platform::DeviceManager::GetDeviceCount(dev_type)), + 0); + auto place = paddle::platform::PlaceHelper::CreatePlace(dev_type); + + TestDeviceInterface(place); + TestTensorMutableData(place); + TestTensorShareDataWith(place); + TestTensorUtils(place); + } +} + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/fluid/platform/device/custom/enforce_custom.h b/paddle/fluid/platform/device/custom/enforce_custom.h new file mode 100644 index 0000000000000000000000000000000000000000..fbdb4627aba2662a2a12cc933a3a4c6e61aa55d5 --- /dev/null +++ b/paddle/fluid/platform/device/custom/enforce_custom.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/fluid/platform/device/device_ext.h" + +namespace paddle { +namespace platform { +namespace details { +template +struct CustomDeviceStatusType {}; + +#define DEFINE_CUSTOM_DEVICE_STATUS_TYPE(type, success_value) \ + template <> \ + struct CustomDeviceStatusType { \ + using Type = type; \ + static constexpr Type kSuccess = success_value; \ + } + +DEFINE_CUSTOM_DEVICE_STATUS_TYPE(C_Status, C_SUCCESS); +} // namespace details + +inline std::string build_custom_device_error_msg(C_Status stat) { + std::ostringstream sout; + sout << " CustomDevice error, the error code is : " << stat << ". "; + return sout.str(); +} + +#define PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(COND) \ + do { \ + auto __cond__ = (COND); \ + using __CUSTOM_DEVICE_STATUS_TYPE__ = decltype(__cond__); \ + constexpr auto __success_type__ = \ + ::paddle::platform::details::CustomDeviceStatusType< \ + __CUSTOM_DEVICE_STATUS_TYPE__>::kSuccess; \ + if (UNLIKELY(__cond__ != __success_type__)) { \ + auto __summary__ = ::paddle::platform::errors::External( \ + ::paddle::platform::build_custom_device_error_msg(__cond__)); \ + __THROW_ERROR_INTERNAL__(__summary__); \ + } \ + } while (0) +} // namespace platform +} // namespace paddle +#endif // PADDLE_WITH_CUSTOM_DEVICE diff --git a/paddle/fluid/platform/device/custom/fake_cpu_device.h b/paddle/fluid/platform/device/custom/fake_cpu_device.h new file mode 100644 index 0000000000000000000000000000000000000000..c6d8ade4b08597b2c17e5df9dc333c3c4f70d69e --- /dev/null +++ b/paddle/fluid/platform/device/custom/fake_cpu_device.h @@ -0,0 +1,185 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "paddle/fluid/platform/device/device_ext.h" + +constexpr size_t global_total_memory = 1024 * 1024UL; +static size_t global_free_memory = global_total_memory; + +C_Status Init() { return C_SUCCESS; } + +C_Status InitDevice(const C_Device device) { return C_SUCCESS; } + +C_Status SetDevice(const C_Device device) { return C_SUCCESS; } + +C_Status GetDevice(const C_Device device) { + device->id = 0; + return C_SUCCESS; +} + +C_Status DestroyDevice(const C_Device device) { return C_SUCCESS; } + +C_Status Finalize() { return C_SUCCESS; } + +C_Status GetDevicesCount(size_t *count) { + *count = 1; + return C_SUCCESS; +} + +C_Status GetDevicesList(size_t *device) { + *device = 0; + return C_SUCCESS; +} + +C_Status MemCpy(const C_Device device, void *dst, const void *src, + size_t size) { + memcpy(dst, src, size); + return C_SUCCESS; +} + +C_Status AsyncMemCpy(const C_Device device, C_Stream stream, void *dst, + const void *src, size_t size) { + memcpy(dst, src, size); + return C_SUCCESS; +} + +C_Status Allocate(const C_Device device, void **ptr, size_t size) { + if (global_free_memory >= size) { + *ptr = malloc(size); + global_free_memory -= size; + return C_SUCCESS; + } else { + *ptr = nullptr; + return C_FAILED; + } +} + +C_Status Deallocate(const C_Device device, void *ptr, size_t size) { + free(ptr); + global_free_memory += size; + return C_SUCCESS; +} + +C_Status CreateStream(const C_Device device, C_Stream *stream) { + return C_SUCCESS; +} + +C_Status DestroyStream(const C_Device device, C_Stream stream) { + return C_SUCCESS; +} + +C_Status CreateEvent(const C_Device device, C_Event *event) { + return C_SUCCESS; +} + +C_Status RecordEvent(const C_Device device, C_Stream stream, C_Event event) { + return C_SUCCESS; +} + +C_Status DestroyEvent(const C_Device device, C_Event event) { + return C_SUCCESS; +} + +C_Status SyncDevice(const C_Device device) { return C_SUCCESS; } + +C_Status SyncStream(const C_Device device, C_Stream stream) { + return C_SUCCESS; +} + +C_Status SyncEvent(const C_Device device, C_Event event) { return C_SUCCESS; } + +C_Status StreamWaitEvent(const C_Device device, C_Stream stream, + C_Event event) { + return C_SUCCESS; +} + +C_Status VisibleDevices(size_t *devices) { return C_SUCCESS; } + +C_Status DeviceMemStats(const C_Device device, size_t *total_memory, + size_t *free_memory) { + *total_memory = global_total_memory; + *free_memory = global_free_memory; + return C_SUCCESS; +} + +C_Status DeviceMinChunkSize(const C_Device device, size_t *size) { + *size = 4 * 1024; + return C_SUCCESS; +} + +C_Status DeviceMaxChunkSize(const C_Device device, size_t *size) { + *size = 64 * 1024; + return C_SUCCESS; +} + +C_Status DeviceMaxAllocSize(const C_Device device, size_t *size) { + *size = global_total_memory * 0.95; + return C_SUCCESS; +} + +#define DEVICE_TYPE "FakeCPU" +#define SUB_DEVICE_TYPE "V100" + +void InitFakeCPUDevice(CustomRuntimeParams *params) { + params->device_type = const_cast(DEVICE_TYPE); + params->sub_device_type = const_cast(SUB_DEVICE_TYPE); + params->version.major = PADDLE_CUSTOM_RUNTIME_MAJOR_VERSION; + params->version.minor = PADDLE_CUSTOM_RUNTIME_MINOR_VERSION; + params->version.patch = PADDLE_CUSTOM_RUNTIME_PATCH_VERSION; + + memset(reinterpret_cast(params->interface), 0, + sizeof(C_DeviceInterface)); + + params->interface->initialize = Init; + params->interface->finalize = Finalize; + + params->interface->init_device = InitDevice; + params->interface->set_device = SetDevice; + params->interface->get_device = GetDevice; + 
params->interface->deinit_device = DestroyDevice; + + params->interface->create_stream = CreateStream; + params->interface->destroy_stream = DestroyStream; + + params->interface->create_event = CreateEvent; + params->interface->destroy_event = DestroyEvent; + params->interface->record_event = RecordEvent; + + params->interface->synchronize_device = SyncDevice; + params->interface->synchronize_stream = SyncStream; + params->interface->synchronize_event = SyncEvent; + params->interface->stream_wait_event = StreamWaitEvent; + + params->interface->memory_copy_h2d = MemCpy; + params->interface->memory_copy_d2d = MemCpy; + params->interface->memory_copy_d2h = MemCpy; + params->interface->async_memory_copy_h2d = AsyncMemCpy; + params->interface->async_memory_copy_d2d = AsyncMemCpy; + params->interface->async_memory_copy_d2h = AsyncMemCpy; + params->interface->device_memory_allocate = Allocate; + params->interface->host_memory_allocate = Allocate; + params->interface->unified_memory_allocate = Allocate; + params->interface->device_memory_deallocate = Deallocate; + params->interface->host_memory_deallocate = Deallocate; + params->interface->unified_memory_deallocate = Deallocate; + + params->interface->get_device_count = GetDevicesCount; + params->interface->get_device_list = GetDevicesList; + params->interface->device_memory_stats = DeviceMemStats; + + params->interface->device_max_chunk_size = DeviceMaxChunkSize; + params->interface->device_min_chunk_size = DeviceMinChunkSize; + params->interface->device_max_alloc_size = DeviceMaxAllocSize; +} diff --git a/paddle/fluid/platform/device/device_base.cc b/paddle/fluid/platform/device/device_base.cc new file mode 100644 index 0000000000000000000000000000000000000000..6234c9612687e507acd2642ef1d39cc0f8da4539 --- /dev/null +++ b/paddle/fluid/platform/device/device_base.cc @@ -0,0 +1,249 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
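For comparison with the fake CPU device above, here is a rough sketch of what an out-of-tree plugin's exported entry point could look like. "MyDevice", "gen1", and the commented-out callbacks are placeholders; a real plugin would wire up the full callback set exactly as InitFakeCPUDevice does.

```cpp
#include <cstring>

#include "paddle/fluid/platform/device/device_ext.h"

// Illustrative only: the runtime dlopen()s the plugin and calls InitPlugin,
// which fills in the version, the device type names, and the callback table.
extern "C" void InitPlugin(CustomRuntimeParams* params) {
  params->version.major = PADDLE_CUSTOM_RUNTIME_MAJOR_VERSION;
  params->version.minor = PADDLE_CUSTOM_RUNTIME_MINOR_VERSION;
  params->version.patch = PADDLE_CUSTOM_RUNTIME_PATCH_VERSION;

  params->device_type = const_cast<char*>("MyDevice");  // placeholder name
  params->sub_device_type = const_cast<char*>("gen1");  // placeholder name

  std::memset(params->interface, 0, sizeof(C_DeviceInterface));
  // A real plugin fills every callback it supports, for example:
  // params->interface->initialize             = Init;
  // params->interface->device_memory_allocate = Allocate;
  // params->interface->memory_copy_h2d        = MemCpy;
  // ... see InitFakeCPUDevice above for the complete set.
}
```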
+ +#include "paddle/fluid/platform/device/device_base.h" +#include "gflags/gflags.h" + +DECLARE_double(fraction_of_gpu_memory_to_use); +DECLARE_uint64(initial_gpu_memory_in_mb); +DECLARE_uint64(reallocate_gpu_memory_in_mb); + +constexpr static float fraction_reserve_gpu_memory = 0.05f; + +namespace paddle { +namespace platform { + +#define INTERFACE_UNIMPLEMENT \ + PADDLE_THROW(platform::errors::Unimplemented( \ + "%s is not implemented on %s device.", __func__, Type())); + +// info +size_t DeviceInterface::GetComputeCapability() { + VLOG(10) << Type() + " get compute capability " << 0; + return 0; +} + +size_t DeviceInterface::GetRuntimeVersion() { + VLOG(10) << Type() + " get runtime version " << 0; + return 0; +} + +size_t DeviceInterface::GetDriverVersion() { + VLOG(10) << Type() + " get driver version " << 0; + return 0; +} + +// device manage +void DeviceInterface::Initialize() { INTERFACE_UNIMPLEMENT; } + +void DeviceInterface::Finalize() { INTERFACE_UNIMPLEMENT; } + +void DeviceInterface::SynchronizeDevice(size_t dev_id) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::InitDevice(size_t dev_id) { INTERFACE_UNIMPLEMENT; } + +void DeviceInterface::DeInitDevice(size_t dev_id) { INTERFACE_UNIMPLEMENT; } + +void DeviceInterface::SetDevice(size_t dev_id) { INTERFACE_UNIMPLEMENT; } + +int DeviceInterface::GetDevice() { INTERFACE_UNIMPLEMENT; } + +// stream manage +void DeviceInterface::CreateStream(size_t dev_id, stream::Stream* stream, + const stream::Stream::Priority& priority, + const stream::Stream::Flag& flag) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::DestroyStream(size_t dev_id, stream::Stream* stream) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::SynchronizeStream(size_t dev_id, + const stream::Stream* stream) { + INTERFACE_UNIMPLEMENT; +} + +bool DeviceInterface::QueryStream(size_t dev_id, const stream::Stream* stream) { + INTERFACE_UNIMPLEMENT; + return true; +} + +void DeviceInterface::AddCallback(size_t dev_id, stream::Stream* stream, + stream::Stream::Callback* callback) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::StreamWaitEvent(size_t dev_id, + const stream::Stream* stream, + const event::Event* event) { + INTERFACE_UNIMPLEMENT; +} + +// event manage +void DeviceInterface::CreateEvent(size_t dev_id, event::Event* event, + event::Event::Flag flags) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::DestroyEvent(size_t dev_id, event::Event* event) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::RecordEvent(size_t dev_id, const event::Event* event, + const stream::Stream* stream) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::SynchronizeEvent(size_t dev_id, + const event::Event* event) { + INTERFACE_UNIMPLEMENT; +} + +bool DeviceInterface::QueryEvent(size_t dev_id, const event::Event* event) { + INTERFACE_UNIMPLEMENT; + return true; +} + +// memery manage +void DeviceInterface::MemoryCopyH2D(size_t dev_id, void* dst, const void* src, + size_t size, const stream::Stream* stream) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::MemoryCopyD2H(size_t dev_id, void* dst, const void* src, + size_t size, const stream::Stream* stream) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::MemoryCopyD2D(size_t dev_id, void* dst, const void* src, + size_t size, const stream::Stream* stream) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::MemoryCopyP2P(const Place& dst_place, void* dst, + size_t src_id, const void* src, size_t size, + const stream::Stream* stream) { + INTERFACE_UNIMPLEMENT; +} + +void* 
DeviceInterface::MemoryAllocate(size_t dev_id, size_t size) { + INTERFACE_UNIMPLEMENT; + return nullptr; +} + +void DeviceInterface::MemoryDeallocate(size_t dev_id, void* ptr, size_t size) { + INTERFACE_UNIMPLEMENT; +} + +void* DeviceInterface::MemoryAllocateHost(size_t dev_id, size_t size) { + INTERFACE_UNIMPLEMENT; + return nullptr; +} + +void DeviceInterface::MemoryDeallocateHost(size_t dev_id, void* ptr, + size_t size) { + INTERFACE_UNIMPLEMENT; +} + +void* DeviceInterface::MemoryAllocateUnified(size_t dev_id, size_t size) { + INTERFACE_UNIMPLEMENT; + return nullptr; +} + +void DeviceInterface::MemoryDeallocateUnified(size_t dev_id, void* ptr, + size_t size) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::MemorySet(size_t dev_id, void* ptr, uint8_t value, + size_t size) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::MemoryStats(size_t dev_id, size_t* total, size_t* free) { + INTERFACE_UNIMPLEMENT; +} + +size_t DeviceInterface::GetMinChunkSize(size_t dev_id) { + INTERFACE_UNIMPLEMENT; +} + +size_t DeviceInterface::AllocSize(size_t dev_id, bool realloc) { + size_t available_to_alloc = AvailableAllocSize(dev_id); + PADDLE_ENFORCE_GT(available_to_alloc, 0, + platform::errors::ResourceExhausted( + "Not enough available %s memory.", Type())); + // If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be + // allocated by fraction + size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb + : FLAGS_initial_gpu_memory_in_mb; + size_t alloc_bytes = + (flag_mb > 0ul ? flag_mb << 20 : available_to_alloc * + FLAGS_fraction_of_gpu_memory_to_use); + PADDLE_ENFORCE_GE(available_to_alloc, alloc_bytes, + platform::errors::ResourceExhausted( + "Not enough available %s memory.", Type())); + return alloc_bytes; +} + +size_t DeviceInterface::AvailableAllocSize(size_t dev_id) { + size_t total = 0; + size_t available = 0; + MemoryStats(dev_id, &total, &available); + size_t reserving = + static_cast(fraction_reserve_gpu_memory * available); + // If available size is less than minimum chunk size, no usable memory exists + size_t available_to_alloc = available - reserving; + size_t min_chunk_size = GetMinChunkSize(dev_id); + if (available_to_alloc < min_chunk_size) { + available_to_alloc = 0; + } + return available_to_alloc; +} + +size_t DeviceInterface::GetInitAllocSize(size_t dev_id) { + size_t init_alloc_size = AllocSize(dev_id, false); + VLOG(10) << Type() + " init alloc size " << (init_alloc_size >> 20) << "M"; + return init_alloc_size; +} + +size_t DeviceInterface::GetReallocSize(size_t dev_id) { + size_t realloc_size = AllocSize(dev_id, true); + VLOG(10) << Type() + " realloc size " << (realloc_size >> 20) << "M"; + return realloc_size; +} + +size_t DeviceInterface::GetMaxAllocSize(size_t dev_id) { + size_t max_alloc_size = + std::max(GetInitAllocSize(dev_id), GetReallocSize(dev_id)); + VLOG(10) << Type() + " max alloc size " << (max_alloc_size >> 20) << "M"; + return max_alloc_size; +} + +size_t DeviceInterface::GetMaxChunkSize(size_t dev_id) { + size_t max_chunk_size = GetMaxAllocSize(dev_id); + VLOG(10) << Type() + " max chunk size " << (max_chunk_size >> 20) << "M"; + return max_chunk_size; +} + +size_t DeviceInterface::GetExtraPaddingSize(size_t dev_id) { + VLOG(10) << Type() + " extra padding size " << 0; + return 0; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/device_base.h b/paddle/fluid/platform/device/device_base.h new file mode 100644 index 
0000000000000000000000000000000000000000..d70b02be80eacd9d492b8a8d40c0a074dfe9c6e3 --- /dev/null +++ b/paddle/fluid/platform/device/device_base.h @@ -0,0 +1,166 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/fluid/platform/device/event.h" +#include "paddle/fluid/platform/device/stream.h" + +namespace paddle { +namespace platform { + +class DeviceInterface { // Driver / Runtime + public: + DeviceInterface(const std::string& type, uint8_t priority, bool is_custom) + : type_(type), priority_(priority), is_custom_(is_custom) {} + uint8_t Priority() { return priority_; } + std::string Type() { return type_; } + bool IsCustom() { return is_custom_; } + + virtual ~DeviceInterface() {} + + // Info + virtual size_t GetComputeCapability(); + + virtual size_t GetRuntimeVersion(); + + virtual size_t GetDriverVersion(); + + // Platform + //! Initialize + virtual void Initialize(); + + //! Finalize + virtual void Finalize(); + + // Device + virtual size_t GetDeviceCount() = 0; + virtual std::vector GetDeviceList() = 0; + + //! Wait for compute device to finish. + virtual void SynchronizeDevice(size_t dev_id); + + //! Initialize device. + virtual void InitDevice(size_t dev_id); + + //! Deinitialize device. + virtual void DeInitDevice(size_t dev_id); + + // ! Set device to be used. + virtual void SetDevice(size_t dev_id); + + // ! Returns which device is currently being used. + virtual int GetDevice(); + + // Stream + // ! Create an asynchronous stream + virtual void CreateStream( + size_t dev_id, stream::Stream* stream, + const stream::Stream::Priority& priority = + stream::Stream::Priority::kNormal, + const stream::Stream::Flag& flag = stream::Stream::Flag::kDefaultFlag); + + // ! Destroys an asynchronous stream. + virtual void DestroyStream(size_t dev_id, stream::Stream* stream); + + // ! Waits for stream tasks to complete. + virtual void SynchronizeStream(size_t dev_id, const stream::Stream* stream); + + // ! Queries an asynchronous stream for completion status. + virtual bool QueryStream(size_t dev_id, const stream::Stream* stream); + + // ! Add a callback to a compute stream. + virtual void AddCallback(size_t dev_id, stream::Stream* stream, + stream::Stream::Callback* callback); + + // Event + // ! Create an event. + virtual void CreateEvent(size_t dev_id, event::Event* event, + event::Event::Flag flags); + + // ! Destroy an event. + virtual void DestroyEvent(size_t dev_id, event::Event* event); + + // ! Records an event. + virtual void RecordEvent(size_t dev_id, const event::Event* event, + const stream::Stream* stream); + + // ! Waits for event to complete. + virtual void SynchronizeEvent(size_t dev_id, const event::Event* event); + // ! Queries an event for completion status. + virtual bool QueryEvent(size_t dev_id, const event::Event* event); + + // ! 
Make a compute stream wait on an event + virtual void StreamWaitEvent(size_t dev_id, const stream::Stream* stream, + const event::Event* event); + + // Memory + virtual void MemoryCopyH2D(size_t dev_id, void* dst, const void* src, + size_t size, + const stream::Stream* stream = nullptr); + + virtual void MemoryCopyD2H(size_t dev_id, void* dst, const void* src, + size_t size, + const stream::Stream* stream = nullptr); + + virtual void MemoryCopyD2D(size_t dev_id, void* dst, const void* src, + size_t size, + const stream::Stream* stream = nullptr); + + virtual void MemoryCopyP2P(const Place& dst_place, void* dst, size_t src_id, + const void* src, size_t size, + const stream::Stream* stream = nullptr); + + virtual void* MemoryAllocate(size_t dev_id, size_t size); + + virtual void MemoryDeallocate(size_t dev_id, void* ptr, size_t size); + + virtual void* MemoryAllocateHost(size_t dev_id, size_t size); + + virtual void MemoryDeallocateHost(size_t dev_id, void* ptr, size_t size); + + virtual void* MemoryAllocateUnified(size_t dev_id, size_t size); + + virtual void MemoryDeallocateUnified(size_t dev_id, void* ptr, size_t size); + + virtual void MemorySet(size_t dev_id, void* ptr, uint8_t value, size_t size); + + virtual void MemoryStats(size_t dev_id, size_t* total, size_t* free); + + virtual size_t GetMinChunkSize(size_t dev_id); + + virtual size_t GetInitAllocSize(size_t dev_id); + + virtual size_t GetReallocSize(size_t dev_id); + + virtual size_t GetMaxAllocSize(size_t dev_id); + + virtual size_t GetMaxChunkSize(size_t dev_id); + + virtual size_t GetExtraPaddingSize(size_t dev_id); + + private: + const std::string type_; + const uint8_t priority_; + const bool is_custom_; + + size_t AllocSize(size_t dev_id, bool realloc); + + size_t AvailableAllocSize(size_t dev_id); +}; + +} // namespace platform +} // namespace paddle + +#endif diff --git a/paddle/fluid/platform/device/device_ext.h b/paddle/fluid/platform/device/device_ext.h new file mode 100644 index 0000000000000000000000000000000000000000..d1e1340f74b7741f867b85d7ab0b1e42c9621a47 --- /dev/null +++ b/paddle/fluid/platform/device/device_ext.h @@ -0,0 +1,497 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
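To make the extension point concrete, below is a skeletal, hypothetical backend built on the DeviceInterface declared above: only the two pure-virtual queries are overridden, everything else inherits the Unimplemented-throwing defaults from device_base.cc, and a file-scope Registrar hands the instance to DeviceManager. The class name and priority value are illustrative, and the vector element type is inferred from how Register() iterates the device count.

```cpp
#include <vector>

#include "paddle/fluid/platform/device/device_base.h"
#include "paddle/fluid/platform/device/device_manager.h"

namespace paddle {
namespace platform {

// Hypothetical single-device backend; real backends also override the
// stream/event/memory hooks they support.
class DemoDevice : public DeviceInterface {
 public:
  DemoDevice()
      : DeviceInterface("DemoDevice", /*priority=*/255, /*is_custom=*/true) {}

  size_t GetDeviceCount() override { return 1; }
  std::vector<size_t> GetDeviceList() override { return {0}; }
};

// Registration at static-initialization time; afterwards
// DeviceManager::HasDeviceType("DemoDevice") should hold.
static Registrar demo_device_registrar(new DemoDevice());

}  // namespace platform
}  // namespace paddle
```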
+ +#pragma once +#if !defined(_WIN32) && !defined(__APPLE__) +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define PADDLE_CUSTOM_RUNTIME_MAJOR_VERSION 0 +#define PADDLE_CUSTOM_RUNTIME_MINOR_VERSION 1 +#define PADDLE_CUSTOM_RUNTIME_PATCH_VERSION 1 + +typedef enum { + C_SUCCESS = 0, // success + C_WARNING, // results may not meet expectation (such as an asynchronous + // interface is actually synchronous) + C_FAILED, // resource exhausted/query failed + C_ERROR, // invalid argument/wrong usage/uninitialized + C_INTERNAL_ERROR // plugin error +} C_Status; + +typedef struct C_Device_st { int id; } * C_Device; + +typedef struct C_Stream_st* C_Stream; + +typedef struct C_Event_st* C_Event; + +typedef void (*C_Callback)(C_Device device, C_Stream stream, void* user_data, + C_Status* status); + +struct C_DeviceInterface { + // Core fill it and plugin must to check it + size_t size; + + /////////////////////// + // device manage api // + /////////////////////// + + /** + * @brief Initialize hardware + * + */ + C_Status (*initialize)(); + + /** + * @brief Deinitialize hardware + * + */ + C_Status (*finalize)(); + + /** + * @brief Initialize device + * + * @param[C_Device] device Core fill it with a logical id, and then plugin + * must replace it with a physical id + */ + C_Status (*init_device)(const C_Device device); + + /** + * @brief Set current device + * + * @param[C_Device] device Core fill it with a physical id + */ + C_Status (*set_device)(const C_Device device); + + /** + * @brief Get current device + * + * @param[C_Device] device Plugin fill it with a physical id + */ + C_Status (*get_device)(const C_Device device); + + /** + * @brief Deinitialize device + * + * @param[C_Device] device Core fill it with a physical id + */ + C_Status (*deinit_device)(const C_Device device); + + /** + * @brief Create a stream + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Stream*] stream Plugin create a stream and fill it + */ + C_Status (*create_stream)(const C_Device device, C_Stream* stream); + + /** + * @brief Destroy a stream + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Stream] stream + */ + C_Status (*destroy_stream)(const C_Device device, C_Stream stream); + + /** + * @brief Query a stream + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Stream] stream + */ + C_Status (*query_stream)(const C_Device device, C_Stream stream); + + /** + * @brief Add a callback to stream + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Stream] stream + * @param[C_Callback] callback + * @param[void*] user_data + */ + C_Status (*stream_add_callback)(const C_Device device, C_Stream stream, + C_Callback callback, void* user_data); + + /** + * @brief Create an event + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Event*] event Plugin create an event and fill it + */ + C_Status (*create_event)(const C_Device device, C_Event* event); + + /** + * @brief Record an event + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Stream] stream + * @param[C_Event] event + */ + C_Status (*record_event)(const C_Device device, C_Stream stream, + C_Event event); + + /** + * @brief Destroy an event + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Event] event + */ + C_Status (*destroy_event)(const C_Device device, C_Event event); + + /** + * @brief Query an event + * + * @param[C_Device] device Core fill it with a physical id + 
* @param[C_Event] event + */ + C_Status (*query_event)(const C_Device device, C_Event event); + + /** + * @brief Synchronize a device + * + * @param[C_Device] device Core fill it with a physical id + */ + C_Status (*synchronize_device)(const C_Device device); + + /** + * @brief Synchronize a stream + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Stream] stream + */ + C_Status (*synchronize_stream)(const C_Device device, C_Stream stream); + + /** + * @brief Synchronize an event + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Event] event + */ + C_Status (*synchronize_event)(const C_Device device, C_Event event); + + /** + * @brief Make a stream wait on an event + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Stream] stream + * @param[C_Event] event + */ + C_Status (*stream_wait_event)(const C_Device device, C_Stream stream, + C_Event event); + + void* reserved_dev_api[8]; + + /////////////////////// + // memory manage api // + /////////////////////// + + /** + * @brief Device memory allocate + * + * @param[C_Device] device Core fill it with a physical id + * @param[void**] ptr Plugin allocate an address and fill it + * @param[size_t] size + */ + C_Status (*device_memory_allocate)(const C_Device device, void** ptr, + size_t size); + + /** + * @brief Device memory deallocate + * + * @param[C_Device] device Core fill it with a physical id + * @param[void*] ptr + * @param[size_t] size + */ + C_Status (*device_memory_deallocate)(const C_Device device, void* ptr, + size_t size); + + /** + * @brief Device memory set + * + * @param[C_Device] device Core fill it with a physical id + * @param[void*] ptr + * @param[unsigned char] value + * @param[size_t] size + */ + C_Status (*device_memory_set)(const C_Device device, void* ptr, + unsigned char value, size_t size); + + /** + * @brief Host memory allocate + * + * @param[C_Device] device Core fill it with a physical id + * @param[void**] ptr Plugin allocate an address and fill it + * @param[size_t] size + */ + C_Status (*host_memory_allocate)(const C_Device device, void** ptr, + size_t size); + + /** + * @brief Host memory deallocate + * + * @param[C_Device] device Core fill it with a physical id + * @param[void*] ptr + * @param[size_t] size + */ + C_Status (*host_memory_deallocate)(const C_Device device, void* ptr, + size_t size); + + /** + * @brief Unified memory allocate + * + * @param[C_Device] device Core fill it with a physical id + * @param[void**] ptr Plugin allocate an address and fill it + * @param[size_t] size + */ + C_Status (*unified_memory_allocate)(const C_Device device, void** ptr, + size_t size); + + /** + * @brief Unified memory deallocate + * + * @param[C_Device] device Core fill it with a physical id + * @param[void*] ptr + * @param[size_t] size + */ + C_Status (*unified_memory_deallocate)(const C_Device device, void* ptr, + size_t size); + + /** + * @brief Memory copy from host to device + * + * @param[C_Device] device Core fill it with a physical id + * @param[void*] dst + * @param[void*] src + * @param[size_t] size + */ + C_Status (*memory_copy_h2d)(const C_Device device, void* dst, const void* src, + size_t size); + + /** + * @brief Memory copy from device to host + * + * @param[C_Device] device Core fill it with a physical id + * @param[void*] dst + * @param[void*] src + * @param[size_t] size + */ + C_Status (*memory_copy_d2h)(const C_Device device, void* dst, const void* src, + size_t size); + + /** + * @brief Memory copy from device to 
device + * + * @param[C_Device] device Core fill it with a physical id + * @param[void*] dst + * @param[void*] src + * @param[size_t] size + */ + C_Status (*memory_copy_d2d)(const C_Device device, void* dst, const void* src, + size_t size); + + /** + * @brief Peer memory copy from device to device + * + * @param[C_Device] dst_device Core fill it with a physical id + * @param[C_Device] src_device Core fill it with a physical id + * @param[void*] dst + * @param[void*] src + * @param[size_t] size + */ + C_Status (*memory_copy_p2p)(const C_Device dst_device, + const C_Device src_device, void* dst, + const void* src, size_t size); + + /** + * @brief Asynchonrize memory copy from host to device + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Stream] stream + * @param[void*] dst + * @param[void*] src + * @param[size_t] size + */ + C_Status (*async_memory_copy_h2d)(const C_Device device, C_Stream stream, + void* dst, const void* src, size_t size); + + /** + * @brief Asynchonrize memory copy from device to host + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Stream] stream + * @param[void*] dst + * @param[void*] src + * @param[size_t] size + */ + C_Status (*async_memory_copy_d2h)(const C_Device device, C_Stream stream, + void* dst, const void* src, size_t size); + + /** + * @brief Asynchonrize memory copy from device to device + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Stream] stream + * @param[void*] dst + * @param[void*] src + * @param[size_t] size + */ + C_Status (*async_memory_copy_d2d)(const C_Device device, C_Stream stream, + void* dst, const void* src, size_t size); + + /** + * @brief Peer asynchonrize memory copy from host to device + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Stream] stream + * @param[void*] dst + * @param[void*] src + * @param[size_t] size + */ + C_Status (*async_memory_copy_p2p)(const C_Device dst_device, + const C_Device src_device, C_Stream stream, + void* dst, const void* src, size_t size); + + void* reserved_mem_api[8]; + + ////////////// + // info api // + ////////////// + + /** + * @brief Get visible device count + * + * @param[size_t*] count Plugin fill it + */ + C_Status (*get_device_count)(size_t* count); + + /** + * @brief Get visible device list + * + * @param[size_t*] devices Plugin fill it + */ + C_Status (*get_device_list)(size_t* devices); + + /** + * @brief Device memory statistic + * + * @param[C_Device] device Core fill it with a physical id + * @param[size_t*] total_memory + * @param[size_t*] free_memory + * @param[size_t*] used_memory + */ + C_Status (*device_memory_stats)(const C_Device device, size_t* total_memory, + size_t* free_memory); + + /** + * @brief Device minimum chunk size + * + * @param[size_t*] count + */ + C_Status (*device_min_chunk_size)(const C_Device device, size_t* count); + + /** + * @brief Device maximum chunk size + * + * @param[size_t*] count + */ + C_Status (*device_max_chunk_size)(const C_Device device, size_t* count); + + /** + * @brief Device maximum alloc size + * + * @param[size_t*] count + */ + C_Status (*device_max_alloc_size)(const C_Device device, size_t* count); + + /** + * @brief Device extra padding size + * + * @param[size_t*] size + */ + C_Status (*device_extra_padding_size)(const C_Device device, size_t* size); + + /** + * @brief Device initial allocated size + * + * @param[size_t*] size + */ + C_Status (*device_init_alloc_size)(const C_Device device, size_t* size); + + /** + * @brief 
Device reallocated size + * + * @param[size_t*] size + */ + C_Status (*device_realloc_size)(const C_Device device, size_t* size); + + /** + * @brief Get compute capability + * + * @param[size_t*] compute_capability + */ + C_Status (*get_compute_capability)(size_t* compute_capability); + + /** + * @brief Get runtime version + * + * @param[size_t*] version + */ + C_Status (*get_runtime_version)(size_t* version); + + /** + * @brief Get driver version + * + * @param[size_t*] version + */ + C_Status (*get_driver_version)(size_t* version); + + void* reserved_info_api[8]; + + /////////////// + // other api // + /////////////// + + void* reserved_other_api[8]; +}; + +struct CustomRuntimeVersion { + size_t major, minor, patch; +}; + +struct CustomRuntimeParams { + // Core fill it and plugin must to check it + size_t size; + // Plugin fill it + C_DeviceInterface* interface; + // Plugin fill it and Core will to check it + CustomRuntimeVersion version; + // Plugin fill it + char* device_type; + // Plugin fill it + char* sub_device_type; + + char reserved[32]; +}; + +// Plugin implement it and fill CustomRuntimeParams +void InitPlugin(CustomRuntimeParams*); + +#ifdef __cplusplus +} // extern "C" +#endif +#endif diff --git a/paddle/fluid/platform/device/device_guard.cc b/paddle/fluid/platform/device/device_guard.cc new file mode 100644 index 0000000000000000000000000000000000000000..55d8b9dc6a9a58dda5ae8192709e6858da878da7 --- /dev/null +++ b/paddle/fluid/platform/device/device_guard.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/device/device_guard.h" + +namespace paddle { +namespace platform { +// Even this source file does not contains any code, it is better to keep this +// source file for cmake dependency. +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/device_guard.h b/paddle/fluid/platform/device/device_guard.h new file mode 100644 index 0000000000000000000000000000000000000000..638e9c984b4d25e474fd5949e9fdc5df98a344ef --- /dev/null +++ b/paddle/fluid/platform/device/device_guard.h @@ -0,0 +1,48 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
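The CustomRuntimeParams comments above state that the plugin fills the version and the core checks it; a plausible, purely hypothetical version of that check, written against the version macros in device_ext.h, is sketched below. The exact policy used by the real loader may differ.

```cpp
#include "paddle/fluid/platform/device/device_ext.h"

// Hypothetical helper: accept a plugin only if it was built against a
// compatible device_ext.h (same major version, not newer minor version).
bool IsCompatibleCustomRuntime(const CustomRuntimeParams& params) {
  return params.version.major == PADDLE_CUSTOM_RUNTIME_MAJOR_VERSION &&
         params.version.minor <= PADDLE_CUSTOM_RUNTIME_MINOR_VERSION;
}
```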
+ +#pragma once +#include "paddle/fluid/platform/device/device_manager.h" + +namespace paddle { +namespace platform { + +class DeviceGuard { + public: + explicit inline DeviceGuard(const Place& place) + : dev_type_(PlaceHelper::GetDeviceType(place)) { + prev_id = DeviceManager::GetDevice(dev_type_); + cur_id = PlaceHelper::GetDeviceId(place); + + if (cur_id != prev_id) { + DeviceManager::SetDevice(dev_type_, cur_id); + } + } + + inline ~DeviceGuard() { + if (cur_id != prev_id) { + DeviceManager::SetDevice(dev_type_, prev_id); + } + } + + DeviceGuard(const DeviceGuard& o) = delete; + DeviceGuard& operator=(const DeviceGuard& o) = delete; + + private: + size_t prev_id, cur_id; + std::string dev_type_; +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/device_manager.cc b/paddle/fluid/platform/device/device_manager.cc new file mode 100644 index 0000000000000000000000000000000000000000..38dcb721b1faeac8bc14b49cf7f0957406d4c590 --- /dev/null +++ b/paddle/fluid/platform/device/device_manager.cc @@ -0,0 +1,420 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/fluid/platform/device/device_manager.h" + +#if !defined(_WIN32) +#include +#else + +#endif + +#include +#include + +namespace paddle { +namespace platform { + +void Device::CreateStream(stream::Stream* stream, + const stream::Stream::Priority& priority, + const stream::Stream::Flag& flag) { + impl_->CreateStream(dev_id_, stream, priority, flag); +} + +void Device::DestroyStream(stream::Stream* stream) { + impl_->DestroyStream(dev_id_, stream); +} + +void Device::SynchronizeStream(const stream::Stream* stream) { + impl_->SynchronizeStream(dev_id_, stream); +} + +bool Device::QueryStream(const stream::Stream* stream) { + return impl_->QueryStream(dev_id_, stream); +} + +void Device::AddCallback(stream::Stream* stream, + stream::Stream::Callback* callback) { + impl_->AddCallback(dev_id_, stream, callback); +} + +void Device::CreateEvent(event::Event* event, event::Event::Flag flags) { + impl_->CreateEvent(dev_id_, event, flags); +} + +void Device::DestroyEvent(event::Event* event) { + impl_->DestroyEvent(dev_id_, event); +} + +void Device::RecordEvent(const event::Event* event, + const stream::Stream* stream) { + impl_->RecordEvent(dev_id_, event, stream); +} + +void Device::SynchronizeEvent(const event::Event* event) { + impl_->SynchronizeEvent(dev_id_, event); +} + +bool Device::QueryEvent(const event::Event* event) { + return impl_->QueryEvent(dev_id_, event); +} + +void Device::StreamWaitEvent(const stream::Stream* stream, + const event::Event* event) { + impl_->StreamWaitEvent(dev_id_, stream, event); +} + +void Device::MemoryCopyH2D(void* dst, const void* src, size_t size, + const stream::Stream* stream) { + impl_->MemoryCopyH2D(dev_id_, dst, src, size, stream); +} + +void Device::MemoryCopyD2H(void* dst, const void* src, size_t size, + const stream::Stream* stream) { 
+ impl_->MemoryCopyD2H(dev_id_, dst, src, size, stream); +} + +void Device::MemoryCopyD2D(void* dst, const void* src, size_t size, + const stream::Stream* stream) { + impl_->MemoryCopyD2D(dev_id_, dst, src, size, stream); +} + +void Device::MemoryCopyP2P(const Place& dst_place, void* dst, const void* src, + size_t size, const stream::Stream* stream) { + impl_->MemoryCopyP2P(dst_place, dst, dev_id_, src, size, stream); +} + +void* Device::MemoryAllocate(size_t size) { + return impl_->MemoryAllocate(dev_id_, size); +} + +void Device::MemoryDeallocate(void* ptr, size_t size) { + impl_->MemoryDeallocate(dev_id_, ptr, size); +} + +void* Device::MemoryAllocateHost(size_t size) { + return impl_->MemoryAllocateHost(dev_id_, size); +} + +void Device::MemoryDeallocateHost(void* ptr, size_t size) { + impl_->MemoryDeallocateHost(dev_id_, ptr, size); +} + +void* Device::MemoryAllocateUnified(size_t size) { + return impl_->MemoryAllocateUnified(dev_id_, size); +} + +void Device::MemoryDeallocateUnified(void* ptr, size_t size) { + impl_->MemoryDeallocateUnified(dev_id_, ptr, size); +} + +void Device::MemorySet(void* ptr, uint8_t value, size_t size) { + impl_->MemorySet(dev_id_, ptr, value, size); +} + +std::string Device::Type() { return impl_->Type(); } + +static pten::RWLock _global_device_manager_rw_lock; + +bool DeviceManager::Register(std::unique_ptr device_impl) { + pten::AutoWRLock lock(&_global_device_manager_rw_lock); + VLOG(4) << "Register Device - " << device_impl->Type(); + auto device_type = device_impl->Type(); + auto& dev_impl_map = Instance().device_impl_map_; + auto& dev_map = Instance().device_map_; + + if (dev_impl_map.find(device_type) == dev_impl_map.end()) { + dev_impl_map.insert( + std::pair>( + device_type, std::move(device_impl))); + auto& dev_impl = dev_impl_map[device_type]; + auto& dev_vec = dev_map[device_type]; + VLOG(4) << "GetDeviceCount is " << dev_impl->GetDeviceCount(); + for (size_t i = 0; i < dev_impl->GetDeviceCount(); ++i) { + dev_vec.emplace_back(new Device(i, dev_impl.get())); + } + } else { + auto& plat = dev_impl_map[device_type]; + if (plat->IsCustom() && plat->Priority() > device_impl->Priority()) { + dev_impl_map[device_type] = std::move(device_impl); + auto& dev_impl = dev_impl_map[device_type]; + auto& dev_vec = dev_map[device_type]; + dev_vec.clear(); + VLOG(4) << "GetDeviceCount is " << dev_impl->GetDeviceCount(); + for (size_t i = 0; i < dev_impl->GetDeviceCount(); ++i) { + dev_vec.emplace_back(new Device(i, dev_impl.get())); + } + } else { + return false; + } + } + return true; +} + +DeviceInterface* DeviceManager::GetDeviceInterfaceWithType( + const std::string& device_type) { + pten::AutoRDLock lock(&_global_device_manager_rw_lock); + + auto& dev_impl_map = Instance().device_impl_map_; + if (dev_impl_map.find(device_type) != dev_impl_map.end()) { + return dev_impl_map.at(device_type).get(); + } else { + LOG(ERROR) << "GetDeviceInterfaceWithType - " << device_type << " Failed\n"; + PADDLE_THROW( + platform::errors::Fatal("Unregistered device type %s.", device_type)); + return nullptr; + } +} + +Device* DeviceManager::GetDeviceWithPlace(const Place& place) { + pten::AutoRDLock lock(&_global_device_manager_rw_lock); + + auto& dev_map = Instance().device_map_; + auto dev_type = PlaceHelper::GetDeviceType(place); + auto dev_id = PlaceHelper::GetDeviceId(place); + PADDLE_ENFORCE_NE(dev_map.find(dev_type), dev_map.end(), + platform::errors::NotFound( + "Unable to find Device with type %s.", dev_type)); + auto& dev_vec = dev_map[dev_type]; + 
PADDLE_ENFORCE_LT( + dev_id, dev_vec.size(), + platform::errors::OutOfRange( + "The visible devices count of type %s is %d, but dev_id is %d.", + dev_type, dev_vec.size(), dev_id)); + return dev_vec[dev_id].get(); +} + +std::vector DeviceManager::GetAllDeviceTypes() { + pten::AutoRDLock lock(&_global_device_manager_rw_lock); + auto& dev_impl_map = Instance().device_impl_map_; + std::vector devices; + for (auto iter = dev_impl_map.cbegin(); iter != dev_impl_map.cend(); ++iter) { + devices.push_back(iter->first); + } + return devices; +} + +std::vector DeviceManager::GetAllCustomDeviceTypes() { + pten::AutoRDLock lock(&_global_device_manager_rw_lock); + auto& dev_impl_map = Instance().device_impl_map_; + std::vector devices; + for (auto iter = dev_impl_map.cbegin(); iter != dev_impl_map.cend(); ++iter) { + if (iter->second->IsCustom()) { + devices.push_back(iter->first); + } + } + return devices; +} + +std::vector DeviceManager::GetAllDeviceList() { + pten::AutoRDLock lock(&_global_device_manager_rw_lock); + auto& dev_impl_map = Instance().device_impl_map_; + std::vector devices; + for (auto iter = dev_impl_map.cbegin(); iter != dev_impl_map.cend(); ++iter) { + size_t device_count = iter->second->GetDeviceCount(); + std::string dev_type = iter->second->Type(); + if (device_count == 1) { + devices.push_back(dev_type); + } else { + for (size_t i = 0; i < device_count; ++i) { + devices.push_back(dev_type + ":" + std::to_string(i)); + } + } + } + return devices; +} + +std::vector DeviceManager::GetAllCustomDeviceList() { + pten::AutoRDLock lock(&_global_device_manager_rw_lock); + auto& dev_impl_map = Instance().device_impl_map_; + std::vector devices; + for (auto iter = dev_impl_map.cbegin(); iter != dev_impl_map.cend(); ++iter) { + size_t device_count = iter->second->GetDeviceCount(); + std::string dev_type = iter->second->Type(); + if (iter->second->IsCustom()) { + if (device_count == 1) { + devices.push_back(dev_type); + } else { + for (size_t i = 0; i < device_count; ++i) { + devices.push_back(dev_type + ":" + std::to_string(i)); + } + } + } + } + return devices; +} + +bool DeviceManager::HasDeviceType(const std::string& device_type) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + return dev_impl != nullptr; +} + +bool DeviceManager::IsCustom(const std::string& device_type) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + return dev_impl->IsCustom(); +} + +void DeviceManager::Initialize(const std::string& device_type) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->Initialize(); +} + +void DeviceManager::Finalize(const std::string& device_type) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->Finalize(); +} + +void DeviceManager::SynchronizeDevice(const Place& place) { + auto device_type = PlaceHelper::GetDeviceType(place); + auto device_id = PlaceHelper::GetDeviceId(place); + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->SynchronizeDevice(device_id); +} + +void DeviceManager::InitDevice(const Place& place) { + auto device_type = PlaceHelper::GetDeviceType(place); + auto device_id = PlaceHelper::GetDeviceId(place); + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->InitDevice(device_id); +} + +void DeviceManager::DeInitDevice(const Place& place) { + auto device_type = PlaceHelper::GetDeviceType(place); + auto device_id = PlaceHelper::GetDeviceId(place); + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->DeInitDevice(device_id); +} + +void 
DeviceManager::SetDevice(const std::string& device_type, + size_t device_id) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->SetDevice(device_id); +} + +void DeviceManager::SetDevice(const Place& place) { + auto device_type = PlaceHelper::GetDeviceType(place); + auto device_id = PlaceHelper::GetDeviceId(place); + DeviceManager::SetDevice(device_type, device_id); +} + +int DeviceManager::GetDevice(const std::string& device_type) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + return dev_impl->GetDevice(); +} + +size_t DeviceManager::GetMinChunkSize(const Place& place) { + auto device_type = PlaceHelper::GetDeviceType(place); + auto device_id = PlaceHelper::GetDeviceId(place); + auto dev_impl = GetDeviceInterfaceWithType(device_type); + return dev_impl->GetMinChunkSize(device_id); +} + +size_t DeviceManager::GetMaxChunkSize(const Place& place) { + auto device_type = PlaceHelper::GetDeviceType(place); + auto device_id = PlaceHelper::GetDeviceId(place); + auto dev_impl = GetDeviceInterfaceWithType(device_type); + return dev_impl->GetMaxChunkSize(device_id); +} + +size_t DeviceManager::GetMaxAllocSize(const Place& place) { + auto device_type = PlaceHelper::GetDeviceType(place); + auto device_id = PlaceHelper::GetDeviceId(place); + auto dev_impl = GetDeviceInterfaceWithType(device_type); + return dev_impl->GetMaxAllocSize(device_id); +} + +size_t DeviceManager::GetInitAllocSize(const Place& place) { + auto device_type = PlaceHelper::GetDeviceType(place); + auto device_id = PlaceHelper::GetDeviceId(place); + auto dev_impl = GetDeviceInterfaceWithType(device_type); + return dev_impl->GetInitAllocSize(device_id); +} + +size_t DeviceManager::GetReallocSize(const Place& place) { + auto device_type = PlaceHelper::GetDeviceType(place); + auto device_id = PlaceHelper::GetDeviceId(place); + auto dev_impl = GetDeviceInterfaceWithType(device_type); + return dev_impl->GetReallocSize(device_id); +} + +size_t DeviceManager::GetExtraPaddingSize(const Place& place) { + auto device_type = PlaceHelper::GetDeviceType(place); + auto device_id = PlaceHelper::GetDeviceId(place); + auto dev_impl = GetDeviceInterfaceWithType(device_type); + return dev_impl->GetExtraPaddingSize(device_id); +} + +void DeviceManager::MemoryStats(const Place& place, size_t* total, + size_t* free) { + auto device_type = PlaceHelper::GetDeviceType(place); + auto device_id = PlaceHelper::GetDeviceId(place); + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->MemoryStats(device_id, total, free); +} + +size_t DeviceManager::GetDeviceCount(const std::string& device_type) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + return dev_impl->GetDeviceCount(); +} + +std::vector DeviceManager::GetDeviceList( + const std::string& device_type) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + return dev_impl->GetDeviceList(); +} + +DeviceManager& DeviceManager::Instance() { + static DeviceManager platform_manager; + return platform_manager; +} + +std::vector ListAllLibraries(const std::string& library_dir) { + std::vector libraries; + std::regex express(".*\\.so"); + std::match_results results; + DIR* dir = nullptr; + dirent* ptr = nullptr; + + dir = opendir(library_dir.c_str()); + if (dir == nullptr) { + VLOG(4) << "open CustomDevice library_dir: " << library_dir << " failed"; + } else { + while ((ptr = readdir(dir)) != nullptr) { + std::string filename(ptr->d_name); + if (std::regex_match(filename.begin(), filename.end(), results, + express)) { + 
libraries.push_back(library_dir + '/' + filename); + VLOG(4) << "found CustomDevice library: " << libraries.back() + << std::endl; + } + } + closedir(dir); + } + + return libraries; +} + +bool LoadCustomDevice(const std::string& library_dir) { + std::vector libs = ListAllLibraries(library_dir); + for (const auto& lib_path : libs) { + auto dso_handle = dlopen(lib_path.c_str(), RTLD_NOW); + LoadCustomRuntimeLib(dso_handle); + } + return true; +} + +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/device/device_manager.h b/paddle/fluid/platform/device/device_manager.h new file mode 100644 index 0000000000000000000000000000000000000000..ad910605d987aed726c41ff242434979aa2bb058 --- /dev/null +++ b/paddle/fluid/platform/device/device_manager.h @@ -0,0 +1,186 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#ifdef PADDLE_WITH_CUSTOM_DEVICE + +#include "paddle/fluid/platform/device/device_base.h" +#include "paddle/fluid/platform/device/device_ext.h" +#include "paddle/fluid/platform/device/event.h" +#include "paddle/fluid/platform/device/stream.h" +#include "paddle/fluid/platform/place.h" + +#include "paddle/pten/backends/dynload/port.h" +#include "paddle/pten/core/utils/rw_lock.h" + +namespace paddle { +namespace platform { +class Device final { + public: + Device(size_t dev_id, DeviceInterface* impl) : dev_id_(dev_id), impl_(impl) {} + + // Stream + // ! Create an asynchronous stream + void CreateStream( + stream::Stream* stream, const stream::Stream::Priority& priority = + stream::Stream::Priority::kNormal, + const stream::Stream::Flag& flag = stream::Stream::Flag::kDefaultFlag); + + // ! Destroys an asynchronous stream. + void DestroyStream(stream::Stream* stream); + + // ! Waits for stream tasks to complete. + void SynchronizeStream(const stream::Stream* stream); + + // ! Queries an asynchronous stream for completion status. + bool QueryStream(const stream::Stream* stream); + + // ! Add a callback to a compute stream. + void AddCallback(stream::Stream* stream, stream::Stream::Callback* callback); + + // Event + // ! Create an event. + void CreateEvent(event::Event* event, event::Event::Flag flags); + + // ! Destroy an event. + void DestroyEvent(event::Event* event); + + // ! Records an event. + void RecordEvent(const event::Event* event, const stream::Stream* stream); + + // ! Waits for event to complete. + void SynchronizeEvent(const event::Event* event); + + // ! Queries an event for completion status. + bool QueryEvent(const event::Event* event); + + // ! 
Make a compute stream wait on an event + void StreamWaitEvent(const stream::Stream* stream, const event::Event* event); + + // Memory + void MemoryCopyH2D(void* dst, const void* src, size_t size, + const stream::Stream* stream = nullptr); + + void MemoryCopyD2H(void* dst, const void* src, size_t size, + const stream::Stream* stream = nullptr); + + void MemoryCopyD2D(void* dst, const void* src, size_t size, + const stream::Stream* stream = nullptr); + + void MemoryCopyP2P(const Place& dst_place, void* dst, const void* src, + size_t size, const stream::Stream* stream = nullptr); + + void* MemoryAllocate(size_t size); + + void MemoryDeallocate(void* ptr, size_t size); + + void* MemoryAllocateHost(size_t size); + + void MemoryDeallocateHost(void* ptr, size_t size); + + void* MemoryAllocateUnified(size_t size); + + void MemoryDeallocateUnified(void* ptr, size_t size); + + void MemorySet(void* ptr, uint8_t value, size_t size); + + std::string Type(); + + private: + size_t dev_id_; + DeviceInterface* impl_; +}; + +class DeviceManager { + public: + static bool Register(std::unique_ptr device); + static bool RegisterPinnedDevice(DeviceInterface* device); + static Device* GetDeviceWithPlace(const Place& place); + static std::vector GetAllDeviceTypes(); + static std::vector GetAllCustomDeviceTypes(); + static std::vector GetAllDeviceList(); + static std::vector GetAllCustomDeviceList(); + static bool HasDeviceType(const std::string& device_type); + static bool IsCustom(const std::string& device_type); + + // platform & device + static void Initialize(const std::string& device_type); + + static void Finalize(const std::string& device_type); + + static void SynchronizeDevice(const Place& place); + + static void InitDevice(const Place& place); + + static void DeInitDevice(const Place& place); + + static void SetDevice(const std::string& device_type, size_t device_id); + + static void SetDevice(const Place& place); + + static int GetDevice(const std::string& device_type); + + static size_t GetMinChunkSize(const Place& place); + + static size_t GetMaxChunkSize(const Place& place); + + static size_t GetMaxAllocSize(const Place& place); + + static size_t GetInitAllocSize(const Place& place); + + static size_t GetReallocSize(const Place& place); + + static size_t GetExtraPaddingSize(const Place& place); + + static void MemoryStats(const Place& place, size_t* total, size_t* free); + + static size_t GetDeviceCount(const std::string& device_type); + + static std::vector GetDeviceList(const std::string& device_type); + + private: + DISABLE_COPY_AND_ASSIGN(DeviceManager); + DeviceManager() {} + static DeviceManager& Instance(); + static DeviceInterface* GetDeviceInterfaceWithType( + const std::string& device_type); + + std::unordered_map> + device_impl_map_; + std::unordered_map>> + device_map_; +}; + +bool LoadCustomRuntimeLib(void* dso_handle); + +bool LoadCustomRuntimeLib(const CustomRuntimeParams& runtime_params, + std::unique_ptr device_interface, + void* dso_handle); + +bool LoadCustomDevice(const std::string& library_path); + +class Registrar { + public: + template + explicit Registrar(DeviceT* device_ptr) { + DeviceManager::Register(std::unique_ptr(device_ptr)); + } + + void Touch() {} +}; + +} // namespace platform +} // namespace paddle + +#endif diff --git a/paddle/fluid/platform/device/device_wrapper.h b/paddle/fluid/platform/device/device_wrapper.h index 4f8bbb2d2689eb6ffee1119c6eb14ef27de7a2c8..ba3461d8c14871561b2d069f9350698306e22366 100644 --- a/paddle/fluid/platform/device/device_wrapper.h +++ 
b/paddle/fluid/platform/device/device_wrapper.h @@ -38,3 +38,12 @@ limitations under the License. */ #ifdef PADDLE_WITH_IPU #include "paddle/fluid/platform/device/ipu/ipu_info.h" #endif + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/fluid/platform/device/callback_manager.h" +#include "paddle/fluid/platform/device/custom/enforce_custom.h" +#include "paddle/fluid/platform/device/device_guard.h" +#include "paddle/fluid/platform/device/device_manager.h" +#include "paddle/fluid/platform/device/event.h" +#include "paddle/fluid/platform/device/stream.h" +#endif diff --git a/paddle/fluid/platform/device/event.cc b/paddle/fluid/platform/device/event.cc new file mode 100644 index 0000000000000000000000000000000000000000..6e6316ea16de020801a7afce6ad47f4b06eca022 --- /dev/null +++ b/paddle/fluid/platform/device/event.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/device/event.h" +#include "paddle/fluid/platform/device/device_guard.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/device/stream.h" + +namespace paddle { +namespace platform { +namespace event { + +event_t Event::raw_event() const { return event_; } + +void Event::set_event(event_t event) { event_ = event; } + +Event::Event(const Place& place, event_t event) + : place_(place), + device_(platform::DeviceManager::GetDeviceWithPlace(place)), + event_(event), + own_data_(false) {} + +Event::~Event() { Destroy(); } + +bool Event::Init(const Place& place, Flag flags) { + place_ = place; + DeviceGuard guard(place_); + device_->CreateEvent(this, flags); + VLOG(3) << "Init Event: " << event_ << ", place: " << place_ + << ", flag:" << static_cast(flags); + own_data_ = true; + return true; +} + +void Event::Destroy() { + if (own_data_) { + DeviceGuard guard(place_); + device_->DestroyEvent(this); + own_data_ = false; + } +} + +void Event::Record(const stream::Stream* stream) { stream->RecordEvent(this); } + +bool Event::Query() const { return device_->QueryEvent(this); } + +void Event::Synchonrize() const { device_->SynchronizeEvent(this); } + +const Place& Event::GetPlace() const { return place_; } + +} // namespace event +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/event.h b/paddle/fluid/platform/device/event.h new file mode 100644 index 0000000000000000000000000000000000000000..376d73eb66660fdcdc0b2412d5d5e1371145e634 --- /dev/null +++ b/paddle/fluid/platform/device/event.h @@ -0,0 +1,61 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace platform { + +class Device; + +namespace stream { +class Stream; +} // namespace stream + +namespace event { +using event_t = void*; + +class Event { + public: + enum Flag { + Default = 0x0, + BlockingSync = 0x1, + DisableTiming = 0x2, + Interprocess = 0x4, + }; + + // For compatible + Event(const Place& place, event_t event); + ~Event(); + event_t raw_event() const; + void set_event(event_t event); + bool Init(const Place& place, Flag flags = Flag::Default); + void Destroy(); + void Record(const stream::Stream* stream); + bool Query() const; + void Synchonrize() const; + const Place& GetPlace() const; + + private: + DISABLE_COPY_AND_ASSIGN(Event); + Place place_; + Device* device_; + event_t event_; + bool own_data_ = true; +}; +} // namespace event + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h index cd78a89088cc612c3fb43e489cfb7ef2e07cfcf3..58a25ae8d0e565b649b29863637fa9d000d524d3 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h @@ -16,8 +16,10 @@ limitations under the License. */ // NOTE(): support float16 to half in header file. #define PADDLE_CUDA_FP16 +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/pten/core/enforce.h" namespace paddle { namespace platform { @@ -61,6 +63,19 @@ __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, static_cast(delta), width)); } +template <> +__forceinline__ __device__ bfloat16 CudaShuffleDownSync(unsigned mask, + bfloat16 val, int delta, + int width) { +#if defined(PADDLE_CUDA_BF16) + return bfloat16(__shfl_down_sync(mask, static_cast(val), + static_cast(delta), width)); +#else + PADDLE_ENFORCE( + false, "__shfl_down_sync with bfloat16 is not supported on cuda <= 11."); +#endif +} + template <> __forceinline__ __device__ paddle::platform::complex CudaShuffleDownSync( unsigned mask, paddle::platform::complex val, int delta, int width) { diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h index 13ffc2396946c5819c9276cf474d96a8057c4094..63897bd6717408bff4bd4db5e739b3ba64316350 100644 --- a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h +++ b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h @@ -16,6 +16,7 @@ limitations under the License. */ // NOTE(): support float16 to half in header file. 
#define PADDLE_CUDA_FP16 +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" @@ -59,6 +60,14 @@ __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, static_cast(delta), width)); } +template <> +__forceinline__ __device__ bfloat16 CudaShuffleDownSync(unsigned mask, + bfloat16 val, int delta, + int width) { + return bfloat16(__shfl_down(static_cast(val), + static_cast(delta), width)); +} + template <> __forceinline__ __device__ paddle::platform::complex CudaShuffleDownSync( unsigned mask, paddle::platform::complex val, int delta, int width) { diff --git a/paddle/fluid/platform/device/stream.cc b/paddle/fluid/platform/device/stream.cc new file mode 100644 index 0000000000000000000000000000000000000000..7f867e5ee7737d45f26a1967a3112c7075843454 --- /dev/null +++ b/paddle/fluid/platform/device/stream.cc @@ -0,0 +1,96 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/device/stream.h" +#include "paddle/fluid/platform/device/device_guard.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/device/event.h" + +namespace paddle { +namespace platform { +namespace stream { + +Stream::~Stream() { Destroy(); } + +const stream_t& Stream::raw_stream() const { return stream_; } + +void Stream::set_stream(stream_t stream) { stream_ = stream; } + +// For compatiable +Stream::Stream(const Place& place, stream_t stream) + : place_(place), + device_(platform::DeviceManager::GetDeviceWithPlace(place)), + stream_(stream), + callback_manager_(new CallbackManager(this)), + own_data_(false) {} + +bool Stream::Init(const Place& place, const Priority& priority, + const Flag& flag) { + place_ = place; + device_ = platform::DeviceManager::GetDeviceWithPlace(place); + DeviceGuard guard(place_); + device_->CreateStream(this, priority, flag); + + callback_manager_.reset(new CallbackManager(this)); + VLOG(3) << "Init Stream: " << stream_ << ", place: " << place_ + << ", priority: " << static_cast(priority) + << ", flag:" << static_cast(flag); + own_data_ = true; + return true; +} + +void Stream::RecordEvent(event::Event* event, Callback callback) const { + callback(); + device_->RecordEvent(event, this); +} + +void Stream::RecordEvent(event::Event* event) const { + device_->RecordEvent(event, this); +} + +void Stream::WaitEvent(event::Event* event) const { + device_->StreamWaitEvent(this, event); +} + +void Stream::Wait() const { +#if !defined(_WIN32) + device_->SynchronizeStream(this); +#else + while (1) { + if (device_->QueryStream(this)) { + break; + } + } +#endif +} + +void Stream::WaitCallback() const { callback_manager_->Wait(); } + +void Stream::Destroy() { + if (own_data_) { + DeviceGuard guard(place_); + device_->DestroyStream(this); + own_data_ = false; + } +} + +bool Stream::Query() const { return device_->QueryStream(this); } + +void Stream::Synchronize() const { 
device_->SynchronizeStream(this); } + +const Place& Stream::GetPlace() const { return place_; } + +} // namespace stream +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/stream.h b/paddle/fluid/platform/device/stream.h new file mode 100644 index 0000000000000000000000000000000000000000..25cf705ee0951847bfda84b336d3579403e8ab37 --- /dev/null +++ b/paddle/fluid/platform/device/stream.h @@ -0,0 +1,79 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/platform/device/callback_manager.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace platform { + +class Device; + +namespace event { +class Event; +} // namespace event + +namespace stream { +using stream_t = void*; +class Stream { + public: + enum class Priority : uint8_t { + kNull = 0x0, + kHigh = 0x1, + kNormal = 0x2, + }; + + enum class Flag : uint8_t { + kDefaultFlag = 0x0, + kStreamNonBlocking = 0x1, + }; + + using Callback = std::function; + + Stream() = default; + // For compatiable + Stream(const Place& place, stream_t stream); + ~Stream(); + const stream_t& raw_stream() const; + void set_stream(stream_t stream); + bool Init(const Place& place, const Priority& priority = Priority::kNormal, + const Flag& flag = Flag::kDefaultFlag); + template + void AddCallback(Callback&& callback) const { + callback_manager_->AddCallback(callback); + } + void RecordEvent(event::Event* event, Callback callback) const; + void RecordEvent(event::Event* event) const; + void WaitEvent(event::Event* event) const; + void Wait() const; + void WaitCallback() const; + void Destroy(); + bool Query() const; + void Synchronize() const; + const Place& GetPlace() const; + + private: + DISABLE_COPY_AND_ASSIGN(Stream); + Place place_; + Device* device_; + stream_t stream_; + std::unique_ptr callback_manager_; + bool own_data_ = true; +}; + +} // namespace stream +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index a0a853a2f059745b281d3651d39baf061edf1053..d448df0702aadd56157902b55b11c41496bcf484 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -30,6 +30,7 @@ limitations under the License. */ #include "paddle/fluid/framework/expect.h" #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { @@ -256,6 +257,15 @@ DeviceContextPool::DeviceContextPool( "NPUPinnedPlace is not supported. Please re-compile with " "WITH_ASCEND_CL " "option.")); +#endif + } else if (platform::is_custom_place(p)) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + EmplaceDeviceContext(&device_contexts_, p); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "CustomPlace is not supported. 
Please re-compile with " + "WITH_CUSTOM_DEVICE " + "option.")); #endif } } @@ -885,6 +895,24 @@ MKLDNNDeviceContext::BlobPtr_t MKLDNNDeviceContext::GetBlob( return key_it->second; } +#endif + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +CustomDeviceContext::CustomDeviceContext(CustomPlace place) : place_(place) { + DeviceGuard guard(place_); + stream_.reset(new stream::Stream()); + stream_->Init(place_); +} + +CustomDeviceContext::~CustomDeviceContext() {} + +const Place& CustomDeviceContext::GetPlace() const { return place_; } + +void CustomDeviceContext::Wait() const { + // platform::RecordEvent record_event("NPUDeviceContext/wait"); + VLOG(4) << "CustomDevice context(" << this << ") Wait"; + stream_->Wait(); +} #endif } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 80dcf6d2ec23cea4f375f54d5d9f1b6e24f382cb..1d51383f6833b584f77bce9e865ad5d229590421 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -70,6 +70,9 @@ limitations under the License. */ #include "paddle/fluid/platform/device/npu/enforce_npu.h" #include "paddle/fluid/platform/device/npu/npu_stream.h" #endif + +#include "paddle/fluid/platform/device/device_ext.h" +#include "paddle/fluid/platform/device/stream.h" #include "unsupported/Eigen/CXX11/Tensor" namespace Eigen { @@ -815,6 +818,47 @@ class MKLDNNDeviceContext : public CPUDeviceContext { }; #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE +class CustomDeviceContext : public DeviceContext { + public: + explicit CustomDeviceContext(CustomPlace place); + virtual ~CustomDeviceContext(); + + const Place& GetPlace() const override; + void Wait() const override; + Eigen::DefaultDevice* eigen_device() const { return nullptr; } + C_Stream stream() const { + return reinterpret_cast(stream_->raw_stream()); + } + + template + void AddStreamCallback(Callback&& callback) const { + return stream_->AddCallback(callback); + } + + void WaitStreamCallback() const { return stream_->WaitCallback(); } + + private: + std::string device_type_; + + CustomPlace place_; + + std::shared_ptr stream_; + + CustomDeviceContext(); + DISABLE_COPY_AND_ASSIGN(CustomDeviceContext); +}; +template <> +struct DefaultDeviceContextType { + using TYPE = CustomDeviceContext; +}; +#else +template <> +struct DefaultDeviceContextType { + using TYPE = DeviceContext; +}; +#endif + /*! \brief device context pool singleton */ class DeviceContextPool { public: diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index b969ba971b6b1ec2ca1ad6e8c0c28fdf07bb6431..39f95a9295661b2b3432d7ca062b2bdb1fe5c40a 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -433,8 +433,9 @@ PADDLE_DEFINE_EXPORTED_double( // NOTE(zhiqiu): better to share the flags, otherwise we will have too many // flags. -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) || \ + defined(PADDLE_WITH_CUSTOM_DEVICE) /** * Memory related FLAG diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index f7a86e5aac765c68e3f11e8adcfdf1c9a75aba7c..5d0fccf9e9d4188e66ac54213271ac7cb10d019e 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -25,6 +25,7 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/dynload/cupti.h" #endif +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/place.h" @@ -234,6 +235,19 @@ void InitDevices(const std::vector devices) { if (!custom_kernel_root.empty()) { LOG(INFO) << "ENV [CUSTOM_DEVICE_ROOT]=" << custom_kernel_root; framework::LoadCustomKernel(custom_kernel_root); +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (platform::LoadCustomDevice(custom_kernel_root)) { + auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes(); + for (auto &dev_type : device_types) { + VLOG(1) << "Device type: " << dev_type << ", visible devices count: " + << platform::DeviceManager::GetDeviceCount(dev_type); + for (size_t i = 0; + i < platform::DeviceManager::GetDeviceCount(dev_type); i++) { + places.push_back(platform::CustomPlace(dev_type, i)); + } + } + } +#endif } else { VLOG(3) << "ENV [CUSTOM_DEVICE_ROOT] is empty."; } diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc index e73e3736f64b462f03e6cda1e6212fcfe55c9939..b73e2e398f270646b19cca06274e549a4a4b62ba 100644 --- a/paddle/fluid/platform/place.cc +++ b/paddle/fluid/platform/place.cc @@ -56,7 +56,16 @@ bool is_npu_pinned_place(const Place &p) { return p.GetType() == pten::AllocationType::NPUPINNED; } +bool is_custom_place(const Place &p) { + return p.GetType() == pten::AllocationType::CUSTOM; +} + bool places_are_same_class(const Place &p1, const Place &p2) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (is_custom_place(p1) && is_custom_place(p2)) { + return p1.GetDeviceType() == p2.GetDeviceType(); + } +#endif return p1.GetType() == p2.GetType(); } @@ -73,6 +82,8 @@ bool is_same_place(const Place &p1, const Place &p2) { return p1 == p2; } else if (is_ipu_place(p1)) { return p1 == p2; + } else if (is_custom_place(p1)) { + return p1 == p2; } else { return p1 == p2; } @@ -81,5 +92,43 @@ bool is_same_place(const Place &p1, const Place &p2) { } } +#ifdef PADDLE_WITH_CUSTOM_DEVICE +std::string PlaceHelper::GetDeviceType(const Place &place) { + if (is_cpu_place(place)) { + return "cpu"; + } else if (is_gpu_place(place)) { + return "gpu"; + } else if (is_npu_place(place)) { + return "npu"; + } else if (is_xpu_place(place)) { + return "xpu"; + } else if (is_custom_place(place)) { + return place.GetDeviceType(); + } else { + PADDLE_THROW(platform::errors::Fatal( + "Unknown device type. 
Please check available devices by " + "paddle.device.get_available_device()")); + } +} + +size_t PlaceHelper::GetDeviceId(const Place &place) { + return place.GetDeviceId(); +} + +Place PlaceHelper::CreatePlace(const std::string &dev_type, size_t dev_id) { + if (dev_type == "cpu") { + return platform::CPUPlace(); + } else if (dev_type == "gpu") { + return platform::CUDAPlace(dev_id); + } else if (dev_type == "npu") { + return platform::NPUPlace(dev_id); + } else if (dev_type == "xpu") { + return platform::XPUPlace(dev_id); + } else { + return platform::CustomPlace(dev_type, dev_id); + } +} +#endif + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h index 80bbeac251810b6d32167433292fc55c3105234e..278bfad003cd444143fc98f3f8382687073cc483 100644 --- a/paddle/fluid/platform/place.h +++ b/paddle/fluid/platform/place.h @@ -36,9 +36,19 @@ using NPUPinnedPlace = pten::NPUPinnedPlace; using XPUPlace = pten::XPUPlace; using IPUPlace = pten::IPUPlace; using MLUPlace = pten::MLUPlace; +using CustomPlace = pten::CustomPlace; using PlaceList = std::vector; +#ifdef PADDLE_WITH_CUSTOM_DEVICE +class PlaceHelper { + public: + static std::string GetDeviceType(const Place &place); + static size_t GetDeviceId(const Place &place); + static Place CreatePlace(const std::string &dev_type, size_t dev_id = 0); +}; +#endif + bool is_gpu_place(const Place &); bool is_xpu_place(const Place &); bool is_npu_place(const Place &); @@ -47,6 +57,7 @@ bool is_ipu_place(const Place &); bool is_cpu_place(const Place &); bool is_cuda_pinned_place(const Place &); bool is_npu_pinned_place(const Place &); +bool is_custom_place(const Place &p); bool places_are_same_class(const Place &, const Place &); bool is_same_place(const Place &, const Place &); @@ -121,6 +132,15 @@ typename Visitor::result_type VisitPlace(const Place &place, #else PADDLE_THROW(platform::errors::Unavailable( "Paddle is not compiled with MLU. Cannot visit mlu device")); +#endif + } + case pten::AllocationType::CUSTOM: { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + platform::CustomPlace p(place.GetDeviceType(), place.GetDeviceId()); + return visitor(p); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CUSTOM. 
Cannot visit custom device")); #endif } default: { diff --git a/paddle/fluid/platform/profiler/CMakeLists.txt b/paddle/fluid/platform/profiler/CMakeLists.txt index 626847f04653cae1acec7dc06d594700aa5d1d70..320e989bd9bb1881e7f1ad0d6d5506fb6e313e24 100644 --- a/paddle/fluid/platform/profiler/CMakeLists.txt +++ b/paddle/fluid/platform/profiler/CMakeLists.txt @@ -5,3 +5,4 @@ cc_library(event_node SRCS event_node.cc DEPS enforce) cc_library(chrometracinglogger SRCS chrometracing_logger.cc DEPS event_node) cc_test(test_event_node SRCS test_event_node.cc DEPS event_node chrometracinglogger) cc_test(new_profiler_test SRCS profiler_test.cc DEPS new_profiler event_node) +add_subdirectory(dump) diff --git a/paddle/fluid/platform/profiler/dump/CMakeLists.txt b/paddle/fluid/platform/profiler/dump/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..e25333f7a8a73864137a85bc64fe28506b86e081 --- /dev/null +++ b/paddle/fluid/platform/profiler/dump/CMakeLists.txt @@ -0,0 +1,4 @@ +proto_library(nodetreeproto SRCS nodetree.proto) +cc_library(serialization_logger SRCS serialization_logger.cc DEPS nodetreeproto event_node) +cc_library(deserialization_reader SRCS deserialization_reader.cc DEPS nodetreeproto event_node) +cc_test(test_serialization_logger SRCS test_serialization_logger.cc DEPS serialization_logger deserialization_reader event_node) diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc new file mode 100644 index 0000000000000000000000000000000000000000..d1049a7dc190845dc91013f688a27224f5e26b0e --- /dev/null +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc @@ -0,0 +1,218 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/platform/profiler/dump/deserialization_reader.h" + +#include + +namespace paddle { +namespace platform { + +DeserializationReader::DeserializationReader(const std::string& filename) + : filename_(filename) { + OpenFile(); + node_trees_proto_ = new NodeTreesProto(); +} + +DeserializationReader::DeserializationReader(const char* filename) + : filename_(filename) { + OpenFile(); + node_trees_proto_ = new NodeTreesProto(); +} + +void DeserializationReader::OpenFile() { + input_file_stream_.open(filename_, std::ifstream::in | std::ifstream::binary); + if (!input_file_stream_) { + VLOG(2) << "Unable to open file for writing profiling data." << std::endl; + } else { + VLOG(0) << "Read profiling data from " << filename_ << std::endl; + } +} + +std::unique_ptr DeserializationReader::Parse() { + if (!node_trees_proto_->ParseFromIstream(&input_file_stream_)) { + VLOG(2) << "Unable to load node trees in protobuf." 
<< std::endl; + return nullptr; + } + std::map thread_event_trees_map; + for (int node_tree_index = 0; + node_tree_index < node_trees_proto_->thread_trees_size(); + node_tree_index++) { + // handle one thread tree + std::map index_node_map; + std::map child_parent_map; + const ThreadNodeTreeProto& thread_node_tree_proto = + node_trees_proto_->thread_trees(node_tree_index); + uint64_t current_threadid = thread_node_tree_proto.thread_id(); + for (int host_node_index = 0; + host_node_index < thread_node_tree_proto.host_nodes_size(); + host_node_index++) { + // handle host node + const HostTraceEventNodeProto& host_node_proto = + thread_node_tree_proto.host_nodes(host_node_index); + HostTraceEventNode* host_node = + RestoreHostTraceEventNode(host_node_proto); + index_node_map[host_node_proto.id()] = host_node; + child_parent_map[host_node_proto.id()] = host_node_proto.parentid(); + // handle runtime node + for (int runtime_node_index = 0; + runtime_node_index < host_node_proto.runtime_nodes_size(); + runtime_node_index++) { + const CudaRuntimeTraceEventNodeProto& runtime_node_proto = + host_node_proto.runtime_nodes(runtime_node_index); + CudaRuntimeTraceEventNode* runtime_node = + RestoreCudaRuntimeTraceEventNode(runtime_node_proto); + host_node->AddCudaRuntimeNode(runtime_node); // insert into host_node + // handle device node + for (int device_node_index = 0; + device_node_index < runtime_node_proto.device_nodes_size(); + device_node_index++) { + const DeviceTraceEventNodeProto& device_node_proto = + runtime_node_proto.device_nodes(device_node_index); + DeviceTraceEventNode* device_node = + RestoreDeviceTraceEventNode(device_node_proto); + runtime_node->AddDeviceTraceEventNode( + device_node); // insert into runtime_node + } + } + } + // restore parent-child relationship + for (auto it = child_parent_map.begin(); it != child_parent_map.end(); + it++) { + if (it->second != -1) { // not root node + index_node_map[it->second]->AddChild(index_node_map[it->first]); + } else { + thread_event_trees_map[current_threadid] = + index_node_map[it->first]; // root node + } + } + } + // restore NodeTrees object + return std::unique_ptr(new NodeTrees(thread_event_trees_map)); +} + +DeserializationReader::~DeserializationReader() { + delete node_trees_proto_; + input_file_stream_.close(); +} + +DeviceTraceEventNode* DeserializationReader::RestoreDeviceTraceEventNode( + const DeviceTraceEventNodeProto& device_node_proto) { + const DeviceTraceEventProto& device_event_proto = + device_node_proto.device_event(); + DeviceTraceEvent device_event; + device_event.name = device_event_proto.name(); + device_event.type = static_cast(device_event_proto.type()); + device_event.start_ns = device_event_proto.start_ns(); + device_event.end_ns = device_event_proto.end_ns(); + device_event.device_id = device_event_proto.device_id(); + device_event.context_id = device_event_proto.context_id(); + device_event.stream_id = device_event_proto.stream_id(); + device_event.correlation_id = device_event_proto.correlation_id(); + switch (device_event.type) { + case TracerEventType::Kernel: + device_event.kernel_info = HandleKernelEventInfoProto(device_event_proto); + break; + + case TracerEventType::Memcpy: + device_event.memcpy_info = HandleMemcpyEventInfoProto(device_event_proto); + break; + + case TracerEventType::Memset: + device_event.memset_info = HandleMemsetEventInfoProto(device_event_proto); + break; + default: + break; + } + return new DeviceTraceEventNode(device_event); +} + +CudaRuntimeTraceEventNode* 
+DeserializationReader::RestoreCudaRuntimeTraceEventNode( + const CudaRuntimeTraceEventNodeProto& runtime_node_proto) { + const CudaRuntimeTraceEventProto& runtime_event_proto = + runtime_node_proto.runtime_trace_event(); + RuntimeTraceEvent runtime_event; + runtime_event.name = runtime_event_proto.name(); + runtime_event.start_ns = runtime_event_proto.start_ns(); + runtime_event.end_ns = runtime_event_proto.end_ns(); + runtime_event.process_id = runtime_event_proto.process_id(); + runtime_event.thread_id = runtime_event_proto.thread_id(); + runtime_event.correlation_id = runtime_event_proto.correlation_id(); + runtime_event.callback_id = runtime_event_proto.callback_id(); + return new CudaRuntimeTraceEventNode(runtime_event); +} + +HostTraceEventNode* DeserializationReader::RestoreHostTraceEventNode( + const HostTraceEventNodeProto& host_node_proto) { + const HostTraceEventProto& host_event_proto = + host_node_proto.host_trace_event(); + HostTraceEvent host_event; + host_event.name = host_event_proto.name(); + host_event.type = static_cast(host_event_proto.type()); + host_event.start_ns = host_event_proto.start_ns(); + host_event.end_ns = host_event_proto.end_ns(); + host_event.process_id = host_event_proto.process_id(); + host_event.thread_id = host_event_proto.thread_id(); + return new HostTraceEventNode(host_event); +} + +KernelEventInfo DeserializationReader::HandleKernelEventInfoProto( + const DeviceTraceEventProto& device_event_proto) { + const KernelEventInfoProto& kernel_info_proto = + device_event_proto.kernel_info(); + KernelEventInfo kernel_info; + kernel_info.block_x = kernel_info_proto.block_x(); + kernel_info.block_y = kernel_info_proto.block_y(); + kernel_info.block_z = kernel_info_proto.block_z(); + kernel_info.grid_x = kernel_info_proto.grid_x(); + kernel_info.grid_y = kernel_info_proto.grid_y(); + kernel_info.grid_z = kernel_info_proto.grid_z(); + kernel_info.dynamic_shared_memory = kernel_info_proto.dynamic_shared_memory(); + kernel_info.static_shared_memory = kernel_info_proto.static_shared_memory(); + kernel_info.registers_per_thread = kernel_info_proto.registers_per_thread(); + kernel_info.local_memory_per_thread = + kernel_info_proto.local_memory_per_thread(); + kernel_info.local_memory_total = kernel_info_proto.local_memory_total(); + kernel_info.queued = kernel_info_proto.queued(); + kernel_info.submitted = kernel_info_proto.submitted(); + kernel_info.completed = kernel_info_proto.completed(); + return kernel_info; +} + +MemcpyEventInfo DeserializationReader::HandleMemcpyEventInfoProto( + const DeviceTraceEventProto& device_event_proto) { + const MemcpyEventInfoProto& memcpy_info_proto = + device_event_proto.memcpy_info(); + MemcpyEventInfo memcpy_info; + memcpy_info.num_bytes = memcpy_info_proto.num_bytes(); + std::strncpy(memcpy_info.copy_kind, memcpy_info_proto.copy_kind().c_str(), + kMemKindMaxLen - 1); + std::strncpy(memcpy_info.src_kind, memcpy_info_proto.src_kind().c_str(), + kMemKindMaxLen - 1); + std::strncpy(memcpy_info.dst_kind, memcpy_info_proto.dst_kind().c_str(), + kMemKindMaxLen - 1); + return memcpy_info; +} + +MemsetEventInfo DeserializationReader::HandleMemsetEventInfoProto( + const DeviceTraceEventProto& device_event_proto) { + const MemsetEventInfoProto& memset_info_proto = + device_event_proto.memset_info(); + MemsetEventInfo memset_info; + memset_info.num_bytes = memset_info_proto.num_bytes(); + std::strncpy(memset_info.memory_kind, memset_info_proto.memory_kind().c_str(), + kMemKindMaxLen - 1); + memset_info.value = 
memset_info_proto.value(); + return memset_info; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.h b/paddle/fluid/platform/profiler/dump/deserialization_reader.h new file mode 100644 index 0000000000000000000000000000000000000000..1ad2dabf229ad5665db6cc9f9ec43470f0b232f3 --- /dev/null +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include <fstream> + +#include "paddle/fluid/platform/profiler/dump/nodetree.pb.h" +#include "paddle/fluid/platform/profiler/event_node.h" + +namespace paddle { +namespace platform { + +class DeserializationReader { + public: + explicit DeserializationReader(const std::string& filename); + explicit DeserializationReader(const char* filename); + ~DeserializationReader(); + std::unique_ptr<NodeTrees> Parse(); + + private: + void OpenFile(); + DeviceTraceEventNode* RestoreDeviceTraceEventNode( + const DeviceTraceEventNodeProto&); + CudaRuntimeTraceEventNode* RestoreCudaRuntimeTraceEventNode( + const CudaRuntimeTraceEventNodeProto&); + HostTraceEventNode* RestoreHostTraceEventNode(const HostTraceEventNodeProto&); + KernelEventInfo HandleKernelEventInfoProto(const DeviceTraceEventProto&); + MemcpyEventInfo HandleMemcpyEventInfoProto(const DeviceTraceEventProto&); + MemsetEventInfo HandleMemsetEventInfoProto(const DeviceTraceEventProto&); + std::string filename_; + std::ifstream input_file_stream_; + NodeTreesProto* node_trees_proto_; +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/dump/nodetree.proto b/paddle/fluid/platform/profiler/dump/nodetree.proto new file mode 100644 index 0000000000000000000000000000000000000000..37dac0e597ce208da05271ff88c6f28b3c9dd9f9 --- /dev/null +++ b/paddle/fluid/platform/profiler/dump/nodetree.proto @@ -0,0 +1,181 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
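+// The messages below mirror the in-memory profiler event tree that the
+// serialization logger walks: a NodeTreesProto holds one ThreadNodeTreeProto
+// per thread, every host trace event node owns the CUDA runtime nodes
+// recorded under it, and every runtime node owns the device (kernel, memcpy,
+// memset) nodes correlated with it.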
+ +syntax = "proto2"; +package paddle.platform; + +enum TracerEventTypeProto { + // Used to mark operator record + Operator = 0; + // Used to mark dataloader record + Dataloader = 1; + // Used to mark profile step record + ProfileStep = 2; + // Used to mark cuda runtime record returned by cupti + CudaRuntime = 3; + // Used to mark kernel computation record returned by cupti + Kernel = 4; + // Used to mark memcpy record returned by cupti + Memcpy = 5; + // Used to mark memset record returned by cupti + Memset = 6; + // Used to mark record defined by user + UserDefined = 7; + // A flag to denote the number of current types + NumTypes = 8; +} + +message KernelEventInfoProto { + // The X-dimension block size for the kernel. + required uint32 block_x = 1; + // The Y-dimension block size for the kernel. + required uint32 block_y = 2; + // The Z-dimension grid size for the kernel. + required uint32 block_z = 3; + // X-dimension of a grid. + required uint32 grid_x = 4; + // Y-dimension of a grid. + required uint32 grid_y = 5; + // Z-dimension of a grid. + required uint32 grid_z = 6; + // The dynamic shared memory reserved for the kernel, in bytes. + required uint32 dynamic_shared_memory = 7; + // The static shared memory allocated for the kernel, in bytes. + required uint32 static_shared_memory = 8; + // The number of registers required for each thread executing the kernel. + required uint32 registers_per_thread = 9; + // The amount of local memory reserved for each thread, in bytes. + required uint32 local_memory_per_thread = 10; + // The total amount of local memory reserved for the kernel, in bytes. + required uint32 local_memory_total = 11; + // The timestamp when the kernel is queued up in the command buffer, in ns. + // This timestamp is not collected by default. Use API + // cuptiActivityEnableLatencyTimestamps() to enable collection. + required uint64 queued = 12; + // The timestamp when the command buffer containing the kernel launch is + // submitted to the GPU, in ns. + // This timestamp is not collected by default. Use API + // cuptiActivityEnableLatencyTimestamps() to enable collection. + required uint64 submitted = 13; + // The completed timestamp for the kernel execution, in ns. + required uint64 completed = 14; +} + +message MemcpyEventInfoProto { + // The number of bytes transferred by the memory copy. + required uint64 num_bytes = 1; + // The kind of the memory copy. + // Each kind represents the source and destination targets of a memory copy. + // Targets are host, device, and array. Refer to CUpti_ActivityMemcpyKind + required string copy_kind = 2; + // The source memory kind read by the memory copy. + // Each kind represents the type of the memory accessed by a memory + // operation/copy. Refer to CUpti_ActivityMemoryKind + required string src_kind = 3; + // The destination memory kind read by the memory copy. + required string dst_kind = 4; +} + +message MemsetEventInfoProto { + // The number of bytes being set by the memory set. + required uint64 num_bytes = 1; + // The memory kind of the memory set. Refer to CUpti_ActivityMemoryKind + required string memory_kind = 2; + // the value being assigned to memory by the memory set. 
+ required uint32 value = 3; +} + +message HostTraceEventProto { + required string name = 1; + required TracerEventTypeProto type = 2; + // start timestamp of the record + required uint64 start_ns = 3; + // end timestamp of the record + required uint64 end_ns = 4; + // process id of the record + required uint64 process_id = 5; + // thread id of the record + required uint64 thread_id = 6; +} + +message CudaRuntimeTraceEventProto { + // record name + required string name = 1; + // start timestamp of the record + required uint64 start_ns = 2; + // end timestamp of the record + required uint64 end_ns = 3; + // process id of the record + required uint64 process_id = 4; + // thread id of the record + required uint64 thread_id = 5; + // correlation id, used for correlating async activities happened on device + required uint32 correlation_id = 6; + // callback id, used to identify which cuda runtime api is called + required uint32 callback_id = 7; +} + +message DeviceTraceEventProto { + // record name + required string name = 1; + // record type, one of TracerEventType + required TracerEventTypeProto type = 2; + // start timestamp of the record + required uint64 start_ns = 3; + // end timestamp of the record + required uint64 end_ns = 4; + // device id + required uint64 device_id = 5; + // context id + required uint64 context_id = 6; + // stream id + required uint64 stream_id = 7; + // correlation id, used for correlating async activities happened on device + required uint32 correlation_id = 8; + // union, specific device record type has different detail information + oneof detail_info { + // used for TracerEventType::Kernel + KernelEventInfoProto kernel_info = 9; + // used for TracerEventType::Memcpy + MemcpyEventInfoProto memcpy_info = 10; + // used for TracerEventType::Memset + MemsetEventInfoProto memset_info = 11; + } +} + +message DeviceTraceEventNodeProto { + required DeviceTraceEventProto device_event = 1; +} + +message CudaRuntimeTraceEventNodeProto { + required CudaRuntimeTraceEventProto runtime_trace_event = 1; + repeated DeviceTraceEventNodeProto device_nodes = 2; +} + +message HostTraceEventNodeProto { + required int64 id = 1; + required int64 parentid = 2; + required HostTraceEventProto host_trace_event = 3; + repeated CudaRuntimeTraceEventNodeProto runtime_nodes = 4; +} + +message ThreadNodeTreeProto { + required uint64 thread_id = 1; + repeated HostTraceEventNodeProto host_nodes = 2; +} + +message NodeTreesProto { + required string version = 1; + repeated ThreadNodeTreeProto thread_trees = 2; +} diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.cc b/paddle/fluid/platform/profiler/dump/serialization_logger.cc new file mode 100644 index 0000000000000000000000000000000000000000..d9ed84bd438a7e2ac95a6637b6efcae870a8ad75 --- /dev/null +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.cc @@ -0,0 +1,265 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "glog/logging.h" + +#include "paddle/fluid/platform/profiler/dump/serialization_logger.h" +#include "paddle/fluid/platform/profiler/event_node.h" +#include "paddle/fluid/platform/profiler/utils.h" + +namespace paddle { +namespace platform { + +static const char* kDefaultFilename = "pid_%s_time_%s.paddle_trace.pb"; +static const char* version = "1.0.0"; + +static std::string DefaultFileName() { + auto pid = GetProcessId(); + return string_format(std::string(kDefaultFilename), pid, + GetStringFormatLocalTime().c_str()); +} + +void SerializationLogger::OpenFile() { + output_file_stream_.open(filename_, std::ofstream::out | + std::ofstream::trunc | + std::ofstream::binary); + if (!output_file_stream_) { + LOG(WARNING) << "Unable to open file for writing profiling data." + << std::endl; + } else { + LOG(INFO) << "writing profiling data to " << filename_ << std::endl; + } + node_trees_proto_ = new NodeTreesProto(); + node_trees_proto_->set_version(std::string(version)); +} + +void SerializationLogger::LogNodeTrees(const NodeTrees& node_trees) { + // dump the whole tree into file + const std::map> + thread2host_event_nodes = node_trees.Traverse(true); + + for (auto it = thread2host_event_nodes.begin(); + it != thread2host_event_nodes.end(); ++it) { + // 1. order every node an index, every node a parent + std::map node_index_map; + std::map node_parent_map; + int64_t index = 0; + for (auto hostnode = it->second.begin(); hostnode != it->second.end(); + ++hostnode) { + node_index_map[(*hostnode)] = index; // order each node + index++; + } + node_parent_map[(*(it->second.begin()))] = -1; // root's parent set as -1 + for (auto hostnode = it->second.begin(); hostnode != it->second.end(); + ++hostnode) { + for (auto childnode = (*hostnode)->GetChildren().begin(); + childnode != (*hostnode)->GetChildren().end(); ++childnode) { + node_parent_map[(*childnode)] = + node_index_map[(*hostnode)]; // mark each node's parent + } + } + + // 2. 
serialize host node, runtime node and device node + current_thread_node_tree_proto_ = + node_trees_proto_->add_thread_trees(); // add ThreadNodeTreeProto + current_thread_node_tree_proto_->set_thread_id(it->first); + for (auto hostnode = it->second.begin(); hostnode != it->second.end(); + ++hostnode) { + HostTraceEventNodeProto* host_node_proto = + current_thread_node_tree_proto_ + ->add_host_nodes(); // add HostTraceEventNodeProto + host_node_proto->set_id(node_index_map[(*hostnode)]); + host_node_proto->set_parentid(node_parent_map[(*hostnode)]); + current_host_trace_event_node_proto_ = + host_node_proto; // set current HostTraceEventNodeProto + (*hostnode)->LogMe(this); // fill detail information + + for (auto runtimenode = (*hostnode)->GetRuntimeTraceEventNodes().begin(); + runtimenode != (*hostnode)->GetRuntimeTraceEventNodes().end(); + ++runtimenode) { + CudaRuntimeTraceEventNodeProto* runtime_node_proto = + current_host_trace_event_node_proto_ + ->add_runtime_nodes(); // add CudaRuntimeTraceEventNodeProto + current_runtime_trace_event_node_proto_ = + runtime_node_proto; // set current CudaRuntimeTraceEventNodeProto + (*runtimenode)->LogMe(this); // fill detail information + for (auto devicenode = + (*runtimenode)->GetDeviceTraceEventNodes().begin(); + devicenode != (*runtimenode)->GetDeviceTraceEventNodes().end(); + ++devicenode) { + DeviceTraceEventNodeProto* device_node_proto = + current_runtime_trace_event_node_proto_ + ->add_device_nodes(); // add DeviceTraceEventNodeProto + current_device_trace_event_node_proto_ = + device_node_proto; // set current DeviceTraceEventNodeProto + (*devicenode)->LogMe(this); // fill detail information + } + } + } + } +} + +void SerializationLogger::LogHostTraceEventNode( + const HostTraceEventNode& host_node) { + HostTraceEventProto* host_trace_event = new HostTraceEventProto(); + host_trace_event->set_name(host_node.Name()); + host_trace_event->set_type( + static_cast(host_node.Type())); + host_trace_event->set_start_ns(host_node.StartNs()); + host_trace_event->set_end_ns(host_node.EndNs()); + host_trace_event->set_process_id(host_node.ProcessId()); + host_trace_event->set_thread_id(host_node.ThreadId()); + current_host_trace_event_node_proto_->set_allocated_host_trace_event( + host_trace_event); +} + +void SerializationLogger::LogRuntimeTraceEventNode( + const CudaRuntimeTraceEventNode& runtime_node) { + CudaRuntimeTraceEventProto* runtime_trace_event = + new CudaRuntimeTraceEventProto(); + runtime_trace_event->set_name(runtime_node.Name()); + runtime_trace_event->set_start_ns(runtime_node.StartNs()); + runtime_trace_event->set_end_ns(runtime_node.EndNs()); + runtime_trace_event->set_process_id(runtime_node.ProcessId()); + runtime_trace_event->set_thread_id(runtime_node.ThreadId()); + runtime_trace_event->set_correlation_id(runtime_node.CorrelationId()); + runtime_trace_event->set_callback_id(runtime_node.CallbackId()); + current_runtime_trace_event_node_proto_->set_allocated_runtime_trace_event( + runtime_trace_event); +} + +void SerializationLogger::LogDeviceTraceEventNode( + const DeviceTraceEventNode& device_node) { + switch (device_node.Type()) { + case TracerEventType::Kernel: + HandleTypeKernel(device_node); + break; + case TracerEventType::Memcpy: + HandleTypeMemcpy(device_node); + break; + case TracerEventType::Memset: + HandleTypeMemset(device_node); + break; + default: + break; + } +} + +void SerializationLogger::HandleTypeKernel( + const DeviceTraceEventNode& device_node) { + DeviceTraceEventProto* device_trace_event = new 
DeviceTraceEventProto(); + KernelEventInfoProto* kernel_info = new KernelEventInfoProto(); + // fill DeviceTraceEventProto + device_trace_event->set_name(device_node.Name()); + device_trace_event->set_type( + static_cast(device_node.Type())); + device_trace_event->set_start_ns(device_node.StartNs()); + device_trace_event->set_end_ns(device_node.EndNs()); + device_trace_event->set_device_id(device_node.DeviceId()); + device_trace_event->set_context_id(device_node.ContextId()); + device_trace_event->set_stream_id(device_node.StreamId()); + device_trace_event->set_correlation_id(device_node.CorrelationId()); + // fill KernelEventInfoProto + KernelEventInfo info = device_node.KernelInfo(); + kernel_info->set_block_x(info.block_x); + kernel_info->set_block_y(info.block_y); + kernel_info->set_block_z(info.block_z); + kernel_info->set_grid_x(info.grid_x); + kernel_info->set_grid_y(info.grid_y); + kernel_info->set_grid_z(info.grid_z); + kernel_info->set_dynamic_shared_memory(info.dynamic_shared_memory); + kernel_info->set_static_shared_memory(info.static_shared_memory); + kernel_info->set_registers_per_thread(info.registers_per_thread); + kernel_info->set_local_memory_per_thread(info.local_memory_per_thread); + kernel_info->set_local_memory_total(info.local_memory_total); + kernel_info->set_queued(info.queued); + kernel_info->set_submitted(info.submitted); + kernel_info->set_completed(info.completed); + // binding + device_trace_event->set_allocated_kernel_info(kernel_info); + current_device_trace_event_node_proto_->set_allocated_device_event( + device_trace_event); +} + +void SerializationLogger::HandleTypeMemcpy( + const DeviceTraceEventNode& device_node) { + DeviceTraceEventProto* device_trace_event = new DeviceTraceEventProto(); + MemcpyEventInfoProto* memcpy_info = new MemcpyEventInfoProto(); + // fill DeviceTraceEventProto + device_trace_event->set_name(device_node.Name()); + device_trace_event->set_type( + static_cast(device_node.Type())); + device_trace_event->set_start_ns(device_node.StartNs()); + device_trace_event->set_end_ns(device_node.EndNs()); + device_trace_event->set_device_id(device_node.DeviceId()); + device_trace_event->set_context_id(device_node.ContextId()); + device_trace_event->set_stream_id(device_node.StreamId()); + device_trace_event->set_correlation_id(device_node.CorrelationId()); + // fill MemcpyEventInfoProto + MemcpyEventInfo info = device_node.MemcpyInfo(); + memcpy_info->set_num_bytes(info.num_bytes); + memcpy_info->set_copy_kind(std::string(info.copy_kind)); + memcpy_info->set_src_kind(std::string(info.src_kind)); + memcpy_info->set_dst_kind(std::string(info.dst_kind)); + // binding + device_trace_event->set_allocated_memcpy_info(memcpy_info); + current_device_trace_event_node_proto_->set_allocated_device_event( + device_trace_event); +} + +void SerializationLogger::HandleTypeMemset( + const DeviceTraceEventNode& device_node) { + DeviceTraceEventProto* device_trace_event = new DeviceTraceEventProto(); + MemsetEventInfoProto* memset_info = new MemsetEventInfoProto(); + // fill DeviceTraceEventProto + device_trace_event->set_name(device_node.Name()); + device_trace_event->set_type( + static_cast(device_node.Type())); + device_trace_event->set_start_ns(device_node.StartNs()); + device_trace_event->set_end_ns(device_node.EndNs()); + device_trace_event->set_device_id(device_node.DeviceId()); + device_trace_event->set_context_id(device_node.ContextId()); + device_trace_event->set_stream_id(device_node.StreamId()); + 
device_trace_event->set_correlation_id(device_node.CorrelationId()); + // fill MemsetEventInfoProto + MemsetEventInfo info = device_node.MemsetInfo(); + memset_info->set_num_bytes(info.num_bytes); + memset_info->set_memory_kind(std::string(info.memory_kind)); + memset_info->set_value(info.value); + // binding + device_trace_event->set_allocated_memset_info(memset_info); + current_device_trace_event_node_proto_->set_allocated_device_event( + device_trace_event); +} + +SerializationLogger::SerializationLogger(const std::string& filename) { + filename_ = filename.empty() ? DefaultFileName() : filename; + OpenFile(); +} + +SerializationLogger::SerializationLogger(const char* filename_cstr) { + std::string filename(filename_cstr); + filename_ = filename.empty() ? DefaultFileName() : filename; + OpenFile(); +} + +SerializationLogger::~SerializationLogger() { + if (!output_file_stream_) { + delete node_trees_proto_; + return; + } + node_trees_proto_->SerializeToOstream(&output_file_stream_); + delete node_trees_proto_; + output_file_stream_.close(); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.h b/paddle/fluid/platform/profiler/dump/serialization_logger.h new file mode 100755 index 0000000000000000000000000000000000000000..1295be95d45316d6884b68b3115caefa7905d673 --- /dev/null +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/platform/profiler/dump/nodetree.pb.h" +#include "paddle/fluid/platform/profiler/output_logger.h" + +namespace paddle { +namespace platform { + +// Dumps a NodeTrees object into a protobuf file. +// A SerializationLogger can only dump one NodeTrees object; it creates the +// output file in its constructor and closes it in its destructor. 
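+//
+// A minimal usage sketch (the file name is illustrative, not part of this
+// patch); it assumes a NodeTrees object `tree` has already been built from
+// collected trace events, as in the tests under profiler/dump:
+//
+//   SerializationLogger logger("worker0.paddle_trace.pb");
+//   tree.LogMe(&logger);   // fills the protobuf representation of the tree;
+//                          // the file is written when `logger` is destroyed
+//
+//   DeserializationReader reader("worker0.paddle_trace.pb");
+//   std::unique_ptr<NodeTrees> restored = reader.Parse();
+//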
+class SerializationLogger : public BaseLogger { + public: + explicit SerializationLogger(const std::string& filename); + explicit SerializationLogger(const char* filename); + ~SerializationLogger(); + std::string filename() { return filename_; } + void LogDeviceTraceEventNode(const DeviceTraceEventNode&) override; + void LogHostTraceEventNode(const HostTraceEventNode&) override; + void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override; + void LogNodeTrees(const NodeTrees&) override; + + private: + void OpenFile(); + void HandleTypeKernel(const DeviceTraceEventNode&); + void HandleTypeMemset(const DeviceTraceEventNode&); + void HandleTypeMemcpy(const DeviceTraceEventNode&); + std::string filename_; + std::ofstream output_file_stream_; + NodeTreesProto* node_trees_proto_; + ThreadNodeTreeProto* current_thread_node_tree_proto_; + HostTraceEventNodeProto* current_host_trace_event_node_proto_; + CudaRuntimeTraceEventNodeProto* current_runtime_trace_event_node_proto_; + DeviceTraceEventNodeProto* current_device_trace_event_node_proto_; +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc new file mode 100644 index 0000000000000000000000000000000000000000..2fe9626ec76df5654d19e785d043311f5f00496e --- /dev/null +++ b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc @@ -0,0 +1,174 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
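+
+// These tests build small synthetic event trees, dump them with
+// SerializationLogger, and restore one of the dumped files with
+// DeserializationReader. Note that restore_case0 reads the
+// test_serialization_logger_case0.pb file written by dump_case0.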
+ +#include "gtest/gtest.h" + +#include "paddle/fluid/platform/profiler/dump/deserialization_reader.h" +#include "paddle/fluid/platform/profiler/dump/serialization_logger.h" +#include "paddle/fluid/platform/profiler/event_node.h" + +using paddle::platform::SerializationLogger; +using paddle::platform::DeserializationReader; +using paddle::platform::NodeTrees; +using paddle::platform::HostTraceEventNode; +using paddle::platform::CudaRuntimeTraceEventNode; +using paddle::platform::DeviceTraceEventNode; +using paddle::platform::HostTraceEvent; +using paddle::platform::RuntimeTraceEvent; +using paddle::platform::DeviceTraceEvent; +using paddle::platform::TracerEventType; +using paddle::platform::KernelEventInfo; +using paddle::platform::MemcpyEventInfo; +using paddle::platform::MemsetEventInfo; + +TEST(SerializationLoggerTest, dump_case0) { + std::list host_events; + std::list runtime_events; + std::list device_events; + host_events.push_back(HostTraceEvent(std::string("dataloader#1"), + TracerEventType::Dataloader, 1000, 10000, + 10, 10)); + host_events.push_back(HostTraceEvent( + std::string("op1"), TracerEventType::Operator, 11000, 20000, 10, 10)); + host_events.push_back(HostTraceEvent( + std::string("op2"), TracerEventType::Operator, 21000, 30000, 10, 10)); + host_events.push_back(HostTraceEvent( + std::string("op3"), TracerEventType::Operator, 31000, 40000, 10, 11)); + runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000, + 17000, 10, 10, 1, 0)); + runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 25000, + 35000, 10, 10, 2, 0)); + runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch3"), 33000, + 37000, 10, 11, 3, 0)); + runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemcpy1"), 18000, + 19000, 10, 10, 4, 0)); + runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemset1"), 38000, + 39000, 10, 11, 5, 0)); + device_events.push_back( + DeviceTraceEvent(std::string("kernel1"), TracerEventType::Kernel, 40000, + 55000, 0, 10, 10, 1, KernelEventInfo())); + device_events.push_back( + DeviceTraceEvent(std::string("kernel2"), TracerEventType::Kernel, 70000, + 95000, 0, 10, 10, 2, KernelEventInfo())); + device_events.push_back( + DeviceTraceEvent(std::string("kernel3"), TracerEventType::Kernel, 60000, + 65000, 0, 10, 11, 3, KernelEventInfo())); + device_events.push_back( + DeviceTraceEvent(std::string("memcpy1"), TracerEventType::Memcpy, 56000, + 59000, 0, 10, 10, 4, MemcpyEventInfo())); + device_events.push_back( + DeviceTraceEvent(std::string("memset1"), TracerEventType::Memset, 66000, + 69000, 0, 10, 11, 5, MemsetEventInfo())); + SerializationLogger logger("test_serialization_logger_case0.pb"); + NodeTrees tree(host_events, runtime_events, device_events); + std::map> nodes = + tree.Traverse(true); + EXPECT_EQ(nodes[10].size(), 4u); + EXPECT_EQ(nodes[11].size(), 2u); + std::vector thread1_nodes = nodes[10]; + std::vector thread2_nodes = nodes[11]; + for (auto it = thread1_nodes.begin(); it != thread1_nodes.end(); it++) { + if ((*it)->Name() == "root node") { + EXPECT_EQ((*it)->GetChildren().size(), 3u); + } + if ((*it)->Name() == "op1") { + EXPECT_EQ((*it)->GetChildren().size(), 0u); + EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); + } + } + for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) { + if ((*it)->Name() == "op3") { + EXPECT_EQ((*it)->GetChildren().size(), 0u); + EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); + } + } + tree.LogMe(&logger); +} + 
+TEST(SerializationLoggerTest, dump_case1) { + std::list host_events; + std::list runtime_events; + std::list device_events; + runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000, + 17000, 10, 10, 1, 0)); + runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 25000, + 35000, 10, 10, 2, 0)); + runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch3"), 33000, + 37000, 10, 11, 3, 0)); + runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemcpy1"), 18000, + 19000, 10, 10, 4, 0)); + runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemset1"), 38000, + 39000, 10, 11, 5, 0)); + device_events.push_back( + DeviceTraceEvent(std::string("kernel1"), TracerEventType::Kernel, 40000, + 55000, 0, 10, 10, 1, KernelEventInfo())); + device_events.push_back( + DeviceTraceEvent(std::string("kernel2"), TracerEventType::Kernel, 70000, + 95000, 0, 10, 10, 2, KernelEventInfo())); + device_events.push_back( + DeviceTraceEvent(std::string("kernel3"), TracerEventType::Kernel, 60000, + 65000, 0, 10, 11, 3, KernelEventInfo())); + device_events.push_back( + DeviceTraceEvent(std::string("memcpy1"), TracerEventType::Memcpy, 56000, + 59000, 0, 10, 10, 4, MemcpyEventInfo())); + device_events.push_back( + DeviceTraceEvent(std::string("memset1"), TracerEventType::Memset, 66000, + 69000, 0, 10, 11, 5, MemsetEventInfo())); + SerializationLogger logger("test_serialization_logger_case1.pb"); + NodeTrees tree(host_events, runtime_events, device_events); + std::map> nodes = + tree.Traverse(true); + EXPECT_EQ(nodes[10].size(), 1u); + EXPECT_EQ(nodes[11].size(), 1u); + std::vector thread1_nodes = nodes[10]; + std::vector thread2_nodes = nodes[11]; + for (auto it = thread1_nodes.begin(); it != thread1_nodes.end(); it++) { + if ((*it)->Name() == "root node") { + EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 3u); + } + } + for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) { + if ((*it)->Name() == "root node") { + EXPECT_EQ((*it)->GetChildren().size(), 0u); + EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); + } + } + tree.LogMe(&logger); +} + +TEST(DeserializationReaderTest, restore_case0) { + DeserializationReader reader("test_serialization_logger_case0.pb"); + std::unique_ptr tree = reader.Parse(); + std::map> nodes = + tree->Traverse(true); + EXPECT_EQ(nodes[10].size(), 4u); + EXPECT_EQ(nodes[11].size(), 2u); + std::vector thread1_nodes = nodes[10]; + std::vector thread2_nodes = nodes[11]; + for (auto it = thread1_nodes.begin(); it != thread1_nodes.end(); it++) { + if ((*it)->Name() == "root node") { + EXPECT_EQ((*it)->GetChildren().size(), 3u); + } + if ((*it)->Name() == "op1") { + EXPECT_EQ((*it)->GetChildren().size(), 0u); + EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); + } + } + for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) { + if ((*it)->Name() == "op3") { + EXPECT_EQ((*it)->GetChildren().size(), 0u); + EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); + } + } +} diff --git a/paddle/fluid/platform/profiler/utils.h b/paddle/fluid/platform/profiler/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..04014b972c3e3599beef0a60635fa122a153233f --- /dev/null +++ b/paddle/fluid/platform/profiler/utils.h @@ -0,0 +1,46 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include "paddle/fluid/platform/os_info.h" + +namespace paddle { +namespace platform { + +template +std::string string_format(const std::string& format, Args... args) { + int size_s = std::snprintf(nullptr, 0, format.c_str(), args...) + + 1; // Extra space for '\0' + PADDLE_ENFORCE_GE(size_s, 0, platform::errors::Fatal( + "Error during profiler data formatting.")); + auto size = static_cast(size_s); + auto buf = std::make_unique(size); + std::snprintf(buf.get(), size, format.c_str(), args...); + return std::string(buf.get(), size - 1); // exclude the '\0' +} + +static std::string GetStringFormatLocalTime() { + std::time_t rawtime; + std::tm* timeinfo; + char buf[100]; + std::time(&rawtime); + timeinfo = std::localtime(&rawtime); + std::strftime(buf, 100, "%F-%X", timeinfo); + return std::string(buf); +} + +static int64_t nsToUs(int64_t ns) { return ns / 1000; } + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 265f0fba8f376e5c4e748415469f1b4caab1d4c4..b1fe9f99b5d428d735a6e6734ccd5d7d6faa74e8 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -284,7 +284,7 @@ if(WITH_PYTHON) cc_library(paddle_pybind SHARED SRCS ${PYBIND_SRCS} - DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) + DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ${GLOB_DEV_LIB}) if(NOT APPLE AND NOT WIN32) target_link_libraries(paddle_pybind rt) diff --git a/paddle/fluid/pybind/bind_fleet_executor.cc b/paddle/fluid/pybind/bind_fleet_executor.cc index 0422a9cf8cc0ad984621fe09ee28bb7d624897d6..7bb7f03983eb9e8c88f46174a40664f1110682d1 100644 --- a/paddle/fluid/pybind/bind_fleet_executor.cc +++ b/paddle/fluid/pybind/bind_fleet_executor.cc @@ -151,14 +151,9 @@ void BindFleetExecutor(py::module* m) { .def_readwrite("current_endpoint", &DistModelConfig::current_endpoint) .def_readwrite("nranks", &DistModelConfig::nranks) .def_readwrite("local_rank", &DistModelConfig::local_rank) - .def_readwrite("mp_degree", &DistModelConfig::mp_degree) - .def_readwrite("pp_degree", &DistModelConfig::pp_degree) - .def_readwrite("mp_ring_id", &DistModelConfig::mp_ring_id) - .def_readwrite("enable_timer", &DistModelConfig::enable_timer) - .def_readwrite("pp_upstream_ring_id", - &DistModelConfig::pp_upstream_ring_id) - .def_readwrite("pp_downstream_ring_id", - &DistModelConfig::pp_downstream_ring_id); + .def_readwrite("ring_id_to_ranks", &DistModelConfig::ring_id_to_ranks_) + .def_readwrite("rank_to_ring_ids", &DistModelConfig::rank_to_ring_ids_) + .def_readwrite("enable_timer", &DistModelConfig::enable_timer); py::class_(*m, "DistModel") .def(py::init()) diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index f4e5df800dadaa9774062f704fb93b7a0ac746a9..6e882b5e0e4b07dd67a6b59747d2a89a6cc59fb7 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -45,7 +45,7 @@ PyTypeObject* p_tensor_type; extern PyTypeObject* g_vartype_pytype; extern PyTypeObject* g_framework_tensor_pytype; -PyObject* EagerTensorNew(PyTypeObject* type, PyObject* args, PyObject* 
kwargs) { +PyObject* TensorNew(PyTypeObject* type, PyObject* args, PyObject* kwargs) { PyObject* obj = type->tp_alloc(type, 0); if (obj) { auto v = reinterpret_cast(obj); @@ -56,14 +56,14 @@ PyObject* EagerTensorNew(PyTypeObject* type, PyObject* args, PyObject* kwargs) { } // TODO(jiabin): Overload this once we need more constructor in Python -void EmptyEagerTensorInitializer( - TensorObject* self, const std::string& name, - const paddle::platform::Place& place, bool persistable = false, - bool stop_gradient = true, framework::proto::VarType::Type dtype = - paddle::framework::proto::VarType::FP32, - const std::vector& dims = {}, - framework::proto::VarType::Type var_type = - paddle::framework::proto::VarType::LOD_TENSOR) { +void EmptyTensorInitializer(TensorObject* self, const std::string& name, + const paddle::platform::Place& place, + bool persistable = false, bool stop_gradient = true, + framework::proto::VarType::Type dtype = + paddle::framework::proto::VarType::FP32, + const std::vector& dims = {}, + framework::proto::VarType::Type var_type = + paddle::framework::proto::VarType::LOD_TENSOR) { auto ddims = paddle::framework::make_ddim(dims); PADDLE_ENFORCE_GE( paddle::framework::product(ddims), 0, @@ -98,46 +98,41 @@ void EmptyEagerTensorInitializer( } } -void InitEagerTensorWithNumpyValue(TensorObject* self, const py::object& array, - bool zero_copy = false) { +void InitTensorWithNumpyValue(TensorObject* self, const py::object& array, + bool zero_copy = false) { PADDLE_ENFORCE_EQ( self->tensor.defined(), true, paddle::platform::errors::Fatal( - "Calling InitEagerTensorWithNumpyValue of Eager Tensor without " - "EmptyEagerTensorInitializer is " + "Calling InitTensorWithNumpyValue of Eager Tensor without " + "EmptyTensorInitializer is " "forbidden. 
Please check your code and make sure you new a " "eager tensor before init it with NumPy.")); pten::DenseTensor* impl_ptr = static_cast(self->tensor.impl().get()); paddle::platform::Place place = impl_ptr->place(); - paddle::framework::LoDTensor temp_tensor = paddle::framework::LoDTensor(); if (platform::is_cpu_place(place)) { - SetTensorFromPyArray(&temp_tensor, array, place, - zero_copy); + SetTensorFromPyArray(impl_ptr, array, place, zero_copy); } else if (platform::is_xpu_place(place)) { - SetTensorFromPyArray(&temp_tensor, array, place, - zero_copy); + SetTensorFromPyArray(impl_ptr, array, place, zero_copy); } else if (platform::is_gpu_place(place)) { - SetTensorFromPyArray(&temp_tensor, array, place, + SetTensorFromPyArray(impl_ptr, array, place, zero_copy); } else if (platform::is_cuda_pinned_place(place)) { - SetTensorFromPyArray(&temp_tensor, array, place, + SetTensorFromPyArray(impl_ptr, array, place, zero_copy); } else if (platform::is_npu_place(place)) { - SetTensorFromPyArray(&temp_tensor, array, place, - zero_copy); + SetTensorFromPyArray(impl_ptr, array, place, zero_copy); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Place should be one of " "CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace")); } - *impl_ptr = temp_tensor; } -void InitEagerTensorWithEagerTensor(TensorObject* self, - const paddle::experimental::Tensor& src, - const paddle::platform::Place& place, - const std::string& name) { +void InitTensorWithTensor(TensorObject* self, + const paddle::experimental::Tensor& src, + const paddle::platform::Place& place, + const std::string& name) { self->tensor.set_name(name); if (place == src.inner_place()) { auto impl = std::static_pointer_cast(src.impl()); @@ -158,10 +153,10 @@ void InitEagerTensorWithEagerTensor(TensorObject* self, } } -void InitEagerTensorWithFrameworkTensor(TensorObject* self, - const framework::Tensor& src, - const paddle::platform::Place& place, - const std::string& name) { +void InitTensorWithFrameworkTensor(TensorObject* self, + const framework::Tensor& src, + const paddle::platform::Place& place, + const std::string& name) { self->tensor.set_name(name); if (place == src.place()) { self->tensor.set_impl(std::make_shared(src)); @@ -271,14 +266,14 @@ std::string ParseName(std::unordered_map kws_map, return act_name; } -// initialize EagerTensor by PyArray(first argument is PyArray, +// initialize Tensor by PyArray(first argument is PyArray, // mix args and kwargs) automatically. -void AutoInitEagerTensorByPyArray( - TensorObject* py_tensor_ptr, - std::unordered_map kws_map, PyObject* args, - bool flag_kwargs, Py_ssize_t args_num) { - // The first argument of the EagerTensor constructor is PyArray, - // there are 6 arguments to construct the new EagerTensor, +void AutoInitTensorByPyArray(TensorObject* py_tensor_ptr, + std::unordered_map kws_map, + PyObject* args, bool flag_kwargs, + Py_ssize_t args_num) { + // The first argument of the Tensor constructor is PyArray, + // there are 6 arguments to construct the new Tensor, // kw_order_map's key is every arguments of the constructor, // kw_order_map's value is the position of the arguments respectively. 
// If u want to update this constructor with new arguments, @@ -306,20 +301,21 @@ void AutoInitEagerTensorByPyArray( stop_gradient = ParseBooleanArgs("stop_gradient", kws_map, kw_order_map, args, flag_kwargs, args_num); - EmptyEagerTensorInitializer(py_tensor_ptr, act_name, place, persistable, - stop_gradient); - InitEagerTensorWithNumpyValue(py_tensor_ptr, numpy_value, zero_copy); + EmptyTensorInitializer(py_tensor_ptr, act_name, place, persistable, + stop_gradient); + InitTensorWithNumpyValue(py_tensor_ptr, numpy_value, zero_copy); } -// initialize EagerTensor by EagerTensor or framework::Tensor (mix args and +// initialize Tensor by Tensor or framework::Tensor (mix args and // kwargs) automatically. -void AutoInitEagerTensorByTensor( - TensorObject* py_tensor_ptr, - std::unordered_map kws_map, PyObject* args, - bool flag_kwargs, Py_ssize_t args_num, bool init_by_egr_tensor = true) { - // The first argument of the EagerTensor constructor is EagerTensor or +void AutoInitTensorByTensor(TensorObject* py_tensor_ptr, + std::unordered_map kws_map, + PyObject* args, bool flag_kwargs, + Py_ssize_t args_num, + bool init_by_egr_tensor = true) { + // The first argument of the Tensor constructor is Tensor or // framework Tensor, - // there are 3 arguments to construct the new EagerTensor, + // there are 3 arguments to construct the new Tensor, // kw_order_map's key is every arguments of the constructor, // kw_order_map's value is the position of the arguments respectively. // If u want to update this constructor with new arguments, @@ -345,14 +341,14 @@ void AutoInitEagerTensorByTensor( src_tensor = CastPyArg2Tensor(kws_map["value"], 0); } else { PADDLE_THROW(platform::errors::InvalidArgument( - "The first expected kwargs is {value: EagerTensor}, " - "but could not parse the first argument {value: EagerTensor} " + "The first expected kwargs is {value: Tensor}, " + "but could not parse the first argument {value: Tensor} " "successfully. " "Please check your input first and make sure you are on the right " "way.")); } } - InitEagerTensorWithEagerTensor(py_tensor_ptr, src_tensor, place, act_name); + InitTensorWithTensor(py_tensor_ptr, src_tensor, place, act_name); } else { // init by framework tensor framework::Tensor src_tensor; @@ -372,8 +368,7 @@ void AutoInitEagerTensorByTensor( "way.")); } } - InitEagerTensorWithFrameworkTensor(py_tensor_ptr, src_tensor, place, - act_name); + InitTensorWithFrameworkTensor(py_tensor_ptr, src_tensor, place, act_name); } } @@ -402,12 +397,12 @@ void AutoInitEagerTensorByTensor( * ** value: ndarray) * 5. * def __init__ ( - * ** tensor: EagerTensor) + * ** tensor: Tensor) * 6. (multi-place) * (should have at least one parameter, one parameter equals to case 5, zero * parameter equals to case 1.) * def __init__ ( - * ** tensor: EagerTensor, + * ** tensor: Tensor, * ** place: paddle::platform::Place, * ** name: std::string) * 7. 
(multi-place) (should have at least one parameter, one parameter similar @@ -417,7 +412,7 @@ void AutoInitEagerTensorByTensor( * ** place: paddle::platform::Place, * ** name: std::string) * **/ -int EagerTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { +int TensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { // set a flag to record use kwargs or not bool flag_kwargs = false; if (kwargs) flag_kwargs = true; @@ -427,7 +422,7 @@ int EagerTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { PyObject* kw_persistable = NULL; PyObject* kw_stop_gradient = NULL; - PyObject* kw_value = NULL; // receive PyArray or EagerTensor + PyObject* kw_value = NULL; // receive PyArray or Tensor PyObject* kw_place = NULL; PyObject* kw_name = NULL; PyObject* kw_dims = NULL; @@ -490,7 +485,7 @@ int EagerTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { if (!flag_kwargs) { // case 1 VLOG(6) << "Calling case1's initializer."; - EmptyEagerTensorInitializer( + EmptyTensorInitializer( py_tensor_ptr, egr::Controller::Instance().GenerateUniqueName("generated_tensor"), egr::Controller::Instance().GetExpectedPlace()); @@ -499,28 +494,28 @@ int EagerTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { if (kw_value != NULL) { if (pybind11::detail::npy_api::get().PyArray_Check_(kw_value)) { VLOG(6) << "Calling case3's or case4's initializer"; - AutoInitEagerTensorByPyArray(py_tensor_ptr, kws_map, args, - flag_kwargs, args_num); + AutoInitTensorByPyArray(py_tensor_ptr, kws_map, args, flag_kwargs, + args_num); return 0; } else if (PyObject_IsInstance( kw_value, reinterpret_cast(p_tensor_type))) { VLOG(6) << "Calling case5's or case6's initializer"; - AutoInitEagerTensorByTensor(py_tensor_ptr, kws_map, args, flag_kwargs, - args_num); + AutoInitTensorByTensor(py_tensor_ptr, kws_map, args, flag_kwargs, + args_num); return 0; } else if (PyObject_IsInstance(kw_value, reinterpret_cast( g_framework_tensor_pytype))) { VLOG(6) << "Calling case7's initializer."; - AutoInitEagerTensorByTensor( - py_tensor_ptr, kws_map, args, flag_kwargs, args_num, - /* false means not init by egr tensor*/ false); + AutoInitTensorByTensor(py_tensor_ptr, kws_map, args, flag_kwargs, + args_num, + /* false means not init by egr tensor*/ false); return 0; } else { PADDLE_THROW(platform::errors::InvalidArgument( "Could not parse the first keyword argument successfully, " "the first keyword argument is value, but it should be PyArray " - "or EagerTensor or framework::Tensor. " + "or Tensor or framework::Tensor. " "Please check your input first and make sure you are on the " "right way.")); } @@ -573,18 +568,18 @@ int EagerTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { CastPyArg2ProtoType(kw_type, 0); bool persistable = CastPyArg2AttrBoolean(kw_persistable, 0); - EmptyEagerTensorInitializer( - py_tensor_ptr, act_name, - egr::Controller::Instance().GetExpectedPlace(), persistable, - /* stop_gradient */ true, dtype, dims, var_type); + EmptyTensorInitializer(py_tensor_ptr, act_name, + egr::Controller::Instance().GetExpectedPlace(), + persistable, + /* stop_gradient */ true, dtype, dims, var_type); return 0; } else { PADDLE_THROW(platform::errors::InvalidArgument( - "We not only support construct EagerTensor from numpy value " - "or tensor(EagerTensor or framework::Tensor) " + "We not only support construct Tensor from numpy value " + "or tensor(Tensor or framework::Tensor) " "with python kwargs by this initializer, " - "but also even support dtype to init a empty EagerTensor. 
" + "but also even support dtype to init a empty Tensor. " "Please check your input first and make sure you call the existed " "constructor.")); } @@ -595,28 +590,28 @@ int EagerTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { PyObject* arg0_ptr = PyTuple_GET_ITEM(args, 0); if (pybind11::detail::npy_api::get().PyArray_Check_(arg0_ptr)) { VLOG(6) << "Calling case3's or case4's initializer."; - AutoInitEagerTensorByPyArray(py_tensor_ptr, kws_map, args, flag_kwargs, - args_num); + AutoInitTensorByPyArray(py_tensor_ptr, kws_map, args, flag_kwargs, + args_num); return 0; } else if (PyObject_IsInstance( arg0_ptr, reinterpret_cast(p_tensor_type))) { VLOG(6) << "Calling case5's or case6's initializer."; - AutoInitEagerTensorByTensor(py_tensor_ptr, kws_map, args, flag_kwargs, - args_num); + AutoInitTensorByTensor(py_tensor_ptr, kws_map, args, flag_kwargs, + args_num); return 0; } else if (PyObject_IsInstance(arg0_ptr, reinterpret_cast( g_framework_tensor_pytype))) { VLOG(6) << "Calling case7's initializer."; - AutoInitEagerTensorByTensor( - py_tensor_ptr, kws_map, args, flag_kwargs, args_num, - /* false means not init by egr tensor*/ false); + AutoInitTensorByTensor(py_tensor_ptr, kws_map, args, flag_kwargs, + args_num, + /* false means not init by egr tensor*/ false); return 0; } else { PADDLE_THROW(platform::errors::InvalidArgument( - "We support construct EagerTensor from numpy value " - "or tensor(EagerTensor or framework::Tensor) " + "We support construct Tensor from numpy value " + "or tensor(Tensor or framework::Tensor) " "with python args and kwargs by this initializer, " - "but the first argument should be PyArray or EagerTensor or " + "but the first argument should be PyArray or Tensor or " "framework::Tensor. " "Please check your input first and make sure you call the existed " "constructor.")); @@ -626,8 +621,8 @@ int EagerTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { PyObject* arg0_ptr = PyTuple_GET_ITEM(args, 0); if (pybind11::detail::npy_api::get().PyArray_Check_(arg0_ptr)) { VLOG(6) << "Calling case3's or case4's initializer."; - AutoInitEagerTensorByPyArray(py_tensor_ptr, kws_map, args, flag_kwargs, - args_num); + AutoInitTensorByPyArray(py_tensor_ptr, kws_map, args, flag_kwargs, + args_num); return 0; } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -658,15 +653,14 @@ int EagerTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { paddle::framework::proto::VarType::Type var_type = CastPyArg2ProtoType(PyTuple_GET_ITEM(args, 3), 3); bool persistable = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 4), 4); - EmptyEagerTensorInitializer( - py_tensor_ptr, act_name, - egr::Controller::Instance().GetExpectedPlace(), persistable, true, - dtype, dims, var_type); + EmptyTensorInitializer(py_tensor_ptr, act_name, + egr::Controller::Instance().GetExpectedPlace(), + persistable, true, dtype, dims, var_type); return 0; } else if (pybind11::detail::npy_api::get().PyArray_Check_(arg0_ptr)) { VLOG(6) << "Calling case3's initializer."; - AutoInitEagerTensorByPyArray(py_tensor_ptr, kws_map, args, flag_kwargs, - args_num); + AutoInitTensorByPyArray(py_tensor_ptr, kws_map, args, flag_kwargs, + args_num); return 0; } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -680,8 +674,8 @@ int EagerTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { PyObject* arg0_ptr = PyTuple_GET_ITEM(args, 0); if (pybind11::detail::npy_api::get().PyArray_Check_(arg0_ptr)) { VLOG(6) << "Calling case3's or case4's initializer"; - 
AutoInitEagerTensorByPyArray(py_tensor_ptr, kws_map, args, flag_kwargs, - args_num); + AutoInitTensorByPyArray(py_tensor_ptr, kws_map, args, flag_kwargs, + args_num); return 0; } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -696,8 +690,8 @@ int EagerTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { if (!flag_kwargs) { // case 3 VLOG(6) << "Calling case3's initializer."; - AutoInitEagerTensorByPyArray(py_tensor_ptr, kws_map, args, flag_kwargs, - args_num); + AutoInitTensorByPyArray(py_tensor_ptr, kws_map, args, flag_kwargs, + args_num); return 0; } else { // six position args, remainting arguments are kwargs, but this // is not a right way @@ -716,7 +710,7 @@ int EagerTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { return 1; } -static void EagerTensorDealloc(TensorObject* self) { +static void TensorDealloc(TensorObject* self) { self->tensor.~Tensor(); Py_TYPE(self)->tp_free(reinterpret_cast(self)); } @@ -735,19 +729,19 @@ void BindEager(pybind11::module* module) { auto& internals = pybind11::detail::get_internals(); auto heap_type = reinterpret_cast( internals.default_metaclass->tp_alloc(internals.default_metaclass, 0)); - heap_type->ht_name = ToPyObject("EagerTensor"); - heap_type->ht_qualname = ToPyObject("EagerTensor"); + heap_type->ht_name = ToPyObject("Tensor"); + heap_type->ht_qualname = ToPyObject("Tensor"); auto type = &heap_type->ht_type; - type->tp_name = "EagerTensor"; + type->tp_name = "Tensor"; type->tp_basicsize = sizeof(TensorObject); - type->tp_dealloc = (destructor)EagerTensorDealloc; + type->tp_dealloc = (destructor)TensorDealloc; type->tp_as_number = &number_methods; type->tp_as_sequence = &sequence_methods; type->tp_as_mapping = &mapping_methods; type->tp_methods = variable_methods; type->tp_getset = variable_properties; - type->tp_init = EagerTensorInit; - type->tp_new = EagerTensorNew; + type->tp_init = TensorInit; + type->tp_new = TensorNew; Py_INCREF(internals.instance_base); type->tp_base = reinterpret_cast(internals.instance_base); type->tp_flags |= @@ -764,8 +758,8 @@ void BindEager(pybind11::module* module) { } Py_INCREF(type); - if (PyModule_AddObject(m.ptr(), "EagerTensor", - reinterpret_cast(type)) < 0) { + if (PyModule_AddObject(m.ptr(), "Tensor", reinterpret_cast(type)) < + 0) { Py_DECREF(type); Py_DECREF(m.ptr()); PADDLE_THROW(platform::errors::Fatal( diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index a32edae2ad23cc215a0e91756fd6b54b145debda..c3f0aa2ec9c49d144f45d73d275c964f341a384b 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -145,9 +145,8 @@ static PyObject* eager_api_tensor_copy(PyObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } -static PyObject* eager_api_read_next_eager_tensor_list(PyObject* self, - PyObject* args, - PyObject* kwargs) { +static PyObject* eager_api_read_next_tensor_list(PyObject* self, PyObject* args, + PyObject* kwargs) { EAGER_TRY auto tensor_base_list = CastPyArg2VectorOfTensorBase(PyTuple_GET_ITEM(args, 0), 0); @@ -182,8 +181,8 @@ PyMethodDef variable_functions[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"tensor_copy", (PyCFunction)(void (*)(void))eager_api_tensor_copy, METH_VARARGS | METH_KEYWORDS, NULL}, - {"read_next_eager_tensor_list", - (PyCFunction)(void (*)(void))eager_api_read_next_eager_tensor_list, + {"read_next_tensor_list", + (PyCFunction)(void (*)(void))eager_api_read_next_tensor_list, METH_VARARGS | METH_KEYWORDS, NULL}, {NULL, NULL, 0, NULL}}; diff --git 
a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 68653790366084ed8cce1cb007cd975fd0a4bc59..b8f462dfd51d1234b86a6b294628bbefd8a5c021 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -35,15 +35,15 @@ limitations under the License. */ namespace paddle { namespace pybind { -extern void InitEagerTensorWithNumpyValue(TensorObject* self, - const pybind11::object& array, - bool zero_copy); +extern void InitTensorWithNumpyValue(TensorObject* self, + const pybind11::object& array, + bool zero_copy); extern PyTypeObject* p_tensor_type; -static PyObject* eager_tensor_method_numpy(TensorObject* self, PyObject* args, - PyObject* kwargs) { - EAGER_SYNC_TRY +static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY PADDLE_ENFORCE_EQ( self->tensor.initialized(), true, platform::errors::InvalidArgument( @@ -99,18 +99,17 @@ static PyObject* eager_tensor_method_numpy(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } -static PyObject* eager_tensor_method__is_initialized(TensorObject* self, - PyObject* args, - PyObject* kwargs) { - EAGER_SYNC_TRY +static PyObject* tensor_method__is_initialized(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY return ToPyObject(self->tensor.initialized()); EAGER_CATCH_AND_THROW_RETURN_NULL } -static PyObject* eager_tensor_method__copy_to(TensorObject* self, - PyObject* args, - PyObject* kwargs) { - EAGER_SYNC_TRY +static PyObject* tensor_method__copy_to(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 0), 0); auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 1), 1); auto cp_tensor = @@ -123,10 +122,10 @@ static PyObject* eager_tensor_method__copy_to(TensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NULL } -static PyObject* eager_tensor_method_reconstruct_from_(TensorObject* self, - PyObject* args, - PyObject* kwargs) { - EAGER_SYNC_TRY +static PyObject* tensor_method_reconstruct_from_(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY paddle::experimental::Tensor src_tensor = CastPyArg2Tensor(PyTuple_GET_ITEM(args, 0), 0); std::string orig_name = self->tensor.name(); @@ -144,9 +143,9 @@ static PyObject* eager_tensor_method_reconstruct_from_(TensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NULL } -static PyObject* eager_tensor_method_copy_(TensorObject* self, PyObject* args, - PyObject* kwargs) { - EAGER_SYNC_TRY +static PyObject* tensor_method_copy_(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY paddle::experimental::Tensor src_tensor = CastPyArg2Tensor(PyTuple_GET_ITEM(args, 0), 0); bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 1), 1); @@ -170,8 +169,8 @@ static PyObject* eager_tensor_method_copy_(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } -static PyObject* eager_tensor_retain_grads(TensorObject* self, PyObject* args, - PyObject* kwargs) { +static PyObject* tensor_retain_grads(TensorObject* self, PyObject* args, + PyObject* kwargs) { EAGER_TRY if (egr::Controller::Instance().HasGrad()) { auto meta = egr::EagerUtils::autograd_meta(&(self->tensor)); @@ -187,10 +186,9 @@ static PyObject* eager_tensor_retain_grads(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } -static PyObject* eager_tensor__clear_gradient(TensorObject* self, - PyObject* args, - PyObject* kwargs) { - EAGER_SYNC_TRY +static PyObject* 
tensor__clear_gradient(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY VLOG(4) << "ClearGradient " << self->tensor.name(); paddle::experimental::Tensor* grad; @@ -223,8 +221,8 @@ static PyObject* eager_tensor__clear_gradient(TensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NULL } -static PyObject* eager_tensor__zero_grads(TensorObject* self, PyObject* args, - PyObject* kwargs) { +static PyObject* tensor__zero_grads(TensorObject* self, PyObject* args, + PyObject* kwargs) { EAGER_TRY VLOG(4) << "ZeroGrads " << self->tensor.name(); @@ -257,10 +255,9 @@ static PyObject* eager_tensor__zero_grads(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } -static PyObject* eager_tensor__share_buffer_to(TensorObject* self, - PyObject* args, - PyObject* kwargs) { - EAGER_SYNC_TRY +static PyObject* tensor__share_buffer_to(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY paddle::experimental::Tensor* dst_ptr = &(reinterpret_cast(PyTuple_GET_ITEM(args, 0))->tensor); PADDLE_ENFORCE_EQ(self->tensor.initialized(), true, @@ -279,10 +276,10 @@ static PyObject* eager_tensor__share_buffer_to(TensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NULL } -static PyObject* eager_tensor__is_shared_buffer_with(TensorObject* self, - PyObject* args, - PyObject* kwargs) { - EAGER_SYNC_TRY +static PyObject* tensor__is_shared_buffer_with(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY paddle::experimental::Tensor* dst_ptr = &(reinterpret_cast(PyTuple_GET_ITEM(args, 0))->tensor); PADDLE_ENFORCE_EQ(self->tensor.initialized(), true, @@ -303,10 +300,10 @@ static PyObject* eager_tensor__is_shared_buffer_with(TensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NULL } -static PyObject* eager_tensor__share_underline_tensor_to(TensorObject* self, - PyObject* args, - PyObject* kwargs) { - EAGER_SYNC_TRY +static PyObject* tensor__share_underline_tensor_to(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY paddle::experimental::Tensor* src_ptr = &(reinterpret_cast(PyTuple_GET_ITEM(args, 0))->tensor); PADDLE_ENFORCE_EQ(self->tensor.initialized(), true, @@ -320,9 +317,10 @@ static PyObject* eager_tensor__share_underline_tensor_to(TensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NULL } -static PyObject* eager_tensor__is_shared_underline_tensor_with( - TensorObject* self, PyObject* args, PyObject* kwargs) { - EAGER_SYNC_TRY +static PyObject* tensor__is_shared_underline_tensor_with(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY paddle::experimental::Tensor src_tensor = CastPyArg2Tensor(PyTuple_GET_ITEM(args, 0), 0); PADDLE_ENFORCE_EQ(src_tensor.initialized(), true, @@ -339,9 +337,9 @@ static PyObject* eager_tensor__is_shared_underline_tensor_with( EAGER_CATCH_AND_THROW_RETURN_NULL } -static PyObject* eager_tensor_method_detach(TensorObject* self, PyObject* args, - PyObject* kwargs) { - EAGER_SYNC_TRY +static PyObject* tensor_method_detach(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY PADDLE_ENFORCE_EQ( self->tensor.initialized(), true, platform::errors::InvalidArgument("Tensor %s has not been initialized!", @@ -365,10 +363,10 @@ static PyObject* eager_tensor_method_detach(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } -static PyObject* eager_tensor_method_get_underline_tensor(TensorObject* self, - PyObject* args, - PyObject* kwargs) { - EAGER_SYNC_TRY +static PyObject* tensor_method_get_underline_tensor(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + 
EAGER_TRY if (self->tensor.is_dense_tensor()) { auto* tensor = static_cast(self->tensor.impl().get()); @@ -382,57 +380,54 @@ static PyObject* eager_tensor_method_get_underline_tensor(TensorObject* self, } // NOTE(wuweilong): Set value and not change self's original place -static PyObject* eager_tensor_method_set_value(TensorObject* self, - PyObject* args, - PyObject* kwargs) { +static PyObject* tensor_method_set_value(TensorObject* self, PyObject* args, + PyObject* kwargs) { EAGER_TRY VLOG(4) << "Value " << self->tensor.name(); pybind11::object numpy_value = pybind11::object(pybind11::handle(PyTuple_GET_ITEM(args, 0)), true); - InitEagerTensorWithNumpyValue(self, numpy_value, false); + InitTensorWithNumpyValue(self, numpy_value, false); Py_INCREF(Py_None); return Py_None; EAGER_CATCH_AND_THROW_RETURN_NULL } PyMethodDef variable_methods[] = { - {"numpy", (PyCFunction)(void (*)(void))eager_tensor_method_numpy, + {"numpy", (PyCFunction)(void (*)(void))tensor_method_numpy, METH_VARARGS | METH_KEYWORDS, NULL}, {"_is_initialized", - (PyCFunction)(void (*)(void))eager_tensor_method__is_initialized, + (PyCFunction)(void (*)(void))tensor_method__is_initialized, METH_VARARGS | METH_KEYWORDS, NULL}, - {"_copy_to", (PyCFunction)(void (*)(void))eager_tensor_method__copy_to, + {"_copy_to", (PyCFunction)(void (*)(void))tensor_method__copy_to, METH_VARARGS | METH_KEYWORDS, NULL}, - {"copy_", (PyCFunction)(void (*)(void))eager_tensor_method_copy_, + {"copy_", (PyCFunction)(void (*)(void))tensor_method_copy_, METH_VARARGS | METH_KEYWORDS, NULL}, {"reconstruct_from_", - (PyCFunction)(void (*)(void))eager_tensor_method_reconstruct_from_, + (PyCFunction)(void (*)(void))tensor_method_reconstruct_from_, METH_VARARGS | METH_KEYWORDS, NULL}, - {"retain_grads", (PyCFunction)(void (*)(void))eager_tensor_retain_grads, + {"retain_grads", (PyCFunction)(void (*)(void))tensor_retain_grads, METH_VARARGS | METH_KEYWORDS, NULL}, - {"_clear_gradient", - (PyCFunction)(void (*)(void))eager_tensor__clear_gradient, + {"_clear_gradient", (PyCFunction)(void (*)(void))tensor__clear_gradient, METH_VARARGS | METH_KEYWORDS, NULL}, - {"_zero_grads", (PyCFunction)(void (*)(void))eager_tensor__zero_grads, + {"_zero_grads", (PyCFunction)(void (*)(void))tensor__zero_grads, METH_VARARGS | METH_KEYWORDS, NULL}, - {"_share_buffer_to", - (PyCFunction)(void (*)(void))eager_tensor__share_buffer_to, + {"_share_buffer_to", (PyCFunction)(void (*)(void))tensor__share_buffer_to, METH_VARARGS | METH_KEYWORDS, NULL}, {"_is_shared_buffer_with", - (PyCFunction)(void (*)(void))eager_tensor__is_shared_buffer_with, + (PyCFunction)(void (*)(void))tensor__is_shared_buffer_with, METH_VARARGS | METH_KEYWORDS, NULL}, {"_share_underline_tensor_to", - (PyCFunction)(void (*)(void))eager_tensor__share_underline_tensor_to, + (PyCFunction)(void (*)(void))tensor__share_underline_tensor_to, METH_VARARGS | METH_KEYWORDS, NULL}, {"_is_shared_underline_tensor_with", - (PyCFunction)(void (*)(void))eager_tensor__is_shared_underline_tensor_with, + (PyCFunction)(void (*)(void))tensor__is_shared_underline_tensor_with, METH_VARARGS | METH_KEYWORDS, NULL}, - {"detach", (PyCFunction)(void (*)(void))eager_tensor_method_detach, + {"detach", (PyCFunction)(void (*)(void))tensor_method_detach, METH_VARARGS | METH_KEYWORDS, NULL}, {"get_tensor", - (PyCFunction)(void (*)(void))eager_tensor_method_get_underline_tensor, + (PyCFunction)(void (*)(void))tensor_method_get_underline_tensor, METH_VARARGS | METH_KEYWORDS, NULL}, - {"_set_value", (PyCFunction)(void 
(*)(void))eager_tensor_method_set_value, + {"_set_value", (PyCFunction)(void (*)(void))tensor_method_set_value, METH_VARARGS | METH_KEYWORDS, NULL}, {NULL, NULL, 0, NULL}}; diff --git a/paddle/fluid/pybind/eager_op_function_generator.cc b/paddle/fluid/pybind/eager_op_function_generator.cc index cd3617287d326fd45eb1386096d72d750a021e8f..8fea463baae5276d0c80a24057466b72ff32731b 100644 --- a/paddle/fluid/pybind/eager_op_function_generator.cc +++ b/paddle/fluid/pybind/eager_op_function_generator.cc @@ -79,10 +79,10 @@ const char* CAST_VAR_LIST_TEMPLATE = R"( auto %s = GetTensorListFromArgs("%s", "%s", args, %d, %s);)"; const char* CAST_VAR_PTR_TEMPLATE = R"( - auto %s = GetEagerTensorPtrFromArgs("%s", "%s", args, %d, %s);)"; + auto %s = GetTensorPtrFromArgs("%s", "%s", args, %d, %s);)"; const char* CAST_VAR_PTR_LIST_TEMPLATE = R"( - auto %s = GetEagerTensorPtrListFromArgs("%s", "%s", args, %d, %s);)"; + auto %s = GetTensorPtrListFromArgs("%s", "%s", args, %d, %s);)"; const char* CAST_SIZE_T_TEMPLATE = R"( auto %s = GetUnsignedLongFromArgs("%s", "%s", args, %d, %s);)"; diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index 942df3f69dac04fc91c524c0a3bb85bdad552dd0..fb1dc4d26b5ff8dbc88754984ab643e0b194b941 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -35,14 +35,14 @@ namespace pybind { extern PyTypeObject* p_tensor_type; -PyObject* eager_tensor_properties_get_name(TensorObject* self, void* closure) { - EAGER_SYNC_TRY +PyObject* tensor_properties_get_name(TensorObject* self, void* closure) { + EAGER_TRY return ToPyObject(self->tensor.name()); EAGER_CATCH_AND_THROW_RETURN_NULL } -PyObject* eager_tensor_properties_get_type(TensorObject* self, void* closure) { - EAGER_SYNC_TRY +PyObject* tensor_properties_get_type(TensorObject* self, void* closure) { + EAGER_TRY if (self->tensor.is_dense_tensor()) { return ToPyObject(paddle::framework::proto::VarType::LOD_TENSOR); } else { @@ -52,24 +52,24 @@ PyObject* eager_tensor_properties_get_type(TensorObject* self, void* closure) { EAGER_CATCH_AND_THROW_RETURN_NULL } -int eager_tensor_properties_set_name(TensorObject* self, PyObject* value, - void* closure) { - EAGER_SYNC_TRY +int tensor_properties_set_name(TensorObject* self, PyObject* value, + void* closure) { + EAGER_TRY self->tensor.set_name(CastPyArg2AttrString(value, 0)); return 0; EAGER_CATCH_AND_THROW_RETURN_ZERO } -PyObject* eager_tensor_properties_get_stop_gradient(TensorObject* self, - void* closure) { - EAGER_SYNC_TRY +PyObject* tensor_properties_get_stop_gradient(TensorObject* self, + void* closure) { + EAGER_TRY auto meta = egr::EagerUtils::autograd_meta(&self->tensor); return ToPyObject(meta->StopGradient()); EAGER_CATCH_AND_THROW_RETURN_NULL } -PyObject* eager_tensor_properties_get_grad(TensorObject* self, void* closure) { - EAGER_SYNC_TRY +PyObject* tensor_properties_get_grad(TensorObject* self, void* closure) { + EAGER_TRY if (egr::egr_utils_api::IsLeafTensor(self->tensor)) { std::shared_ptr grad_node = egr::EagerUtils::grad_node(self->tensor); @@ -94,9 +94,9 @@ PyObject* eager_tensor_properties_get_grad(TensorObject* self, void* closure) { EAGER_CATCH_AND_THROW_RETURN_NULL } -int eager_tensor_properties_set_grad(TensorObject* self, PyObject* value, - void* closure) { - EAGER_SYNC_TRY +int tensor_properties_set_grad(TensorObject* self, PyObject* value, + void* closure) { + EAGER_TRY auto src = CastPyArg2Tensor(value, 0); PADDLE_ENFORCE( egr::egr_utils_api::IsLeafTensor(self->tensor), @@ 
-115,34 +115,33 @@ int eager_tensor_properties_set_grad(TensorObject* self, PyObject* value, EAGER_CATCH_AND_THROW_RETURN_ZERO } -int eager_tensor_properties_set_stop_gradient(TensorObject* self, - PyObject* value, void* closure) { - EAGER_SYNC_TRY +int tensor_properties_set_stop_gradient(TensorObject* self, PyObject* value, + void* closure) { + EAGER_TRY auto meta = egr::EagerUtils::autograd_meta(&self->tensor); meta->SetStopGradient(CastPyArg2AttrBoolean(value, 0)); return 0; EAGER_CATCH_AND_THROW_RETURN_ZERO } -PyObject* eager_tensor_properties_get_persistable(TensorObject* self, - void* closure) { - EAGER_SYNC_TRY +PyObject* tensor_properties_get_persistable(TensorObject* self, void* closure) { + EAGER_TRY auto meta = egr::EagerUtils::autograd_meta(&self->tensor); return ToPyObject(meta->Persistable()); EAGER_CATCH_AND_THROW_RETURN_NULL } -int eager_tensor_properties_set_persistable(TensorObject* self, PyObject* value, - void* closure) { - EAGER_SYNC_TRY +int tensor_properties_set_persistable(TensorObject* self, PyObject* value, + void* closure) { + EAGER_TRY auto meta = egr::EagerUtils::autograd_meta(&self->tensor); meta->SetPersistable(CastPyArg2AttrBoolean(value, 0)); return 0; EAGER_CATCH_AND_THROW_RETURN_ZERO } -PyObject* eager_tensor_properties_get_shape(TensorObject* self, void* closure) { - EAGER_SYNC_TRY +PyObject* tensor_properties_get_shape(TensorObject* self, void* closure) { + EAGER_TRY auto ddim = self->tensor.shape(); std::vector value; size_t rank = static_cast(ddim.size()); @@ -155,50 +154,45 @@ PyObject* eager_tensor_properties_get_shape(TensorObject* self, void* closure) { EAGER_CATCH_AND_THROW_RETURN_NULL } -PyObject* eager_tensor_properties_get_place(TensorObject* self, void* closure) { - EAGER_SYNC_TRY +PyObject* tensor_properties_get_place(TensorObject* self, void* closure) { + EAGER_TRY return ToPyObject(self->tensor.inner_place()); EAGER_CATCH_AND_THROW_RETURN_NULL } -PyObject* eager_tensor_properties_get_place_str(TensorObject* self, - void* closure) { - EAGER_SYNC_TRY +PyObject* tensor_properties_get_place_str(TensorObject* self, void* closure) { + EAGER_TRY std::stringstream ostr; ostr << self->tensor.inner_place(); return ToPyObject(ostr.str()); EAGER_CATCH_AND_THROW_RETURN_NULL } -PyObject* eager_tensor_properties_get_dtype(TensorObject* self, void* closure) { - EAGER_SYNC_TRY +PyObject* tensor_properties_get_dtype(TensorObject* self, void* closure) { + EAGER_TRY return ToPyObject( paddle::framework::TransToProtoVarType(self->tensor.type())); EAGER_CATCH_AND_THROW_RETURN_NULL } struct PyGetSetDef variable_properties[] = { - {"grad", (getter)eager_tensor_properties_get_grad, - (setter)eager_tensor_properties_set_grad, nullptr, nullptr}, - {"name", (getter)eager_tensor_properties_get_name, - (setter)eager_tensor_properties_set_name, nullptr, nullptr}, - {"stop_gradient", (getter)eager_tensor_properties_get_stop_gradient, - (setter)eager_tensor_properties_set_stop_gradient, nullptr, nullptr}, - {"persistable", (getter)eager_tensor_properties_get_persistable, - (setter)eager_tensor_properties_set_persistable, nullptr, nullptr}, - {"shape", (getter)eager_tensor_properties_get_shape, nullptr, nullptr, - nullptr}, - // {"is_leaf", (getter)eager_tensor_properties_get_is_leaf, nullptr, + {"grad", (getter)tensor_properties_get_grad, + (setter)tensor_properties_set_grad, nullptr, nullptr}, + {"name", (getter)tensor_properties_get_name, + (setter)tensor_properties_set_name, nullptr, nullptr}, + {"stop_gradient", (getter)tensor_properties_get_stop_gradient, + 
(setter)tensor_properties_set_stop_gradient, nullptr, nullptr}, + {"persistable", (getter)tensor_properties_get_persistable, + (setter)tensor_properties_set_persistable, nullptr, nullptr}, + {"shape", (getter)tensor_properties_get_shape, nullptr, nullptr, nullptr}, + // {"is_leaf", (getter)tensor_properties_get_is_leaf, nullptr, // nullptr, // nullptr}, - {"place", (getter)eager_tensor_properties_get_place, nullptr, nullptr, - nullptr}, - {"_place_str", (getter)eager_tensor_properties_get_place_str, nullptr, - nullptr, nullptr}, - {"dtype", (getter)eager_tensor_properties_get_dtype, nullptr, nullptr, - nullptr}, - {"type", (getter)eager_tensor_properties_get_type, nullptr, nullptr, + {"place", (getter)tensor_properties_get_place, nullptr, nullptr, nullptr}, + {"_place_str", (getter)tensor_properties_get_place_str, nullptr, nullptr, nullptr}, + {"dtype", (getter)tensor_properties_get_dtype, nullptr, nullptr, nullptr}, + {"type", (getter)tensor_properties_get_type, nullptr, nullptr, nullptr}, {nullptr, nullptr, nullptr, nullptr, nullptr}}; } // namespace pybind diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 85a39710564bc8c1b56a76035f7b2c56628ecf95..dd882ab6d970aa0572e69706ee3e90b539bf7951 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -179,7 +179,7 @@ paddle::experimental::Tensor CastPyArg2Tensor(PyObject* obj, ssize_t arg_pos) { } else { PADDLE_THROW(platform::errors::InvalidArgument( "argument (position %d) must be " - "EagerTensor, but got %s", + "EagerVariable, but got %s", arg_pos + 1, reinterpret_cast(obj->ob_type)->tp_name)); } } @@ -309,7 +309,7 @@ framework::Tensor CastPyArg2FrameworkTensor(PyObject* obj, ssize_t arg_pos) { } else { PADDLE_THROW(platform::errors::InvalidArgument( "argument (position %d) must be " - "EagerTensor, but got %s", + "EagerVariable, but got %s", arg_pos + 1, reinterpret_cast(obj->ob_type)->tp_name)); } } @@ -597,6 +597,7 @@ std::vector GetTensorListFromArgs( if (PyList_Check(list)) { Py_ssize_t len = PyList_Size(list); + result.reserve(static_cast(len)); if (len == 0) { PADDLE_THROW(platform::errors::InvalidArgument( "%s(): argument '%s' (position %d) must be list of Tensors, but got " @@ -609,6 +610,7 @@ std::vector GetTensorListFromArgs( } } else if (PyTuple_Check(list)) { Py_ssize_t len = PyTuple_Size(list); + result.reserve(static_cast(len)); if (len == 0) { PADDLE_THROW(platform::errors::InvalidArgument( "%s(): argument '%s' (position %d) must be list of Tensors, but got " @@ -632,9 +634,11 @@ std::vector GetTensorListFromArgs( return result; } -paddle::experimental::Tensor* GetEagerTensorPtrFromArgs( - const std::string& op_type, const std::string& arg_name, PyObject* args, - ssize_t arg_idx, bool dispensable) { +paddle::experimental::Tensor* GetTensorPtrFromArgs(const std::string& op_type, + const std::string& arg_name, + PyObject* args, + ssize_t arg_idx, + bool dispensable) { PyObject* obj = PyTuple_GET_ITEM(args, arg_idx); if (PyTuple_Check(obj)) { @@ -654,7 +658,7 @@ paddle::experimental::Tensor* GetEagerTensorPtrFromArgs( return &(reinterpret_cast(obj)->tensor); } -std::vector GetEagerTensorPtrListFromArgs( +std::vector GetTensorPtrListFromArgs( const std::string& op_type, const std::string& arg_name, PyObject* args, ssize_t arg_idx, bool dispensable) { PyObject* list = PyTuple_GET_ITEM(args, arg_idx); diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index 
ead9f474f675b8e1f5b6949ff59a8f185839cb43..f2429768fa998bef97ca772004fa4b30d76d026d 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -65,15 +65,15 @@ PyObject* ToPyObject( const std::unordered_map>& value); template -struct TupleEagerTensorResult { +struct TupleTensorResult { static void Run(const Tuple& out, PyObject* result) { - TupleEagerTensorResult::Run(out, result); + TupleTensorResult::Run(out, result); PyTuple_SET_ITEM(result, N - 1, ToPyObject(std::get(out))); } }; template -struct TupleEagerTensorResult { +struct TupleTensorResult { static void Run(const Tuple& out, PyObject* result) { PyTuple_SET_ITEM(result, 0, ToPyObject(std::get<0>(out))); } @@ -84,7 +84,7 @@ PyObject* ToPyObject(const std::tuple& out) { auto len = sizeof...(Args); PyObject* result = PyTuple_New(len); - TupleEagerTensorResult::Run(out, result); + TupleTensorResult::Run(out, result); return result; } @@ -97,10 +97,12 @@ std::vector GetTensorListFromArgs( const std::string& op_type, const std::string& arg_name, PyObject* args, ssize_t arg_idx, bool dispensable = false); -paddle::experimental::Tensor* GetEagerTensorPtrFromArgs( - const std::string& op_type, const std::string& arg_name, PyObject* args, - ssize_t arg_idx, bool dispensable = false); -std::vector GetEagerTensorPtrListFromArgs( +paddle::experimental::Tensor* GetTensorPtrFromArgs(const std::string& op_type, + const std::string& arg_name, + PyObject* args, + ssize_t arg_idx, + bool dispensable = false); +std::vector GetTensorPtrListFromArgs( const std::string& op_type, const std::string& arg_name, PyObject* args, ssize_t arg_idx, bool dispensable = false); diff --git a/paddle/fluid/pybind/exception.h b/paddle/fluid/pybind/exception.h index 7e44841e670939ef00d010c0c1fadaccd501f6ca..cf82f464a11f292b8ba09dc4cdba4eb3db6e1d96 100644 --- a/paddle/fluid/pybind/exception.h +++ b/paddle/fluid/pybind/exception.h @@ -19,7 +19,6 @@ limitations under the License. */ #include "pybind11/pybind11.h" #define EAGER_TRY try { -#define EAGER_SYNC_TRY try { #define EAGER_CATCH_AND_THROW_RETURN_NULL \ } \ catch (...) 
{ \ diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index c84a71d8aaa002b8d40ff2713252d2cd6afff2bb..f4ed1ee3424f229d77c293d19edca911aea31f69 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -136,10 +136,13 @@ static const platform::Place PyObjectToPlace(const py::object &place_obj) { return place_obj.cast(); } else if (py::isinstance(place_obj)) { return place_obj.cast(); + } else if (py::isinstance(place_obj)) { + return place_obj.cast(); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Place should be one of " - "Place/CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace/MLUPlace")); + "Place/CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace/MLUPlace/" + "CustomPlace")); } } @@ -183,6 +186,9 @@ static void InitVarBaseAndTensor( SetTensorFromPyArray(tensor, array, place, zero_copy); } else if (platform::is_mlu_place(place)) { SetTensorFromPyArray(tensor, array, place, zero_copy); + } else if (platform::is_custom_place(place)) { + SetTensorFromPyArray(tensor, array, place, + zero_copy); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Place should be one of " @@ -941,6 +947,10 @@ void BindImperative(py::module *m_ptr) { py::arg("value"), py::arg("place"), py::arg("persistable") = false, py::arg("zero_copy") = false, py::arg("name") = "", py::arg("stop_gradient") = -1) + .def("__init__", &InitVarBaseFromNumpyWithArg, + py::arg("value"), py::arg("place"), py::arg("persistable") = false, + py::arg("zero_copy") = false, py::arg("name") = "", + py::arg("stop_gradient") = -1) .def("__init__", &InitVarBaseFromNumpyWithArgDefault, py::arg("value")) .def("__init__", &InitVarBaseFromTensorWithArgDefault, py::arg("tensor"), py::arg("name") = "") @@ -956,6 +966,8 @@ void BindImperative(py::module *m_ptr) { py::arg("tensor"), py::arg("place"), py::arg("name") = "") .def("__init__", &InitVarBaseFromTensorWithArg, py::arg("tensor"), py::arg("place"), py::arg("name") = "") + .def("__init__", &InitVarBaseFromTensorWithArg, + py::arg("tensor"), py::arg("place"), py::arg("name") = "") .def("__init__", &InitVarBaseFromNumpyWithKwargs) .def( "__setitem_varbase__", @@ -2258,6 +2270,11 @@ void BindImperative(py::module *m_ptr) { self.SetExpectedPlace(*p); VLOG(4) << "Tracer(" << &self << ")" << " set expected place " << *p; + } else if (py::isinstance(obj)) { + auto p = obj.cast(); + self.SetExpectedPlace(*p); + VLOG(4) << "Tracer(" << &self << ")" + << " set expected place " << *p; } else if (py::isinstance(obj)) { auto p = obj.cast(); self.SetExpectedPlace(*p); @@ -2301,6 +2318,21 @@ void BindImperative(py::module *m_ptr) { *(imperative::AmpOperators::Instance().GetMutableAllowOps()), *(imperative::AmpOperators::Instance().GetMutableBlockOps())); }) + .def("trace", + [](imperative::Tracer &self, const std::string &type, + const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, + framework::AttributeMap attrs, const platform::CustomPlace &place, + bool trace_backward, + const std::map &inplace_map = {}) { + auto ins_map = ConvertToNameVarBaseMap(ins); + auto outs_map = ConvertToNameVarBaseMap(outs); + { + py::gil_scoped_release release; + self.TraceOp( + type, std::move(ins_map), std::move(outs_map), + std::move(attrs), place, trace_backward, inplace_map); + } + }) .def("trace", [](imperative::Tracer &self, const std::string &type, const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, diff --git a/paddle/fluid/pybind/op_function_generator.h b/paddle/fluid/pybind/op_function_generator.h index 
f63c3111bdb3fa6e4d8060f9df0def21b3ba41b2..2b07a439d33b4a96a10a893a95e0dd26f83dd8c7 100644 --- a/paddle/fluid/pybind/op_function_generator.h +++ b/paddle/fluid/pybind/op_function_generator.h @@ -84,6 +84,9 @@ std::map> op_ins_map = { {"Q", "K", "V", "Offset", "Columns", "KeyPaddingMask", "AttnMask"}}, {"sgd", {"Param", "LearningRate", "Grad", "MasterParam"}}, {"graph_khop_sampler", {"Row", "Eids", "Col_Ptr", "X"}}, + {"nce", + {"Input", "Label", "Weight", "Bias", "SampleWeight", "CustomDistProbs", + "CustomDistAlias", "CustomDistAliasProbs"}}, }; // NOTE(zhiqiu): Like op_ins_map. diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 959e34afe3da66987f040c81b21b410d66c7a555..5289b862dc948baacf7c373ebcee483dc589d9a6 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -69,6 +69,7 @@ limitations under the License. */ #include "paddle/fluid/operators/py_func_op.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/enforce.h" @@ -1667,6 +1668,139 @@ All parameter, weight, gradient are variables in Paddle. #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) py::class_(m, "Communicator").def(py::init<>()); #endif + m.def("get_all_device_type", []() { + std::vector device_types; +#ifdef PADDLE_WITH_CUSTOM_DEVICE + device_types = platform::DeviceManager::GetAllDeviceTypes(); +#else + LOG(WARNING) << string::Sprintf( + "Cannot use get_all_device_type because you have installed" + "CPU/GPU version PaddlePaddle.\n" + "If you want to use get_all_device_type, please try to install" + "CustomDevice version " + "PaddlePaddle by: pip install paddlepaddle-core\n"); +#endif + return device_types; + }); + m.def("get_all_custom_device_type", []() { + std::vector device_types; +#ifdef PADDLE_WITH_CUSTOM_DEVICE + device_types = platform::DeviceManager::GetAllCustomDeviceTypes(); +#else + LOG(WARNING) << string::Sprintf( + "Cannot use get_all_custom_device_type because you have installed" + "CPU/GPU version PaddlePaddle.\n" + "If you want to use get_all_custom_device_type, please try to " + "install CustomDevice version " + "PaddlePaddle by: pip install paddlepaddle-core\n"); +#endif + return device_types; + }); + m.def("get_available_device", [] { + std::vector devices; +#ifdef PADDLE_WITH_CUSTOM_DEVICE + devices = platform::DeviceManager::GetAllDeviceList(); +#else + LOG(WARNING) << string::Sprintf( + "Cannot use get_available_device because you have installed" + "CPU/GPU version PaddlePaddle.\n" + "If you want to use get_available_device, please try to install" + "CustomDevice version " + "PaddlePaddle by: pip install paddlepaddle-core\n"); +#endif + return devices; + }); + m.def("get_available_custom_device", [] { + std::vector devices; +#ifdef PADDLE_WITH_CUSTOM_DEVICE + devices = platform::DeviceManager::GetAllCustomDeviceList(); +#else + LOG(WARNING) << string::Sprintf( + "Cannot use get_available_custom_device because you have " + "installed" + "CPU/GPU version PaddlePaddle.\n" + "If you want to use get_available_custom_device, please try to " + "install" + "CustomDevice version " + "PaddlePaddle by: pip install paddlepaddle-core\n"); +#endif + return devices; + }); + py::class_(m, "CustomPlace", + R"DOC( + CustomPlace is a descriptor of a device. 
+ It represents a custom device on which a tensor will be allocated and a model will run. + + Examples: + .. code-block:: python + + import paddle + fake_cpu_place = paddle.CustomPlace("FakeCPU", 0) + )DOC") + .def("__init__", + [](platform::CustomPlace &self, const std::string &device_type, + int dev_id) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (UNLIKELY(dev_id < 0)) { + LOG(ERROR) << string::Sprintf( + "Invalid CustomPlace(%s, %d), device id must be 0 " + "or " + "positive integer", + device_type, dev_id); + std::exit(-1); + } + + if (LIKELY(platform::DeviceManager::HasDeviceType(device_type) && + platform::DeviceManager::IsCustom(device_type))) { + int dev_count = static_cast( + platform::DeviceManager::GetDeviceCount(device_type)); + if (UNLIKELY(dev_id >= dev_count)) { + if (dev_count == 0) { + LOG(ERROR) << "Cannot use " << device_type + << " because there is no " << device_type + << " detected on your " + "machine."; + std::exit(-1); + } else { + LOG(ERROR) << string::Sprintf( + "Invalid CustomPlace(%s, %d), dev_id must " + "inside " + "[0, %d), because %s " + "number on your machine is %d", + device_type, dev_id, dev_count, device_type, dev_count); + std::exit(-1); + } + } + new (&self) platform::CustomPlace(device_type, dev_id); + } else { + LOG(ERROR) << string::Sprintf( + "Invalid CustomPlace(%s, %d), the device type is " + "not registered " + "as a custom device.", + device_type, dev_id); + std::exit(-1); + } +#else + LOG(ERROR) << string::Sprintf( + "Cannot use CustomDevice because you have installed CPU/GPU" + "version PaddlePaddle.\n" + "If you want to use CustomDevice, please try to install" + "CustomDevice version " + "PaddlePaddle by: pip install paddlepaddle-core\n" + "If you only have CPU, please change " + "CustomPlace(%s, %d) to be CPUPlace().\n", + device_type, dev_id); + std::exit(-1); +#endif + }) + .def("get_device_id", + [](const platform::CustomPlace &self) { return self.GetDeviceId(); }) + .def("get_device_type", + [](const platform::CustomPlace &self) { + return self.GetDeviceType(); + }) + .def("__repr__", string::to_string) + .def("__str__", string::to_string); py::class_ cudaplace(m, "CUDAPlace", R"DOC( CUDAPlace is a descriptor of a device. @@ -2118,11 +2252,16 @@ All parameter, weight, gradient are variables in Paddle. }) .def("is_mlu_place", [](platform::Place &self) { return platform::is_mlu_place(self); }) + .def( + "is_custom_place", + [](platform::Place &self) { return platform::is_custom_place(self); }) .def("gpu_device_id", [](platform::Place &self) { return self.device; }) .def("xpu_device_id", [](platform::Place &self) { return self.device; }) .def("npu_device_id", [](platform::Place &self) { return self.device; }) .def("ipu_device_id", [](platform::Place &self) { return self.device; }) .def("mlu_device_id", [](platform::Place &self) { return self.device; }) + .def("custom_device_id", + [](platform::Place &self) { return self.device; }) .def("set_place", [](platform::Place &self, const platform::Place &other) { self = other; }) .def("set_place", @@ -2154,6 +2293,10 @@ All parameter, weight, gradient are variables in Paddle. 
[](platform::Place &self, const platform::MLUPlace &mlu_place) { self = mlu_place; }) + .def("set_place", + [](platform::Place &self, const platform::CustomPlace &plug_place) { + self = plug_place; + }) .def("__repr__", string::to_string) .def("__str__", string::to_string); diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 9a11c5946f318b7e861b853d301e103e641d2722..f1983175bdf94fa6e9fcee49e6f85e7bdf6f4765 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -28,6 +28,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif @@ -247,6 +248,13 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) { auto p = self.place(); paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T), nullptr); +#endif + } else if (platform::is_custom_place(self.place())) { +#if defined(PADDLE_WITH_CUSTOM_DEVICE) + const T *a = self.data(); + auto p = self.place(); + paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T), + nullptr); #endif } VLOG(10) << "TensorGetElement, place: " << self.place() @@ -289,6 +297,13 @@ void TensorSetElement(framework::Tensor *self, size_t offset, T elem) { T *a = self->mutable_data(p); paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T), nullptr); +#endif + } else if (platform::is_custom_place(self->place())) { +#if defined(PADDLE_WITH_CUSTOM_DEVICE) + auto p = self->place(); + T *a = self->mutable_data(p); + paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T), + nullptr); #endif } } @@ -368,6 +383,24 @@ void SetTensorFromPyArrayT( PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use MLUPlace in CPU/GPU version, " "Please recompile or reinstall Paddle with MLU support.")); +#endif + } else if (paddle::platform::is_custom_place(place)) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + platform::Place tmp_place = place; + platform::DeviceGuard guard(tmp_place); + auto dst = self->mutable_data(place); + + platform::DeviceManager::GetDeviceWithPlace(tmp_place)->MemoryCopyH2D( + reinterpret_cast(dst), + const_cast(reinterpret_cast(array.data())), + array.nbytes()); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &ctx = *pool.Get(place); + ctx.Wait(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot use CustomDevice in CPU/GPU/XPU version. 
" + "Please recompile or reinstall Paddle with CustomDevice support.")); #endif } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -757,6 +790,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, bool is_xpu_tensor = platform::is_xpu_place(tensor.place()); bool is_npu_tensor = platform::is_npu_place(tensor.place()); bool is_mlu_tensor = platform::is_mlu_place(tensor.place()); + bool is_custom_device_tensor = platform::is_custom_place(tensor.place()); const auto &tensor_dims = tensor.dims(); auto tensor_dtype = framework::TransToProtoVarType(tensor.dtype()); size_t sizeof_dtype = framework::SizeOfType(tensor_dtype); @@ -776,7 +810,8 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, std::string py_dtype_str = details::TensorDTypeToPyDTypeStr( framework::TransToProtoVarType(tensor.dtype())); - if (!is_gpu_tensor && !is_xpu_tensor && !is_npu_tensor && !is_mlu_tensor) { + if (!is_gpu_tensor && !is_xpu_tensor && !is_npu_tensor && !is_mlu_tensor && + !is_custom_device_tensor) { if (!need_deep_copy) { auto base = py::cast(std::move(tensor)); return py::array(py::dtype(py_dtype_str.c_str()), py_dims, py_strides, @@ -900,6 +935,34 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use MLUPlace in CPU/GPU/XPU/NPU version, " "Please recompile or reinstall Paddle with MLU support.")); +#endif + } else if (is_custom_device_tensor) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides); + PADDLE_ENFORCE_EQ(py_arr.writeable(), true, + platform::errors::InvalidArgument( + "PyArray is not writable, in which case memory leak " + "or double free would occur")); + PADDLE_ENFORCE_EQ( + py_arr.owndata(), true, + platform::errors::InvalidArgument( + "PyArray does not own data, in which case memory leak " + "or double free would occur")); + + size_t copy_bytes = sizeof_dtype * numel; + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &ctx = *pool.Get(tensor.place()); + paddle::memory::Copy( + platform::CPUPlace(), py_arr.mutable_data(), tensor.place(), + tensor_buf_ptr, copy_bytes, + reinterpret_cast(ctx).stream()); + ctx.Wait(); + return py_arr; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot use CustomPlace in CPU/GPU/XPU/NPU version, " + "Please recompile or reinstall Paddle with CustomPlace " + "support.")); #endif } PADDLE_THROW(platform::errors::Unimplemented("Place is not supported")); diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt index 62e54e26eda990e32a62c4ba99070b78d5c6275d..c8253effe8488946dfaa3c3bd4812c73d7f938d8 100644 --- a/paddle/infrt/CMakeLists.txt +++ b/paddle/infrt/CMakeLists.txt @@ -107,6 +107,6 @@ endif() cc_library(infrt SHARED SRCS ${infrt_src} DEPS glog boost ${mlir_libs} paddle_framework_proto infrt_naive) cc_library(infrt_static SRCS ${infrt_src} DEPS glog boost ${mlir_libs} paddle_framework_proto) -add_dependencies(infrt ${infrt_mlir_incs}) +add_dependencies(infrt ${infrt_mlir_incs} mlir-headers) add_custom_target(test_infrt_exec DEPENDS ${INFRT_TEST_TARGETS}) diff --git a/paddle/infrt/dialect/CMakeLists.txt b/paddle/infrt/dialect/CMakeLists.txt index ce38c53617c711e25bf559f8aa668e5da253955d..757d47a8de43e2a394ad5296e617ed6ed94078f3 100644 --- a/paddle/infrt/dialect/CMakeLists.txt +++ b/paddle/infrt/dialect/CMakeLists.txt @@ -31,9 +31,9 @@ target_link_libraries(infrtopt infrt) add_executable(print-ir print_ir.cc) 
target_link_libraries(print-ir infrt ${mlir_libs}) add_dependencies(print-ir pd_ops_inc) - cc_test_tiny(test_infrt_mlir_loader SRCS mlir_loader_test.cc DEPS infrt ${MLIR_IR_LIBS}) +add_subdirectory(infrt) add_subdirectory(tensorrt) if (INFRT_WITH_PTEN) diff --git a/paddle/infrt/dialect/infrt/CMakeLists.txt b/paddle/infrt/dialect/infrt/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..98910d8d0ecf0b99bd1eb8b860ed573ae88ef203 --- /dev/null +++ b/paddle/infrt/dialect/infrt/CMakeLists.txt @@ -0,0 +1,7 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + infrt_dialect.cc + ) + +add_mlir_dialect(infrt_ops Infrt) diff --git a/paddle/infrt/dialect/infrt/infrt_dialect.cc b/paddle/infrt/dialect/infrt/infrt_dialect.cc new file mode 100644 index 0000000000000000000000000000000000000000..388de858b6572ea5900851b170d09589387c0b05 --- /dev/null +++ b/paddle/infrt/dialect/infrt/infrt_dialect.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/infrt/infrt_dialect.h" + +#include +#include +#include +#include "paddle/infrt/dialect/dense_tensor.h" +#include "paddle/infrt/dialect/infrt/infrt_opsDialect.cpp.inc" + +#define GET_TYPEDEF_CLASSES +#include "paddle/infrt/dialect/infrt/infrt_opsTypes.cpp.inc" + +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/infrt/infrt_ops.cpp.inc" + +namespace infrt { + +void InfrtDialect::initialize() { + addTypes< +#define GET_TYPEDEF_LIST +#include "paddle/infrt/dialect/infrt/infrt_opsTypes.cpp.inc" // NOLINT + >(); + addOperations< +#define GET_OP_LIST +#include "paddle/infrt/dialect/infrt/infrt_ops.cpp.inc" // NOLINT + >(); +} + +/// Parse a type registered to this dialect. +mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const { + llvm::StringRef keyword; + if (parser.parseKeyword(&keyword)) return nullptr; + // parse TensorType, for example: !infrt.lod_tensor<3x64x3x3xf32,5> + // 5 is the lod_level + if (keyword == "lod_tensor") { + // Parse the size and elementType. + llvm::SmallVector shape; + mlir::Type elementType; + int32_t lod_level = 0; + // parse "<" + if (parser.parseLess()) return nullptr; + + if (parser.parseDimensionList(shape)) return nullptr; + + // Parse the element type. 
+ if (parser.parseType(elementType)) return nullptr; + // parse "," + if (parser.parseComma()) return nullptr; + + // llvm::APInt lod_level; + if (parser.parseInteger(lod_level)) return nullptr; + + // parse ">" + if (parser.parseGreater()) return nullptr; + + return LoDTensorType::get( + parser.getContext(), shape, elementType, lod_level); + } + // Todo: parse other type + return mlir::Type(); +} + +void InfrtDialect::printType(::mlir::Type type, + ::mlir::DialectAsmPrinter &os) const { + // print TensorType, for example: !infrt.tensor + if (type.isa()) { + auto lodTensorType = type.cast(); + os << "lod_tensor<"; + auto shape = lodTensorType.getShape(); + for (auto dim = shape.begin(), e = shape.end() - 1; dim != e; ++dim) + os << *dim << 'x'; + os << shape.back() << 'x' << lodTensorType.getElementType() << ", " + << lodTensorType.getLod_level() << ">"; + return; + } + llvm_unreachable("unknown infrt type."); +} + +} // namespace infrt diff --git a/paddle/infrt/dialect/infrt/infrt_dialect.h b/paddle/infrt/dialect/infrt/infrt_dialect.h new file mode 100644 index 0000000000000000000000000000000000000000..21a1f6b34f6a5f33bd82c4e78669ee24221a08f1 --- /dev/null +++ b/paddle/infrt/dialect/infrt/infrt_dialect.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +//===----------------------------------------------------------------------===// +// Dialect +//===----------------------------------------------------------------------===// +#include +#include +#include +#include + +#include "paddle/infrt/dialect/infrt/infrt_opsDialect.h.inc" +#define GET_TYPEDEF_CLASSES +#include "paddle/infrt/dialect/infrt/infrt_opsTypes.h.inc" +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/infrt/infrt_ops.h.inc" diff --git a/paddle/infrt/dialect/infrt/infrt_ops.td b/paddle/infrt/dialect/infrt/infrt_ops.td new file mode 100644 index 0000000000000000000000000000000000000000..319760973cd90c667793e29761c030141990c242 --- /dev/null +++ b/paddle/infrt/dialect/infrt/infrt_ops.td @@ -0,0 +1,52 @@ +#ifndef Infrt_OpS +#define Infrt_OpS + +include "mlir/IR/OpBase.td" +include "mlir/Interfaces/SideEffectInterfaces.td" + +def Infrt_Dialect : Dialect { + let summary = + "A dialect containing the Infrt Attributes, Operations, and Types"; + + let name = "Infrt"; + let cppNamespace = "::infrt"; +} + +// Type definitions + +// Base class for Infrt dialect types. +class Infrt_Type traits = [], + string baseCppClass = "::mlir::Type"> + : TypeDef { +} + +def LoDTensor : Infrt_Type<"LoDTensor"> { + let summary = "infrt lod tensor"; + let description = [{lod_tensor<3x64x3x3xf32, 3>}]; + let parameters = (ins + ArrayRefParameter<"int64_t">:$shape, + "mlir::Type":$elementType, + "int32_t":$lod_level + ); +} + +// Op definition +class Infrt_Op traits = []> : Op { + + // Each registered op needs to provide all of a printer, parser and verifier. 
+ // let printer = [{ return infrt::print(p, *this); }]; + // let verifier = [{ return infrt::verify(*this); }]; + // let parser = [{ return infrt::parse$cppClass(parser, result); }]; +} + +// def InfRT_KernelOp : Infrt_Op<"kernel", [NoSideEffect]> { +// let summary = "kernel op"; +// let description = [{ +// kernel op! +// }]; +// let arguments = (ins StrAttr:$name, PD_Tensor:$X, PD_Tensor:$Y, DefaultValuedAttr:$Alpha, DefaultValuedAttr:$Beta); +// +// let results = (outs PD_Tensor:$Out); +// } + +#endif // Infrt_OpS diff --git a/paddle/infrt/dialect/init_infrt_dialects.cc b/paddle/infrt/dialect/init_infrt_dialects.cc index 9afefc0158715bcd17f26447631d69441b445c13..090f1aea289109feda54b12131daf2993ea4e5e0 100644 --- a/paddle/infrt/dialect/init_infrt_dialects.cc +++ b/paddle/infrt/dialect/init_infrt_dialects.cc @@ -18,6 +18,7 @@ #include "paddle/infrt/dialect/basic_kernels.h" #include "paddle/infrt/dialect/dense_tensor.h" +#include "paddle/infrt/dialect/infrt/infrt_dialect.h" #include "paddle/infrt/dialect/infrt_base.h" #include "paddle/infrt/dialect/pd_ops.h" #include "paddle/infrt/dialect/pten/infrt_pten_tensor.h" @@ -28,6 +29,7 @@ namespace infrt { void registerCinnDialects(mlir::DialectRegistry ®istry) { // NOLINT registry.insert, "pd.dtype">; -def PD_Tensor : TensorOf<[PD_ElementType]>; +// def PD_Tensor : TensorOf<[PD_ElementType]>; +def PD_Tensor1 : TensorOf<[PD_ElementType]>; + +def PD_Tensor : AnyTypeOf<[PD_Tensor1, LoDTensor],"pd.ttype">; def PD_Tensor_Array : VectorOf<[PD_Tensor]>; diff --git a/paddle/infrt/dialect/pd_ops.cc b/paddle/infrt/dialect/pd_ops.cc index f3b85ae4b5d0b9c93dfc4d0a1d9530c1e18da925..7cf5b2fb20f527eefe31f817c7fe85c7864c8669 100644 --- a/paddle/infrt/dialect/pd_ops.cc +++ b/paddle/infrt/dialect/pd_ops.cc @@ -16,6 +16,7 @@ #include #include +#include "paddle/infrt/dialect/infrt/infrt_dialect.h" #include "paddle/infrt/dialect/infrt_base.h" #define GET_OP_CLASSES diff --git a/paddle/infrt/tests/dialect/paddle_ops.mlir b/paddle/infrt/tests/dialect/paddle_ops.mlir index ca61ddc0b7053dce34b115dc443e872206960631..02511b21e4792bb37c416093a7c272090eae44c1 100644 --- a/paddle/infrt/tests/dialect/paddle_ops.mlir +++ b/paddle/infrt/tests/dialect/paddle_ops.mlir @@ -3,7 +3,7 @@ func @ops() { %a = pd.feed() {name="input0"} : tensor %b = pd.feed() {name="input1"}: tensor - + %d = pd.feed() {name="input3"}: !Infrt.lod_tensor<3x4x9xf32, 0> %c = "pd.matmul"(%a, %b) {transpose_x=true, transpose_y=false} : (tensor, tensor) -> tensor infrt.return diff --git a/paddle/pten/api/include/tensor.h b/paddle/pten/api/include/tensor.h index 900de42bbac9577f25f625d4643ef7734ece9f12..1872fcc0da4d72a569083f967ed94320606ed64c 100644 --- a/paddle/pten/api/include/tensor.h +++ b/paddle/pten/api/include/tensor.h @@ -222,6 +222,14 @@ class PADDLE_API Tensor final { */ bool is_dense_tensor() const; + /** + * @brief Determine whether tensor is SelectedRows + * + * @return true + * @return false + */ + bool is_selected_rows() const; + /* Part 3: Device and Backend methods */ /** diff --git a/paddle/pten/api/lib/tensor.cc b/paddle/pten/api/lib/tensor.cc index 6fb0d2706ca90267cc2e06a06ba9b570f275da2c..40f35896323b98543364428c99b20d03571dbbd7 100644 --- a/paddle/pten/api/lib/tensor.cc +++ b/paddle/pten/api/lib/tensor.cc @@ -29,7 +29,6 @@ limitations under the License. */ #include "paddle/pten/core/tensor_base.h" #include "paddle/pten/core/tensor_meta.h" #include "paddle/pten/core/tensor_utils.h" - /** * [ Why still include the fluid headers? 
] * @@ -133,7 +132,9 @@ DataLayout Tensor::layout() const { return impl_->layout(); } bool Tensor::is_dense_tensor() const { return pten::DenseTensor::classof(impl_.get()); } - +bool Tensor::is_selected_rows() const { + return pten::SelectedRows::classof(impl_.get()); +} /* Part 3: Device and Backend methods */ PlaceType Tensor::place() const { diff --git a/paddle/pten/common/place.cc b/paddle/pten/common/place.cc index e2cb934f0a1c5d5fb599bddcf44345f70ac688c2..0a3bfccb16a4b2aa83425ddc41ae141251842bac 100644 --- a/paddle/pten/common/place.cc +++ b/paddle/pten/common/place.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include +#include #include "paddle/pten/api/ext/exception.h" @@ -50,7 +51,11 @@ const char *AllocationTypeStr(AllocationType type) { std::string Place::DebugString() const { std::ostringstream os; os << "Place("; - os << AllocationTypeStr(alloc_type_); + if (alloc_type_ == AllocationType::CUSTOM) { + os << GetGlobalDeviceType(device_type_id_); + } else { + os << AllocationTypeStr(alloc_type_); + } if (alloc_type_ == AllocationType::GPUPINNED || alloc_type_ == AllocationType::NPUPINNED || alloc_type_ == AllocationType::CPU) { @@ -66,4 +71,23 @@ std::ostream &operator<<(std::ostream &os, const Place &p) { return os; } +static std::unordered_map global_registered_device_type_id; +static std::unordered_map global_registered_device_type; + +size_t GetOrRegisterGlobalDeviceTypeId(const std::string &device_type) { + if (device_type.empty()) return 0; + if (global_registered_device_type_id.find(device_type) == + global_registered_device_type_id.end()) { + size_t device_type_id = global_registered_device_type_id.size() + 1; + global_registered_device_type_id[device_type] = device_type_id; + global_registered_device_type[device_type_id] = device_type; + } + return global_registered_device_type_id[device_type]; +} + +std::string GetGlobalDeviceType(size_t device_type_id) { + if (device_type_id == 0) return ""; + return global_registered_device_type[device_type_id]; +} + } // namespace pten diff --git a/paddle/pten/common/place.h b/paddle/pten/common/place.h index 75f1f4de9984c72200df68f1d55cf45ce7a58c98..6b7d1ea55d5c4159bd2d005518dd3631db7c05a7 100644 --- a/paddle/pten/common/place.h +++ b/paddle/pten/common/place.h @@ -28,29 +28,49 @@ enum class AllocationType : int8_t { NPUPINNED = 6, IPU = 7, MLU = 8, + CUSTOM = 9, }; const char* AllocationTypeStr(AllocationType type); +size_t GetOrRegisterGlobalDeviceTypeId(const std::string& device_type); +std::string GetGlobalDeviceType(size_t device_type_id_); + /// \brief The place is used to specify where the data is stored. 
class Place { public: Place() : device(0), alloc_type_(AllocationType::UNDEFINED) {} - explicit Place(AllocationType type, int8_t id) - : device(id), alloc_type_(type) {} - - explicit Place(AllocationType type) : device(0), alloc_type_(type) {} - - void Reset(AllocationType type, int8_t device_id = 0) noexcept { + explicit Place(AllocationType type, + int8_t id, + const std::string& dev_type = "") + : device(id), + alloc_type_(type), + device_type_id_(GetOrRegisterGlobalDeviceTypeId(dev_type)) {} + + explicit Place(AllocationType type, const std::string& dev_type = "") + : device(0), + alloc_type_(type), + device_type_id_(GetOrRegisterGlobalDeviceTypeId(dev_type)) {} + + void Reset(AllocationType type, + int8_t device_id = 0, + const std::string& dev_type = "") noexcept { alloc_type_ = type; device = device_id; + if (!dev_type.empty()) { + device_type_id_ = GetOrRegisterGlobalDeviceTypeId(dev_type); + } } AllocationType GetType() const { return alloc_type_; } int8_t GetDeviceId() const { return device; } + std::string GetDeviceType() const { + return GetGlobalDeviceType(device_type_id_); + } + std::string DebugString() const; inline bool operator==(const Place& rhs) const { @@ -62,6 +82,10 @@ class Place { alloc_type_ == AllocationType::NPUPINNED) { return true; } + if (alloc_type_ == AllocationType::CUSTOM) { + return device_type_id_ == rhs.device_type_id_ && + device == rhs.GetDeviceId(); + } return device == rhs.GetDeviceId(); } inline bool operator!=(const Place& rhs) const { return !(*this == rhs); } @@ -69,6 +93,10 @@ class Place { if (alloc_type_ != rhs.GetType()) { return static_cast(alloc_type_) < static_cast(rhs.GetType()); } + if (alloc_type_ == AllocationType::CUSTOM && + device_type_id_ != rhs.device_type_id_) { + return device_type_id_ < rhs.device_type_id_; + } return device < rhs.GetDeviceId(); } @@ -79,6 +107,7 @@ class Place { private: AllocationType alloc_type_{AllocationType::UNDEFINED}; + size_t device_type_id_; }; class CPUPlace : public Place { @@ -157,6 +186,22 @@ class MLUPlace : public Place { : Place(AllocationType::MLU, place.GetDeviceId()) {} }; +class CustomPlace : public Place { + public: + explicit CustomPlace(const std::string dev_type) + : Place(AllocationType::CUSTOM, 0, dev_type) {} + CustomPlace(const std::string dev_type, int device_id) + : Place(AllocationType::CUSTOM, device_id, dev_type) {} + + CustomPlace(const CustomPlace&) = default; + CustomPlace(const Place& place) { // NOLINT + if (place.GetType() == AllocationType::CUSTOM) { + this->Reset( + AllocationType::CUSTOM, place.GetDeviceId(), place.GetDeviceType()); + } + } +}; + std::ostream& operator<<(std::ostream&, const Place&); } // namespace pten diff --git a/paddle/pten/core/compat/type_defs.h b/paddle/pten/core/compat/type_defs.h index eb5459b1b6ea723d7118a2a05addc1988987efcc..c9d7d5bb54b620ceeac55de21a28e2440a15186b 100644 --- a/paddle/pten/core/compat/type_defs.h +++ b/paddle/pten/core/compat/type_defs.h @@ -24,7 +24,7 @@ limitations under the License. 
*/ #include namespace egr { -class EagerTensor; +class EagerVariable; } namespace paddle { namespace framework { @@ -76,9 +76,9 @@ struct NameVarMapTrait { }; template <> -struct NameVarMapTrait { +struct NameVarMapTrait { using Type = - std::map>>; + std::map>>; }; } // namespace details @@ -88,7 +88,7 @@ using NameVarMap = typename details::NameVarMapTrait::Type; using NameVarBaseMap = NameVarMap; using NameVariableWrapperMap = NameVarMap; -using NameTensorMap = NameVarMap; +using NameTensorMap = NameVarMap; using VariableWrapperList = std::vector>; diff --git a/paddle/pten/core/infermeta_utils.h b/paddle/pten/core/infermeta_utils.h index 6de91db9382e22537e577ce3188764034c7235e3..59d2a4ed3c089d2480bfcbe526d2706371e322bc 100644 --- a/paddle/pten/core/infermeta_utils.h +++ b/paddle/pten/core/infermeta_utils.h @@ -15,6 +15,8 @@ limitations under the License. */ #pragma once #include +#include +#include #include #include "paddle/pten/common/scalar.h" @@ -55,9 +57,12 @@ class InferMetaContext { AttrType AttrAt(size_t idx) { try { return paddle::any_cast(attrs_.at(idx)); - } catch (paddle::bad_any_cast&) { + } catch (paddle::bad_any_cast& e) { PADDLE_THROW(pten::errors::InvalidArgument( - "Attribute cast error in InferMeta Context.")); + "Attribute cast error in InferMeta Context, the expected attribute " + "type is `%s`, but actual attribute type is `%s`.", + std::type_index(typeid(AttrType)).name(), + std::type_index(attrs_.at(idx).type()).name())); } } @@ -151,10 +156,15 @@ struct InferMetaFnImpl { PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(int); PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(int64_t); PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(float); - PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(double); + PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const std::string&); + PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const std::vector&); PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const std::vector&); PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE( const std::vector&); + PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const std::vector&); + PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const std::vector&); + PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE( + const std::vector&); PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(DataType); PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(Backend); PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(DataLayout); diff --git a/paddle/pten/core/selected_rows.h b/paddle/pten/core/selected_rows.h index 4a05d7ed3153f1e20926bb95eaac5d2c3b5ca5db..8250179b7a28b25a673f84c235c6d0c3eeb3043c 100644 --- a/paddle/pten/core/selected_rows.h +++ b/paddle/pten/core/selected_rows.h @@ -29,10 +29,6 @@ limitations under the License. */ // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/framework/mixed_vector.h" - -namespace egr { -class EagerTensor; -} // namespace egr namespace pten { class SelectedRows : public TensorBase, public TypeInfoTraits { @@ -199,39 +195,6 @@ class SelectedRows : public TensorBase, std::unique_ptr value_{nullptr}; int64_t height_; // height indicates the underline tensor's height std::unique_ptr rwlock_{nullptr}; - // TODO(jiabin): Remove this when we don't need EagerTensor support - // SelectedRows which is expected in next version. - /** Why we need this weird friend class? - * In eager mode, since some of ops doesn't support C++ API for now we need to - *use 'imperative::TraceOp' to run it. 
- * So, we need to support get a SelectedRows from egr::EagerTensor's - *framework::Variable obj and used it to reconstruct - * a new paddle::experimental::Tensor to support framework usage. However, we - *got 2 problems here. - * First, we got 2 unique_ptr in SelectedRows so that we can't support - *std::make_shared in EagerTensor's SetImplWithSelectedRows method, - * since we have to construct a shared_ptr for paddle::experimental::Tensor's - *impl. - * Second, when we are trying to support move constructor for SelectedRows we - *found that we can't get its rvalue from - * framework::Variable because it holds an obj of target type. - * - * - * The only three way to solve this problem is: - * 1. Just like what we have done, using friend class and just copy/move each - *member. In this way, we can avoid additional API - * and symbols. - * 2. Make pten::SelectedRows's member from unique_ptr to shared_ptr. However, - *this may cause some cost of performance. - * 3. Add some api to return or move member of framework::SelectedRows. - *However, it's not as safe as first solution. - * 4. Support all framework::SelectedRows related ops and make sure - *EagerTensor never holds framework::SelectedRows. - * - * If anyone got better ideas, welcome to contact JiabinYang, we are open for - *your help. - **/ - friend class egr::EagerTensor; }; } // namespace pten diff --git a/paddle/pten/core/utils/data_type.h b/paddle/pten/core/utils/data_type.h index ee223afb3b03c0e2b770097e4313ce31c45927ea..ca0c678e0623d7b7a38b8d87170fc448798f7ea6 100644 --- a/paddle/pten/core/utils/data_type.h +++ b/paddle/pten/core/utils/data_type.h @@ -57,7 +57,7 @@ inline void VisitDataType(pten::DataType type, Visitor visitor) { _PtenForEachDataType_(PtenVisitDataTypeCallback); #undef PtenVisitDataTypeCallback PADDLE_THROW(pten::errors::Unimplemented( - "Not supported proto::VarType::Type(%d) as data type.", + "Not supported pten::DataType(%d) as data type.", static_cast(type))); } } // namespace pten diff --git a/paddle/pten/infermeta/backward.cc b/paddle/pten/infermeta/backward.cc index db92449519436024a01c9c891f9671756777a345..2f2fcc7db31ea51f2111103675bbd20e7ab1ec58 100644 --- a/paddle/pten/infermeta/backward.cc +++ b/paddle/pten/infermeta/backward.cc @@ -16,13 +16,10 @@ limitations under the License. */ namespace pten { -void MatmulGradInferMeta(const MetaTensor& x, - const MetaTensor& y, - const MetaTensor& out_grad_meta, - bool transpose_x, - bool transpose_y, - MetaTensor* dx, - MetaTensor* dy) { +void GeneralBinaryGradInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* dx, + MetaTensor* dy) { if (dx) { dx->share_meta(x); } diff --git a/paddle/pten/infermeta/backward.h b/paddle/pten/infermeta/backward.h index d6b96861412861de6fb892a28c3930bd7db20da7..ded51cac6378c574232eed3e641def23c68c3db8 100644 --- a/paddle/pten/infermeta/backward.h +++ b/paddle/pten/infermeta/backward.h @@ -20,12 +20,9 @@ limitations under the License. 
*/ namespace pten { -void MatmulGradInferMeta(const MetaTensor& x, - const MetaTensor& y, - const MetaTensor& out_grad_meta, - bool transpose_x, - bool transpose_y, - MetaTensor* dx, - MetaTensor* dy); +void GeneralBinaryGradInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* dx, + MetaTensor* dy); } // namespace pten diff --git a/paddle/pten/kernels/abs_grad_kernel.h b/paddle/pten/kernels/abs_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..494f29da783d4d6b6d3f6f940d3591ace578aea1 --- /dev/null +++ b/paddle/pten/kernels/abs_grad_kernel.h @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/device_context.h" + +namespace pten { + +template +void AbsGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& dout, + DenseTensor* dx); + +template +void AbsDoubleGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& ddx, + DenseTensor* ddout); + +} // namespace pten diff --git a/paddle/pten/kernels/abs_kernel.h b/paddle/pten/kernels/abs_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..0322afadfab8d2a24a358ef7c747d09174b124f2 --- /dev/null +++ b/paddle/pten/kernels/abs_kernel.h @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/device_context.h" + +namespace pten { + +template +void AbsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out); + +} // namespace pten diff --git a/paddle/pten/kernels/cpu/abs_grad_kernel.cc b/paddle/pten/kernels/cpu/abs_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..9d6675aa7b3155d1f83f4bd098a867e5f3359938 --- /dev/null +++ b/paddle/pten/kernels/cpu/abs_grad_kernel.cc @@ -0,0 +1,41 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/common/complex.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" +#include "paddle/pten/kernels/impl/abs_grad_kernel_impl.h" + +using pten::dtype::complex; + +PT_REGISTER_KERNEL(abs_grad, + CPU, + ALL_LAYOUT, + pten::AbsGradKernel, + float, + double, + int, + int64_t, + complex, + complex) {} +PT_REGISTER_KERNEL(abs_double_grad, + CPU, + ALL_LAYOUT, + pten::AbsDoubleGradKernel, + float, + double, + int, + int64_t, + complex, + complex) {} diff --git a/paddle/pten/kernels/cpu/abs_kernel.cc b/paddle/pten/kernels/cpu/abs_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..ee766a18d42aba35f4d9e45d6d381beda99798d6 --- /dev/null +++ b/paddle/pten/kernels/cpu/abs_kernel.cc @@ -0,0 +1,48 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/kernels/abs_kernel.h" +#include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/common/complex.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" + +namespace pten { + +template +void AbsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { + auto numel = x.numel(); + auto* x_data = x.data(); + ctx.template Alloc>( + out, size_t(x.numel() * sizeof(pten::funcs::Real))); + auto* out_data = out->data>(); + + paddle::platform::ForRange for_range(ctx, numel); + pten::funcs::AbsFunctor functor(x_data, out_data, numel); + for_range(functor); +} + +} // namespace pten + +PT_REGISTER_KERNEL(abs, + CPU, + ALL_LAYOUT, + pten::AbsKernel, + float, + double, + int, + int64_t, + pten::dtype::complex, + pten::dtype::complex) {} diff --git a/paddle/pten/kernels/cpu/histogram_kernel.cc b/paddle/pten/kernels/cpu/histogram_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..700b7e092919aa8d922b0ebfbe8388eb646aac5b --- /dev/null +++ b/paddle/pten/kernels/cpu/histogram_kernel.cc @@ -0,0 +1,88 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/kernels/histogram_kernel.h" +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/funcs/math_function.h" + +namespace pten { + +template +void HistogramKernel(const Context& dev_ctx, + const DenseTensor& input, + int64_t bins, + int min, + int max, + DenseTensor* output) { + auto& nbins = bins; + auto& minval = min; + auto& maxval = max; + + const T* input_data = input.data(); + auto input_numel = input.numel(); + + int64_t* out_data = output->mutable_data(dev_ctx.GetPlace()); + pten::funcs::SetConstant()( + dev_ctx, output, static_cast(0)); + + if (input_data == nullptr) return; + + T output_min = static_cast(minval); + T output_max = static_cast(maxval); + if (output_min == output_max) { + output_min = *std::min_element(input_data, input_data + input_numel); + output_max = *std::max_element(input_data, input_data + input_numel); + } + if (output_min == output_max) { + output_min = output_min - 1; + output_max = output_max + 1; + } + + PADDLE_ENFORCE_EQ( + (std::isinf(static_cast(output_min)) || + std::isnan(static_cast(output_max)) || + std::isinf(static_cast(output_min)) || + std::isnan(static_cast(output_max))), + false, + pten::errors::OutOfRange("range of min, max is not finite")); + PADDLE_ENFORCE_GE( + output_max, + output_min, + pten::errors::InvalidArgument( + "max must be larger or equal to min. If min and max are both zero, " + "the minimum and maximum values of the data are used. " + "But received max is %d, min is %d", + maxval, + minval)); + + for (int64_t i = 0; i < input_numel; i++) { + if (input_data[i] >= output_min && input_data[i] <= output_max) { + const int64_t bin = (int64_t)((input_data[i] - output_min) * nbins / + (output_max - output_min)); + out_data[std::min(bin, nbins - 1)] += 1; + } + } +} + +} // namespace pten + +PT_REGISTER_KERNEL(histogram, + CPU, + ALL_LAYOUT, + pten::HistogramKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/pten/kernels/cpu/lerp_grad_kernel.cc b/paddle/pten/kernels/cpu/lerp_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..4aac143eb16dcad6d6d31e302357babab7ed8309 --- /dev/null +++ b/paddle/pten/kernels/cpu/lerp_grad_kernel.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/pten/kernels/lerp_grad_kernel.h" +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/impl/lerp_grad_kernel_impl.h" + +PT_REGISTER_KERNEL( + lerp_grad, CPU, ALL_LAYOUT, pten::LerpGradKernel, float, double) {} diff --git a/paddle/pten/kernels/cpu/lerp_kernel.cc b/paddle/pten/kernels/cpu/lerp_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..9f8513065ce9c5c52f85b0f7d9e7acfade534763 --- /dev/null +++ b/paddle/pten/kernels/cpu/lerp_kernel.cc @@ -0,0 +1,20 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/kernels/lerp_kernel.h" +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/impl/lerp_kernel_impl.h" + +PT_REGISTER_KERNEL(lerp, CPU, ALL_LAYOUT, pten::LerpKernel, float, double) {} diff --git a/paddle/pten/kernels/empty_kernel.cc b/paddle/pten/kernels/empty_kernel.cc index ecb058d35b909bc9455b019e55ab8f2277fd587b..e1a1788815ebfef75ac29e332da3e76f3d2a5d52 100644 --- a/paddle/pten/kernels/empty_kernel.cc +++ b/paddle/pten/kernels/empty_kernel.cc @@ -94,6 +94,7 @@ PT_REGISTER_KERNEL(empty_like, int64_t, bool, paddle::platform::float16, + paddle::platform::bfloat16, paddle::platform::complex, paddle::platform::complex) {} #endif diff --git a/paddle/pten/kernels/funcs/common_shape.h b/paddle/pten/kernels/funcs/common_shape.h index 9a96a5fd45e4c48059ba8915f2108e4f9ac2aad7..e751f85b50f24bdddb475653e5e706975333242c 100644 --- a/paddle/pten/kernels/funcs/common_shape.h +++ b/paddle/pten/kernels/funcs/common_shape.h @@ -102,5 +102,30 @@ inline void GetPrePostNumel( } } +static framework::DDim ExtendDims2Rank(const framework::DDim &in_dims, + int rank) { + if (in_dims.size() == rank) { + return in_dims; + } + std::vector shapes(rank, 1); + for (int i = in_dims.size() - 1, j = rank - 1; i >= 0; --i, --j) { + shapes[j] = in_dims[i]; + } + return framework::make_ddim(shapes); +} + +template +static void GetBroadcastDims(const framework::DDim &in_dims, + const framework::DDim &out_dims, + Eigen::DSizes *bcast_dims) { + for (size_t i = 0; i < D; ++i) { + if (in_dims[i] == out_dims[i]) { + (*bcast_dims)[i] = 1; + } else { + (*bcast_dims)[i] = std::max(in_dims[i], out_dims[i]); + } + } +} + } // namespace funcs } // namespace pten diff --git a/paddle/fluid/operators/math/complex_functors.h b/paddle/pten/kernels/funcs/complex_functors.h similarity index 57% rename from paddle/fluid/operators/math/complex_functors.h rename to paddle/pten/kernels/funcs/complex_functors.h index 48f16b87cbd66c6a39c74d1dbaab2349193f04ae..b0eee3ac1fdce3c9fc7de7f8162ae74f4b33daff 100644 --- a/paddle/fluid/operators/math/complex_functors.h +++ b/paddle/pten/kernels/funcs/complex_functors.h @@ -13,15 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once - +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES +#endif +#include #include -#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/common/complex.h" #include "paddle/pten/core/hostdevice.h" -namespace paddle { -namespace operators { -namespace math { +namespace pten { +namespace funcs { template struct cond { @@ -64,8 +66,8 @@ using select_t = typename select::type; template using Real = - select_t>::value, float>, - cond>::value, double>, + select_t>::value, float>, + cond>::value, double>, T>; template @@ -77,13 +79,13 @@ using NoComplex = typename std::enable_if::value>::type; template using EnableComplex = typename std::enable_if< - std::is_same>::value || - std::is_same>::value>::type; + std::is_same>::value || + std::is_same>::value>::type; template using DisableComplex = typename std::enable_if< - !std::is_same>::value && - !std::is_same>::value>::type; + !std::is_same>::value && + !std::is_same>::value>::type; template struct RealFunctor; @@ -154,8 +156,7 @@ struct AbsFunctor>> { template struct AbsGradFunctor { - AbsGradFunctor(const math::Real* dout, const T* x, T* output, - int64_t numel) + AbsGradFunctor(const Real* dout, const T* x, T* output, int64_t numel) : dout_(dout), x_(x), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { @@ -166,52 +167,55 @@ struct AbsGradFunctor { } } - const math::Real* dout_; + const Real* dout_; const T* x_; T* output_; int64_t numel_; }; template <> -struct AbsGradFunctor> { - AbsGradFunctor(const float* dout, const paddle::platform::complex* x, - paddle::platform::complex* output, int64_t numel) +struct AbsGradFunctor> { + AbsGradFunctor(const float* dout, + const pten::dtype::complex* x, + pten::dtype::complex* output, + int64_t numel) : dout_(dout), x_(x), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { - if (x_[idx] == paddle::platform::complex(0)) { - output_[idx] = paddle::platform::complex(0); + if (x_[idx] == pten::dtype::complex(0)) { + output_[idx] = pten::dtype::complex(0); } else { - output_[idx] = paddle::platform::complex(dout_[idx]) * - (x_[idx] / paddle::platform::complex(abs(x_[idx]))); + output_[idx] = pten::dtype::complex(dout_[idx]) * + (x_[idx] / pten::dtype::complex(abs(x_[idx]))); } } const float* dout_; - const paddle::platform::complex* x_; - paddle::platform::complex* output_; + const pten::dtype::complex* x_; + pten::dtype::complex* output_; int64_t numel_; }; template <> -struct AbsGradFunctor> { - AbsGradFunctor(const double* dout, const paddle::platform::complex* x, - paddle::platform::complex* output, int64_t numel) +struct AbsGradFunctor> { + AbsGradFunctor(const double* dout, + const pten::dtype::complex* x, + pten::dtype::complex* output, + int64_t numel) : dout_(dout), x_(x), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { - if (x_[idx] == paddle::platform::complex(0)) { - output_[idx] = paddle::platform::complex(0); + if (x_[idx] == pten::dtype::complex(0)) { + output_[idx] = pten::dtype::complex(0); } else { - output_[idx] = - paddle::platform::complex(dout_[idx]) * - (x_[idx] / paddle::platform::complex(abs(x_[idx]))); + output_[idx] = pten::dtype::complex(dout_[idx]) * + (x_[idx] / pten::dtype::complex(abs(x_[idx]))); } } const double* dout_; - const paddle::platform::complex* x_; - paddle::platform::complex* output_; + const pten::dtype::complex* x_; + pten::dtype::complex* output_; int64_t numel_; }; @@ -235,46 +239,48 @@ struct AbsGradGradFunctor { }; template <> -struct 
AbsGradGradFunctor> { - AbsGradGradFunctor(const paddle::platform::complex* ddx, - const paddle::platform::complex* x, - paddle::platform::complex* output, int64_t numel) +struct AbsGradGradFunctor> { + AbsGradGradFunctor(const pten::dtype::complex* ddx, + const pten::dtype::complex* x, + pten::dtype::complex* output, + int64_t numel) : ddx_(ddx), x_(x), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { - if (x_[idx] == paddle::platform::complex(0)) { - output_[idx] = paddle::platform::complex(0); + if (x_[idx] == pten::dtype::complex(0)) { + output_[idx] = pten::dtype::complex(0); } else { - output_[idx] = paddle::platform::complex(ddx_[idx]) * x_[idx] / - paddle::platform::complex(abs(x_[idx])); + output_[idx] = pten::dtype::complex(ddx_[idx]) * x_[idx] / + pten::dtype::complex(abs(x_[idx])); } } - const paddle::platform::complex* ddx_; - const paddle::platform::complex* x_; - paddle::platform::complex* output_; + const pten::dtype::complex* ddx_; + const pten::dtype::complex* x_; + pten::dtype::complex* output_; int64_t numel_; }; template <> -struct AbsGradGradFunctor> { - AbsGradGradFunctor(const paddle::platform::complex* ddx, - const paddle::platform::complex* x, - paddle::platform::complex* output, int64_t numel) +struct AbsGradGradFunctor> { + AbsGradGradFunctor(const pten::dtype::complex* ddx, + const pten::dtype::complex* x, + pten::dtype::complex* output, + int64_t numel) : ddx_(ddx), x_(x), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { - if (x_[idx] == paddle::platform::complex(0)) { - output_[idx] = paddle::platform::complex(0); + if (x_[idx] == pten::dtype::complex(0)) { + output_[idx] = pten::dtype::complex(0); } else { - output_[idx] = paddle::platform::complex(ddx_[idx]) * x_[idx] / - paddle::platform::complex(abs(x_[idx])); + output_[idx] = pten::dtype::complex(ddx_[idx]) * x_[idx] / + pten::dtype::complex(abs(x_[idx])); } } - const paddle::platform::complex* ddx_; - const paddle::platform::complex* x_; - paddle::platform::complex* output_; + const pten::dtype::complex* ddx_; + const pten::dtype::complex* x_; + pten::dtype::complex* output_; int64_t numel_; }; template @@ -318,8 +324,10 @@ struct RealImagToComplexFunctor; template struct RealImagToComplexFunctor>> { - RealImagToComplexFunctor(const Real* input_real, const Real* input_imag, - T* output, int64_t numel) + RealImagToComplexFunctor(const Real* input_real, + const Real* input_imag, + T* output, + int64_t numel) : input_real_(input_real), input_imag_(input_imag), output_(output), @@ -363,6 +371,84 @@ struct ConjFunctor> { T* output_; }; -} // namespace math -} // namespace operators -} // namespace paddle +template +struct AngleFunctor; + +// angel function for complex +template +struct AngleFunctor>> { + AngleFunctor(const T* input, pten::funcs::Real* output, int64_t numel) + : input_(input), output_(output), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + output_[idx] = arg(input_[idx]); + } + + const T* input_; + pten::funcs::Real* output_; + int64_t numel_; +}; + +// angel function for real +template +struct AngleFunctor>> { + AngleFunctor(const T* input, T* output, int64_t numel) + : input_(input), output_(output), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + output_[idx] = input_[idx] < static_cast(0) ? 
M_PI : 0; + } + + const T* input_; + T* output_; + int64_t numel_; +}; + +template +struct AngleGradFunctor; + +// angle grad for complex +template +struct AngleGradFunctor>> { + AngleGradFunctor(const pten::funcs::Real* dout, + const T* x, + T* dx, + int64_t numel) + : dout_(dout), x_(x), dx_(dx), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + if (x_[idx] == T(0)) { + dx_[idx] = T(0); + } else { + const pten::funcs::Real r_square = + x_[idx].real * x_[idx].real + x_[idx].imag * x_[idx].imag; + dx_[idx] = T(-dout_[idx] * x_[idx].imag / r_square, + dout_[idx] * x_[idx].real / r_square); + } + } + + const pten::funcs::Real* dout_; + const T* x_; + T* dx_; + int64_t numel_; +}; + +// angle grad for real +template +struct AngleGradFunctor>> { + AngleGradFunctor(const pten::funcs::Real* dout, + const T* x, + T* dx, + int64_t numel) + : dout_(dout), x_(x), dx_(dx), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { dx_[idx] = 0; } + + const pten::funcs::Real* dout_; + const T* x_; + T* dx_; + int64_t numel_; +}; + +} // namespace funcs +} // namespace pten diff --git a/paddle/fluid/operators/math/compound_functors.h b/paddle/pten/kernels/funcs/compound_functors.h similarity index 86% rename from paddle/fluid/operators/math/compound_functors.h rename to paddle/pten/kernels/funcs/compound_functors.h index 6a43215bf52a9b231a47241d1bb27695da031957..c3d14a50659396345b94a0aaaff2972b5e0fe08e 100644 --- a/paddle/fluid/operators/math/compound_functors.h +++ b/paddle/pten/kernels/funcs/compound_functors.h @@ -18,9 +18,8 @@ limitations under the License. */ #include #include -namespace paddle { -namespace operators { -namespace math { +namespace pten { +namespace funcs { // Z = BinaryFunctor(X, UnaryFunctor(Y)) template @@ -69,8 +68,8 @@ struct BinaryCompoundGradDxFunctor { return dout * d_binary_fun_.Dx(x, unary_fun_(y)); } - inline HOSTDEVICE T UseIntermediateOut(T x, T y, T intermediate_out, T out, - T dout) { + inline HOSTDEVICE T + UseIntermediateOut(T x, T y, T intermediate_out, T out, T dout) { return dout * d_binary_fun_.Dx(x, intermediate_out); } @@ -82,8 +81,11 @@ struct BinaryCompoundGradDxFunctor { }; // Z = BinaryFunctor(X, UnaryFunctor(Y)) -template +template struct BinaryCompoundGradDyFunctor { BinaryCompoundGradDyFunctor(const DBinaryFun &d_binary_fun, const UnaryFun &unary_fun, @@ -96,8 +98,8 @@ struct BinaryCompoundGradDyFunctor { return dout * d_binary_fun_.Dy(x, unary_fun_(y)) * d_unary_fun_.UseX(y); } - inline HOSTDEVICE T UseIntermediateOut(T x, T y, T intermediate_out, T out, - T dout) { + inline HOSTDEVICE T + UseIntermediateOut(T x, T y, T intermediate_out, T out, T dout) { if (InPlace) { return dout * d_binary_fun_.Dy(x, intermediate_out) * d_unary_fun_.UseOut(intermediate_out); @@ -116,8 +118,11 @@ struct BinaryCompoundGradDyFunctor { }; // Z = UnaryFunctor(BinaryFunctor(X, Y)) -template +template struct UnaryCompoundGradDxFunctor { UnaryCompoundGradDxFunctor(const DUnaryFun &d_unary_fun, const BinaryFun &binary_fun, @@ -136,8 +141,8 @@ struct UnaryCompoundGradDxFunctor { return base * d_binary_fun_.Dx(x, y); } - inline HOSTDEVICE T UseIntermediateOut(T x, T y, T intermediate_out, T out, - T dout) { + inline HOSTDEVICE T + UseIntermediateOut(T x, T y, T intermediate_out, T out, T dout) { T base; if (InPlace) { base = dout * d_unary_fun_.UseOut(out); @@ -156,8 +161,11 @@ struct UnaryCompoundGradDxFunctor { }; // Z = UnaryFunctor(BinaryFunctor(X, Y)) -template +template struct UnaryCompoundGradDyFunctor { UnaryCompoundGradDyFunctor(const 
DUnaryFun &d_unary_fun, const BinaryFun &binary_fun, @@ -176,8 +184,8 @@ struct UnaryCompoundGradDyFunctor { return base * d_binary_fun_.Dy(x, y); } - inline HOSTDEVICE T UseIntermediateOut(T x, T y, T intermediate_out, T out, - T dout) { + inline HOSTDEVICE T + UseIntermediateOut(T x, T y, T intermediate_out, T out, T dout) { T base; if (InPlace) { base = dout * d_unary_fun_.UseOut(out); @@ -206,7 +214,9 @@ struct BinaryCompoundGradDIntermedaiteOutFunctor { return dout * d_binary_fun_.Dy(x, unary_fun_(y)); } - inline HOSTDEVICE T UseIntermediateOut(T x, T intermediate_out, T out, + inline HOSTDEVICE T UseIntermediateOut(T x, + T intermediate_out, + T out, T dout) { return dout * d_binary_fun_.Dy(x, intermediate_out); } @@ -233,7 +243,9 @@ struct UnaryCompoundGradDIntermediateFunctor { } } - inline HOSTDEVICE T UseIntermediateOut(T x, T intermediate_out, T out, + inline HOSTDEVICE T UseIntermediateOut(T x, + T intermediate_out, + T out, T dout) { if (InPlace) { return dout * d_unary_fun_.UseOut(out); @@ -249,6 +261,5 @@ struct UnaryCompoundGradDIntermediateFunctor { BinaryFun binary_fun_; }; -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace pten diff --git a/paddle/fluid/operators/math/functors.h b/paddle/pten/kernels/funcs/functors.h similarity index 85% rename from paddle/fluid/operators/math/functors.h rename to paddle/pten/kernels/funcs/functors.h index 054018b10e87e421c45846abf550f0f7a552f6a3..8b2bdfd0b1e32b38c0a9500b67dfa452bcaee97e 100644 --- a/paddle/fluid/operators/math/functors.h +++ b/paddle/pten/kernels/funcs/functors.h @@ -17,16 +17,17 @@ limitations under the License. */ #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/math.h" -namespace paddle { -namespace operators { -namespace math { - -// MulFunctor -template -struct MulFunctor { - // out = x * y; - inline HOSTDEVICE T operator()(T x, T y) { return x * y; } -}; +namespace pten { +namespace funcs { + +// // MulFunctor +// // NOTE(chenfeiyu): IT IS NOLONGER USED, use pten::funcs::MultiplyFunctor +// instead +// template +// struct MulFunctor { +// // out = x * y; +// inline HOSTDEVICE T operator()(T x, T y) { return x * y; } +// }; template struct MulGradFunctor { @@ -34,12 +35,13 @@ struct MulGradFunctor { inline HOSTDEVICE T Dy(T x, T y) { return x; } }; -// AddFunctor -template -struct AddFunctor { - // out = x + y; - inline HOSTDEVICE T operator()(T x, T y) { return x + y; } -}; +// // AddFunctor +// // NOTE(chenfeiyu): IT IS NOLONGER USED, use pten::funcs::AddFunctor instead +// template +// struct AddFunctor { +// // out = x + y; +// inline HOSTDEVICE T operator()(T x, T y) { return x + y; } +// }; template struct MaxFunctor { @@ -102,7 +104,8 @@ struct TanhFunctor { // y = 2 / (1 + e^-2x) - 1 T t0 = static_cast(2) * x; T t1 = (t0 < kMin) ? kMin : ((t0 > kMax) ? kMax : t0); - return static_cast(2) / (static_cast(1) + real_exp(-t1)) - + return static_cast(2) / + (static_cast(1) + paddle::operators::real_exp(-t1)) - static_cast(1); } }; @@ -123,7 +126,8 @@ struct SigmoidFunctor { inline HOSTDEVICE T operator()(T x) { // y = 1 / (1 + e^-x) T tmp = (x < kMin) ? kMin : ((x > kMax) ? 
kMax : x); - return static_cast(1) / (static_cast(1) + real_exp(-tmp)); + return static_cast(1) / + (static_cast(1) + paddle::operators::real_exp(-tmp)); } }; @@ -138,7 +142,7 @@ struct SigmoidGradFunctor { template struct GeluFunctor { - using MT = typename details::MPTypeTrait::Type; + using MT = typename paddle::operators::details::MPTypeTrait::Type; inline HOSTDEVICE T operator()(T x) { // this function is tanh approximation of gelu // actual gelu is: @@ -154,7 +158,7 @@ struct GeluFunctor { template struct GeluGradFunctor { - using MT = typename details::MPTypeTrait::Type; + using MT = typename paddle::operators::details::MPTypeTrait::Type; inline HOSTDEVICE T UseX(T x) { MT mx = static_cast(x); MT tanh_out = @@ -193,6 +197,5 @@ struct GeluGradFunctor { } }; -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace pten diff --git a/paddle/pten/kernels/funcs/math_function.cc b/paddle/pten/kernels/funcs/math_function.cc index 780068e0381aa87221dadc4b79bb8edb2fdf3842..dec89e79565dea863b1f2837334db372ed415522 100644 --- a/paddle/pten/kernels/funcs/math_function.cc +++ b/paddle/pten/kernels/funcs/math_function.cc @@ -215,14 +215,21 @@ void set_constant_with_place( paddle::platform::errors::Unimplemented("IPUPlace is not supported")); } +template <> +void set_constant_with_place( + const paddle::platform::DeviceContext& context, + paddle::framework::Tensor* tensor, + float value) { + PADDLE_THROW( + paddle::platform::errors::Unimplemented("CustomPlace is not supported")); +} + template <> void set_constant_with_place( const paddle::platform::DeviceContext& context, paddle::framework::Tensor* tensor, float value) { - paddle::framework::VisitDataType( - paddle::framework::TransToProtoVarType(tensor->type()), - TensorSetConstantCPU(tensor, value)); + pten::VisitDataType(tensor->dtype(), TensorSetConstantCPU(tensor, value)); } template <> @@ -239,9 +246,7 @@ void set_constant_with_place( const paddle::platform::DeviceContext& context, paddle::framework::Tensor* tensor, float value) { - paddle::framework::VisitDataType( - paddle::framework::TransToProtoVarType(tensor->type()), - TensorSetConstantCPU(tensor, value)); + pten::VisitDataType(tensor->dtype(), TensorSetConstantCPU(tensor, value)); } struct TensorSetConstantWithPlace : public boost::static_visitor { diff --git a/paddle/pten/kernels/funcs/math_function.cu b/paddle/pten/kernels/funcs/math_function.cu index f7cee12b2dfd42c2296a4bd30a739bfe181efb13..8ed72dbd1c1278d320ccebfd7463e83f7c101065 100644 --- a/paddle/pten/kernels/funcs/math_function.cu +++ b/paddle/pten/kernels/funcs/math_function.cu @@ -226,9 +226,8 @@ void set_constant_with_place( const paddle::platform::DeviceContext& context, paddle::framework::Tensor* tensor, float value) { - paddle::framework::VisitDataType( - paddle::framework::TransToProtoVarType(tensor->type()), - TensorSetConstantGPU(context, tensor, value)); + pten::VisitDataType(tensor->dtype(), + TensorSetConstantGPU(context, tensor, value)); } template diff --git a/paddle/pten/kernels/funcs/math_function.h b/paddle/pten/kernels/funcs/math_function.h index 73b9dd00bc64095ea2796154ff5d32c407fd9f1b..14f5b5b41489d09e53a47a1ece22d394c22f1c53 100644 --- a/paddle/pten/kernels/funcs/math_function.h +++ b/paddle/pten/kernels/funcs/math_function.h @@ -25,6 +25,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/utils/data_type.h" namespace pten { namespace funcs { diff --git a/paddle/pten/kernels/funcs/math_function_impl.h b/paddle/pten/kernels/funcs/math_function_impl.h index 19f3082c05cc27c265fe1354fba666226b88ce1c..a66692363572adf06a0d064fbdf9c82e44eb6d6a 100644 --- a/paddle/pten/kernels/funcs/math_function_impl.h +++ b/paddle/pten/kernels/funcs/math_function_impl.h @@ -30,8 +30,8 @@ void SetConstant::operator()( #ifdef PADDLE_WITH_XPU if (paddle::platform::is_xpu_place(context.GetPlace())) { xpu_place = true; - paddle::framework::VisitDataType( - paddle::framework::TransToProtoVarType(tensor->type()), + pten::VisitDataType( + tensor->dtype(), TensorSetConstantXPU(tensor, num, context.GetPlace())); } #endif diff --git a/paddle/pten/kernels/gpu/abs_grad_kernel.cu b/paddle/pten/kernels/gpu/abs_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..a7257e129ec47ac0b5b33923832855b7907cb719 --- /dev/null +++ b/paddle/pten/kernels/gpu/abs_grad_kernel.cu @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/common/complex.h" +#include "paddle/pten/common/float16.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/abs_grad_kernel.h" +#include "paddle/pten/kernels/impl/abs_grad_kernel_impl.h" + +using pten::dtype::complex; + +PT_REGISTER_KERNEL(abs_grad, + GPU, + ALL_LAYOUT, + pten::AbsGradKernel, + float, + double, + int, + int64_t, + pten::dtype::float16, + complex, + complex) {} +PT_REGISTER_KERNEL(abs_double_grad, + GPU, + ALL_LAYOUT, + pten::AbsDoubleGradKernel, + float, + double, + int, + int64_t, + pten::dtype::float16, + complex, + complex) {} diff --git a/paddle/pten/kernels/gpu/abs_kernel.cu b/paddle/pten/kernels/gpu/abs_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..06eff050674c3670a2aa07cb43d0baea82fe7202 --- /dev/null +++ b/paddle/pten/kernels/gpu/abs_kernel.cu @@ -0,0 +1,66 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/abs_kernel.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" +#include "paddle/pten/kernels/funcs/elementwise_base.h" + +namespace pten { + +template +struct CudaAbsFunctor; + +template +struct CudaAbsFunctor>> { + __device__ __forceinline__ pten::funcs::Real operator()(const T x) const { + return abs(x); + } +}; + +template +struct CudaAbsFunctor>> { + __device__ __forceinline__ T operator()(const T x) const { + return std::abs(x); + } +}; + +template +void AbsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { + ctx.template Alloc>(out); + std::vector ins = {&x}; + std::vector outs = {out}; + auto functor = CudaAbsFunctor(); + + funcs::LaunchSameDimsElementwiseCudaKernel>( + ctx, ins, &outs, functor); +} + +} // namespace pten + +PT_REGISTER_KERNEL(abs, + GPU, + ALL_LAYOUT, + pten::AbsKernel, + float, + double, + int, + int64_t, + pten::dtype::float16, + pten::dtype::complex, + pten::dtype::complex) {} diff --git a/paddle/pten/kernels/gpu/histogram_kernel.cu b/paddle/pten/kernels/gpu/histogram_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..0d0da49e01aebbff375015ddfd7bc90309f9e4d8 --- /dev/null +++ b/paddle/pten/kernels/gpu/histogram_kernel.cu @@ -0,0 +1,160 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/kernels/funcs/math_function.h" +#include "paddle/pten/kernels/histogram_kernel.h" + +#include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/core/kernel_registry.h" + +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" + +#include "paddle/pten/kernels/funcs/eigen/common.h" +#include "paddle/pten/kernels/funcs/eigen/eigen_function.h" + +namespace pten { + +using IndexType = int64_t; +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +inline int GET_BLOCKS(const int N) { + return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; +} + +template +__device__ static IndexType GetBin(T input_value, + T min_value, + T max_value, + int64_t nbins) { + IndexType bin = static_cast((input_value - min_value) * nbins / + (max_value - min_value)); + IndexType output_index = bin < nbins - 1 ? 
bin : nbins - 1; + return output_index; +} + +template +__global__ void KernelHistogram(const T* input, + const int total_elements, + const int64_t nbins, + const T min_value, + const T max_value, + int64_t* output) { + extern __shared__ int64_t buf_hist[]; + for (int i = threadIdx.x; i < nbins; i += blockDim.x) { + buf_hist[i] = 0; + } + __syncthreads(); + + CUDA_KERNEL_LOOP(input_index, total_elements) { + // const IndexType input_index = threadIdx.x + blockIdx.x * blockDim.x; + const auto input_value = input[input_index]; + if (input_value >= min_value && input_value <= max_value) { + const IndexType output_index = + GetBin(input_value, min_value, max_value, nbins); + paddle::platform::CudaAtomicAdd(&buf_hist[output_index], 1); + } + } + __syncthreads(); + + for (int i = threadIdx.x; i < nbins; i += blockDim.x) { + paddle::platform::CudaAtomicAdd(&output[i], buf_hist[i]); + } +} + +template +void HistogramKernel(const Context& dev_ctx, + const DenseTensor& input, + int64_t bins, + int min, + int max, + DenseTensor* output) { + auto& nbins = bins; + auto& minval = min; + auto& maxval = max; + + const T* input_data = input.data(); + const int input_numel = input.numel(); + + int64_t* out_data = output->mutable_data(dev_ctx.GetPlace()); + pten::funcs::SetConstant()( + dev_ctx, output, static_cast(0)); + + if (input_data == nullptr) return; + + T output_min = static_cast(minval); + T output_max = static_cast(maxval); + + if (output_min == output_max) { + auto input_x = pten::EigenVector::Flatten(input); + + DenseTensor input_min_t, input_max_t; + auto* input_min_data = input_min_t.mutable_data({1}, dev_ctx.GetPlace()); + auto* input_max_data = input_max_t.mutable_data({1}, dev_ctx.GetPlace()); + auto input_min_scala = pten::EigenScalar::From(input_min_t); + auto input_max_scala = pten::EigenScalar::From(input_max_t); + + auto* place = dev_ctx.eigen_device(); + input_min_scala.device(*place) = input_x.minimum(); + input_max_scala.device(*place) = input_x.maximum(); + + DenseTensor input_min_cpu, input_max_cpu; + paddle::framework::TensorCopySync( + input_min_t, paddle::platform::CPUPlace(), &input_min_cpu); + paddle::framework::TensorCopySync( + input_max_t, paddle::platform::CPUPlace(), &input_max_cpu); + + output_min = input_min_cpu.data()[0]; + output_max = input_max_cpu.data()[0]; + } + if (output_min == output_max) { + output_min = output_min - 1; + output_max = output_max + 1; + } + + PADDLE_ENFORCE_EQ( + (std::isinf(static_cast(output_min)) || + std::isnan(static_cast(output_max)) || + std::isinf(static_cast(output_min)) || + std::isnan(static_cast(output_max))), + false, + pten::errors::OutOfRange("range of min, max is not finite")); + PADDLE_ENFORCE_GE( + output_max, + output_min, + pten::errors::InvalidArgument( + "max must be larger or equal to min. If min and max are both zero, " + "the minimum and maximum values of the data are used. 
" + "But received max is %d, min is %d", + maxval, + minval)); + + auto stream = dev_ctx.stream(); + KernelHistogram<<>>( + input_data, input_numel, nbins, output_min, output_max, out_data); +} + +} // namespace pten + +PT_REGISTER_KERNEL(histogram, + GPU, + ALL_LAYOUT, + pten::HistogramKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/fluid/operators/lerp_op.cu b/paddle/pten/kernels/gpu/lerp_grad_kernel.cu similarity index 54% rename from paddle/fluid/operators/lerp_op.cu rename to paddle/pten/kernels/gpu/lerp_grad_kernel.cu index 6f7d8b744d694f0cd5cbc9bb218034be435ba6f0..30fdb1206f45e5ffd68b3ab75c9bbc065d458f8e 100644 --- a/paddle/fluid/operators/lerp_op.cu +++ b/paddle/pten/kernels/gpu/lerp_grad_kernel.cu @@ -12,16 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/lerp_op.h" +#include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/impl/lerp_grad_kernel_impl.h" +#include "paddle/pten/kernels/lerp_grad_kernel.h" -REGISTER_OP_CUDA_KERNEL( - lerp, - paddle::operators::LerpKernel, - paddle::operators::LerpKernel); - -REGISTER_OP_CUDA_KERNEL( - lerp_grad, - paddle::operators::LerpGradKernel, - paddle::operators::LerpGradKernel); +PT_REGISTER_KERNEL( + lerp_grad, GPU, ALL_LAYOUT, pten::LerpGradKernel, float, double) {} diff --git a/paddle/pten/kernels/gpu/lerp_kernel.cu b/paddle/pten/kernels/gpu/lerp_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..8743cb12e491b1a6fa96d44b115304a9b8b1c7c9 --- /dev/null +++ b/paddle/pten/kernels/gpu/lerp_kernel.cu @@ -0,0 +1,20 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/impl/lerp_kernel_impl.h" +#include "paddle/pten/kernels/lerp_kernel.h" + +PT_REGISTER_KERNEL(lerp, GPU, ALL_LAYOUT, pten::LerpKernel, float, double) {} diff --git a/paddle/pten/kernels/gpu/matmul_grad_kernel.cu b/paddle/pten/kernels/gpu/matmul_grad_kernel.cu index 31c44673f94e737bd94882b2537ddf3fababf226..7df99260aa1614a29325ed1d0834400566e28139 100644 --- a/paddle/pten/kernels/gpu/matmul_grad_kernel.cu +++ b/paddle/pten/kernels/gpu/matmul_grad_kernel.cu @@ -26,6 +26,7 @@ PT_REGISTER_KERNEL(matmul_grad, float, double, paddle::platform::float16, + paddle::platform::bfloat16, paddle::platform::complex, paddle::platform::complex) {} diff --git a/paddle/pten/kernels/gpu/matmul_kernel.cu b/paddle/pten/kernels/gpu/matmul_kernel.cu index f9fdbd27bf94e4b236efe5a49e471e39c4c57dd5..b365581e949c103be511e4849a45b4fd9a024f77 100644 --- a/paddle/pten/kernels/gpu/matmul_kernel.cu +++ b/paddle/pten/kernels/gpu/matmul_kernel.cu @@ -27,5 +27,6 @@ PT_REGISTER_KERNEL(matmul, float, double, paddle::platform::float16, + paddle::platform::bfloat16, paddle::platform::complex, paddle::platform::complex) {} diff --git a/paddle/pten/kernels/histogram_kernel.h b/paddle/pten/kernels/histogram_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..4bc4ef6fb9e4657305f4f967371711a0aaabb035 --- /dev/null +++ b/paddle/pten/kernels/histogram_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/pten/core/dense_tensor.h" +namespace pten { + +template +void HistogramSelectKernel(const Context& dev_ctx, + const DenseTensor& input, + int64_t bins, + int min, + int max, + DenseTensor* out); + +} // namespace pten diff --git a/paddle/pten/kernels/impl/abs_grad_kernel_impl.h b/paddle/pten/kernels/impl/abs_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..ff829e10b2d8bf06971173978627632bf18fa93f --- /dev/null +++ b/paddle/pten/kernels/impl/abs_grad_kernel_impl.h @@ -0,0 +1,57 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/abs_grad_kernel.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" + +namespace pten { + +template +void AbsGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& dout, + DenseTensor* dx) { + auto numel = dout.numel(); + auto* dout_data = dout.data>(); + auto* x_data = x.data(); + + ctx.template Alloc(dx, static_cast(numel * sizeof(T))); + auto* dx_data = dx->data(); + + paddle::platform::ForRange for_range(ctx, numel); + pten::funcs::AbsGradFunctor functor(dout_data, x_data, dx_data, numel); + for_range(functor); +} + +template +void AbsDoubleGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& ddx, + DenseTensor* ddout) { + auto numel = ddx.numel(); + auto* ddx_data = ddx.data(); + auto* x_data = x.data(); + ctx.template Alloc(ddout, static_cast(numel * sizeof(T))); + auto* ddout_data = ddout->data(); + + paddle::platform::ForRange for_range(ctx, numel); + pten::funcs::AbsGradGradFunctor functor( + ddx_data, x_data, ddout_data, numel); + for_range(functor); +} + +} // namespace pten diff --git a/paddle/pten/kernels/impl/complex_kernel_impl.h b/paddle/pten/kernels/impl/complex_kernel_impl.h index 7e4c4f0d66d4fc89634eb7bde9eb24e2743d4a7c..17cfb886e57b813fa744ebc232d6cc38e6b0f951 100644 --- a/paddle/pten/kernels/impl/complex_kernel_impl.h +++ b/paddle/pten/kernels/impl/complex_kernel_impl.h @@ -15,8 +15,8 @@ #pragma once // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" namespace pten { @@ -29,7 +29,7 @@ void ConjKernel(const Context& dev_ctx, auto* out_data = dev_ctx.template Alloc(out); paddle::platform::ForRange for_range(dev_ctx, numel); - paddle::operators::math::ConjFunctor functor(x_data, numel, out_data); + pten::funcs::ConjFunctor functor(x_data, numel, out_data); for_range(functor); } diff --git a/paddle/pten/kernels/impl/dot_grad_kernel_impl.h b/paddle/pten/kernels/impl/dot_grad_kernel_impl.h index d4ea9fc944527145269fdfd1a854aca1299a6018..4ed47bd69dd5f6b37b179cc9534fde64f949b5de 100644 --- a/paddle/pten/kernels/impl/dot_grad_kernel_impl.h +++ b/paddle/pten/kernels/impl/dot_grad_kernel_impl.h @@ -20,7 +20,7 @@ limitations under the License. 
*/ #include "paddle/pten/kernels/complex_kernel.h" #include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/fluid/operators/math/complex_functors.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" namespace pten { @@ -35,9 +35,7 @@ struct DotGradFunction { }; template -struct DotGradFunction> { +struct DotGradFunction> { void operator()(const DeviceContext& ctx, const DenseTensor* tensor_x, const DenseTensor* tensor_y, @@ -133,9 +131,7 @@ struct DotGradFunction -struct DotGradFunction> { +struct DotGradFunction> { void operator()(const DeviceContext& ctx, const DenseTensor* tensor_x, const DenseTensor* tensor_y, @@ -221,9 +217,7 @@ struct DotDoubleGradFunction { }; template -struct DotDoubleGradFunction> { +struct DotDoubleGradFunction> { void operator()(const DeviceContext& ctx, const DenseTensor* tensor_x, const DenseTensor* tensor_y, @@ -334,9 +328,7 @@ struct DotDoubleGradFunction -struct DotDoubleGradFunction> { +struct DotDoubleGradFunction> { void operator()(const DeviceContext& ctx, const DenseTensor* tensor_x, const DenseTensor* tensor_y, @@ -461,9 +453,7 @@ struct DotTripleGradFunction { // TODO(wuweilong): enable this function when the unittests framewark for multi // grad is ok (dtype: complex64 or complex128). template -struct DotTripleGradFunction> { +struct DotTripleGradFunction> { void operator()(const DeviceContext& ctx, const DenseTensor* in_tensor_x, const DenseTensor* in_tensor_y, @@ -656,9 +646,7 @@ struct DotTripleGradFunction -struct DotTripleGradFunction> { +struct DotTripleGradFunction> { void operator()(const DeviceContext& ctx, const DenseTensor* in_tensor_x, const DenseTensor* in_tensor_y, diff --git a/paddle/pten/kernels/impl/lerp_grad_kernel_impl.h b/paddle/pten/kernels/impl/lerp_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..5285c69e39a17d5971212e83d85c095bd14ed873 --- /dev/null +++ b/paddle/pten/kernels/impl/lerp_grad_kernel_impl.h @@ -0,0 +1,133 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/pten/kernels/funcs/common_shape.h" +#include "paddle/pten/kernels/funcs/eigen/common.h" + +namespace pten { + +template +static void LerpGradFunction(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& weight, + const DenseTensor& out, + const DenseTensor& out_grad, + DenseTensor* x_grad, + DenseTensor* y_grad) { + auto& w = weight; + auto& dout = out_grad; + auto* dx = x_grad; + auto* dy = y_grad; + + auto dout_dims = dout.dims(); + auto dx_dims = pten::funcs::ExtendDims2Rank(dx->dims(), D); + auto dy_dims = pten::funcs::ExtendDims2Rank(dy->dims(), D); + auto w_dims = pten::funcs::ExtendDims2Rank(w.dims(), D); + Eigen::DSizes dx_bcast_dims; + Eigen::DSizes dy_bcast_dims; + Eigen::DSizes w_bcast_dims; + pten::funcs::GetBroadcastDims(dx_dims, dout_dims, &dx_bcast_dims); + pten::funcs::GetBroadcastDims(dy_dims, dout_dims, &dy_bcast_dims); + pten::funcs::GetBroadcastDims(w_dims, dout_dims, &w_bcast_dims); + + auto eigen_w = pten::EigenTensor::From(w, w_dims); + auto eigen_dout = pten::EigenTensor::From(dout); + + Eigen::DSizes dx_reshape_dims; + Eigen::DSizes dy_reshape_dims; + Eigen::DSizes reduce_dims; + for (int i = 0; i < dout_dims.size(); ++i) { + dx_reshape_dims[2 * i] = dx_bcast_dims[i]; + dx_reshape_dims[2 * i + 1] = dx_dims[i]; + dy_reshape_dims[2 * i] = dy_bcast_dims[i]; + dy_reshape_dims[2 * i + 1] = dy_dims[i]; + reduce_dims[i] = 2 * i; + } + + auto& place = *ctx.eigen_device(); + + if (dx) { + ctx.template Alloc(dx); + auto eigen_dx = pten::EigenTensor::From(*dx, dx_dims); + auto eigen_expr = (1 - eigen_w.broadcast(w_bcast_dims)) * eigen_dout; + eigen_dx.device(place) = eigen_expr.reshape(dx_reshape_dims) + .sum(reduce_dims) + .reshape(eigen_dx.dimensions()); + } + if (dy) { + ctx.template Alloc(dy); + auto eigen_dy = pten::EigenTensor::From(*dy, dy_dims); + auto eigen_expr = eigen_w.broadcast(w_bcast_dims) * eigen_dout; + eigen_dy.device(place) = eigen_expr.reshape(dy_reshape_dims) + .sum(reduce_dims) + .reshape(eigen_dy.dimensions()); + } +} + +template +void LerpGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& weight, + const DenseTensor& out, + const DenseTensor& out_grad, + DenseTensor* x_grad, + DenseTensor* y_grad) { + int rank = out.dims().size(); + PADDLE_ENFORCE_GE( + rank, + 1, + pten::errors::InvalidArgument( + "The number of dimensions for LerpGradOp must be " + "greater than or equal to 1, but the value received is %d.", + rank)); + PADDLE_ENFORCE_LE( + rank, + 6, + pten::errors::InvalidArgument( + "The number of dimensions for LerpGradOp must be " + "less than or equal to 6, but the value received is %d.", + rank)); + switch (rank) { + case 1: + LerpGradFunction( + ctx, x, y, weight, out, out_grad, x_grad, y_grad); + break; + case 2: + LerpGradFunction( + ctx, x, y, weight, out, out_grad, x_grad, y_grad); + break; + case 3: + LerpGradFunction( + ctx, x, y, weight, out, out_grad, x_grad, y_grad); + break; + case 4: + LerpGradFunction( + ctx, x, y, weight, out, out_grad, x_grad, y_grad); + break; + case 5: + LerpGradFunction( + ctx, x, y, weight, out, out_grad, x_grad, y_grad); + break; + case 6: + LerpGradFunction( + ctx, x, y, weight, out, out_grad, x_grad, y_grad); + break; + } +} + +} // namespace pten diff --git a/paddle/pten/kernels/impl/lerp_kernel_impl.h b/paddle/pten/kernels/impl/lerp_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..127e3e50a3651dd9cc998b9920de17566bc996ba --- /dev/null +++ 
b/paddle/pten/kernels/impl/lerp_kernel_impl.h @@ -0,0 +1,97 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/kernels/funcs/common_shape.h" +#include "paddle/pten/kernels/funcs/eigen/common.h" + +namespace pten { + +template +static void LerpFunction(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& weight, + DenseTensor* out) { + ctx.template Alloc(out); + + auto out_dims = out->dims(); + auto x_dims = pten::funcs::ExtendDims2Rank(x.dims(), D); + auto y_dims = pten::funcs::ExtendDims2Rank(y.dims(), D); + auto w_dims = pten::funcs::ExtendDims2Rank(weight.dims(), D); + Eigen::DSizes x_bcast_dims; + Eigen::DSizes y_bcast_dims; + Eigen::DSizes w_bcast_dims; + pten::funcs::GetBroadcastDims(x_dims, out_dims, &x_bcast_dims); + pten::funcs::GetBroadcastDims(y_dims, out_dims, &y_bcast_dims); + pten::funcs::GetBroadcastDims(w_dims, out_dims, &w_bcast_dims); + + auto eigen_x = pten::EigenTensor::From(x, x_dims); + auto eigen_y = pten::EigenTensor::From(y, y_dims); + auto eigen_w = pten::EigenTensor::From(weight, w_dims); + auto eigen_out = pten::EigenTensor::From(*out); + + auto& place = *ctx.eigen_device(); + eigen_out.device(place) = + eigen_x.broadcast(x_bcast_dims) + + eigen_w.broadcast(w_bcast_dims) * + (eigen_y.broadcast(y_bcast_dims) - eigen_x.broadcast(x_bcast_dims)); +} + +template +void LerpKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& weight, + DenseTensor* out) { + int rank = out->dims().size(); + PADDLE_ENFORCE_GE( + rank, + 1, + pten::errors::InvalidArgument( + "The number of dimensions for LerpOp must be " + "greater than or equal to 1, but the value received is %d.", + rank)); + PADDLE_ENFORCE_LE( + rank, + 6, + pten::errors::InvalidArgument( + "The number of dimensions for LerpOp must be " + "less than or equal to 6, but the value received is %d.", + rank)); + switch (rank) { + case 1: + LerpFunction(ctx, x, y, weight, out); + break; + case 2: + LerpFunction(ctx, x, y, weight, out); + break; + case 3: + LerpFunction(ctx, x, y, weight, out); + break; + case 4: + LerpFunction(ctx, x, y, weight, out); + break; + case 5: + LerpFunction(ctx, x, y, weight, out); + break; + case 6: + LerpFunction(ctx, x, y, weight, out); + break; + } +} + +} // namespace pten diff --git a/paddle/pten/kernels/impl/matmul_kernel_impl.h b/paddle/pten/kernels/impl/matmul_kernel_impl.h index 858807a1d4d6496d5e3091aa71f5b2dada03b92e..addea622f140210ae714da2eda775f6ce6568eca 100644 --- a/paddle/pten/kernels/impl/matmul_kernel_impl.h +++ b/paddle/pten/kernels/impl/matmul_kernel_impl.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/complex_functors.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" #include "paddle/pten/core/dense_tensor.h" diff --git a/paddle/pten/kernels/lerp_grad_kernel.h b/paddle/pten/kernels/lerp_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..18a38e724505f214e29fd4d18f187f0c59012700 --- /dev/null +++ b/paddle/pten/kernels/lerp_grad_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/pten/core/dense_tensor.h" + +namespace pten { + +template +void LerpGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& weight, + const DenseTensor& out, + const DenseTensor& out_grad, + DenseTensor* x_grad, + DenseTensor* y_grad); + +} // namespace pten diff --git a/paddle/pten/kernels/lerp_kernel.h b/paddle/pten/kernels/lerp_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..8e70a53c06b5cb3930f89c241a8bfd82a511a6b6 --- /dev/null +++ b/paddle/pten/kernels/lerp_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/pten/core/dense_tensor.h" + +namespace pten { + +template +void LerpKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& weight, + DenseTensor* out); + +} // namespace pten diff --git a/paddle/pten/kernels/selected_rows/scale_kernel.cc b/paddle/pten/kernels/selected_rows/scale_kernel.cc index 09700d8afe0508e51cbdaff8404d97c4e25f5b9d..32f7a41a5b9688710450713a4b96c68906d26ad5 100644 --- a/paddle/pten/kernels/selected_rows/scale_kernel.cc +++ b/paddle/pten/kernels/selected_rows/scale_kernel.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,10 +16,8 @@ limitations under the License. */ #include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/backends/gpu/gpu_context.h" -#include "paddle/pten/core/kernel_registry.h" - -// See Note [ Why still include the fluid headers? 
] #include "paddle/pten/common/bfloat16.h" +#include "paddle/pten/core/kernel_registry.h" namespace pten { template diff --git a/paddle/pten/ops/compat/abs_sig.cc b/paddle/pten/ops/compat/abs_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..a610db46a16f4da0df1ebbf2cd0d5fda174cde50 --- /dev/null +++ b/paddle/pten/ops/compat/abs_sig.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/core/compat/op_utils.h" + +namespace pten { + +KernelSignature AbsOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("abs", {"X"}, {}, {"Out"}); +} + +KernelSignature AbsGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "abs_grad", {"X", GradVarName("Out")}, {}, {GradVarName("X")}); +} + +KernelSignature AbsDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("abs_double_grad", {"X", "DDX"}, {}, {"DDOut"}); +} + +} // namespace pten + +PT_REGISTER_ARG_MAPPING_FN(abs, pten::AbsOpArgumentMapping); +PT_REGISTER_ARG_MAPPING_FN(abs_grad, pten::AbsGradOpArgumentMapping); +PT_REGISTER_ARG_MAPPING_FN(abs_double_grad, + pten::AbsDoubleGradOpArgumentMapping); diff --git a/paddle/pten/ops/compat/elementwise_sig.cc b/paddle/pten/ops/compat/elementwise_sig.cc index c1941f6dde30baca60c3647ca0e2267c8a0d65f1..6541334ee27ec21d92ebcab67af1186bafadbfb2 100644 --- a/paddle/pten/ops/compat/elementwise_sig.cc +++ b/paddle/pten/ops/compat/elementwise_sig.cc @@ -75,6 +75,31 @@ KernelSignature ElementwiseAddGradOpArgumentMapping( return KernelSignature("unregistered", {}, {}, {}); } +KernelSignature ElementwiseAddDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "add_double_grad", {"Y", "DDX", "DDY", "DOut"}, {"axis"}, {"DDOut"}); +} + +KernelSignature ElementwiseAddTripleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("add_triple_grad", + {"DDX", "DDY", "D_DDOut"}, + {"axis"}, + {"D_DDX", "D_DDY"}); +} + +KernelSignature ElementwiseSubGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + if (ctx.IsDenseTensorInput("X")) { + return KernelSignature("subtract_grad", + {"X", "Y", GradVarName("Out")}, + {"axis"}, + {GradVarName("X"), GradVarName("Y")}); + } + return KernelSignature("unregistered", {}, {}, {}); +} + } // namespace pten PT_REGISTER_BASE_KERNEL_NAME(elementwise_add, add); @@ -82,6 +107,9 @@ PT_REGISTER_BASE_KERNEL_NAME(elementwise_sub, subtract); PT_REGISTER_BASE_KERNEL_NAME(elementwise_mul, multiply); PT_REGISTER_BASE_KERNEL_NAME(elementwise_div, divide); PT_REGISTER_BASE_KERNEL_NAME(elementwise_add_grad, add_grad); +PT_REGISTER_BASE_KERNEL_NAME(elementwise_add_grad_grad, add_double_grad); +PT_REGISTER_BASE_KERNEL_NAME(elementwise_add_triple_grad, add_triple_grad); +PT_REGISTER_BASE_KERNEL_NAME(elementwise_sub_grad, subtract_grad); PT_REGISTER_ARG_MAPPING_FN(elementwise_add, 
pten::ElementwiseAddOpArgumentMapping); @@ -93,3 +121,9 @@ PT_REGISTER_ARG_MAPPING_FN(elementwise_div, pten::ElementwiseDivOpArgumentMapping); PT_REGISTER_ARG_MAPPING_FN(elementwise_add_grad, pten::ElementwiseAddGradOpArgumentMapping); +PT_REGISTER_ARG_MAPPING_FN(elementwise_add_grad_grad, + pten::ElementwiseAddDoubleGradOpArgumentMapping); +PT_REGISTER_ARG_MAPPING_FN(elementwise_add_triple_grad, + pten::ElementwiseAddTripleGradOpArgumentMapping); +PT_REGISTER_ARG_MAPPING_FN(elementwise_sub_grad, + pten::ElementwiseSubGradOpArgumentMapping); diff --git a/paddle/pten/ops/compat/histogram_sig.cc b/paddle/pten/ops/compat/histogram_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..9849c998d779e46bb955f0bc98686c247fc99b18 --- /dev/null +++ b/paddle/pten/ops/compat/histogram_sig.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/core/compat/op_utils.h" + +namespace pten { + +KernelSignature HistogramOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("histogram", {"X"}, {"bins", "min", "max"}, {"Out"}); +} + +} // namespace pten + +PT_REGISTER_ARG_MAPPING_FN(histogram, pten::HistogramOpArgumentMapping); diff --git a/paddle/pten/ops/compat/lerp_sig.cc b/paddle/pten/ops/compat/lerp_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..d225ff2bfd3e2bd68130c6d8a14b71df0069c5c5 --- /dev/null +++ b/paddle/pten/ops/compat/lerp_sig.cc @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
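Aside (editor's note, not part of this diff): the elementwise_sub_grad -> subtract_grad mapping registered above is the route a plain subtract backward takes through the new kernel registry; a small sanity check.

    import paddle

    x = paddle.to_tensor([3.0, 5.0], stop_gradient=False)
    y = paddle.to_tensor([1.0, 2.0], stop_gradient=False)
    paddle.subtract(x, y).sum().backward()
    print(x.grad)  # [ 1.,  1.]
    print(y.grad)  # [-1., -1.]
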
+ +#include "paddle/pten/core/compat/op_utils.h" + +namespace pten { + +KernelSignature LerpOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("lerp", {"X", "Y", "Weight"}, {}, {"Out"}); +} + +KernelSignature LerpGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("lerp_grad", + {"X", "Y", "Weight", "Out", GradVarName("Out")}, + {}, + {GradVarName("X"), GradVarName("Y")}); +} + +} // namespace pten + +PT_REGISTER_ARG_MAPPING_FN(lerp, pten::LerpOpArgumentMapping); +PT_REGISTER_ARG_MAPPING_FN(lerp_grad, pten::LerpGradOpArgumentMapping); diff --git a/paddle/pten/tests/core/CMakeLists.txt b/paddle/pten/tests/core/CMakeLists.txt index 32e6e0784dad0c716cfea384b46933f11adbe5d0..971d9112eead97f46ab1f165c9073ac525464676 100644 --- a/paddle/pten/tests/core/CMakeLists.txt +++ b/paddle/pten/tests/core/CMakeLists.txt @@ -1,7 +1,6 @@ cc_test(test_dense_tensor SRCS test_dense_tensor.cc DEPS dense_tensor) cc_test(test_intrusive_ptr SRCS test_intrusive_ptr.cc) cc_test(test_type_info SRCS test_type_info.cc) -cc_test(test_convert_utils SRCS test_convert_utils.cc DEPS convert_utils) cc_test(test_kernel_factory SRCS test_kernel_factory.cc DEPS kernel_factory scale_kernel) cc_test(test_sparse_coo_tensor SRCS test_sparse_coo_tensor.cc DEPS dense_tensor sparse_coo_tensor) cc_test(test_sparse_csr_tensor SRCS test_sparse_csr_tensor.cc DEPS dense_tensor sparse_csr_tensor) diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 711b8811b973c7172af5733c70efd46cd6f25e77..35b2ce751b18fff2aac8dedfd09e5fe209d95533 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -91,6 +91,7 @@ if "%WITH_PYTHON%" == "ON" ( where pip pip install wheel --user pip install pyyaml --user + pip install wget --user pip install -r %work_dir%\python\requirements.txt --user if !ERRORLEVEL! NEQ 0 ( echo pip install requirements.txt failed! @@ -175,19 +176,20 @@ rem -------Caching strategy 1: End -------------------------------- rem -------Caching strategy 2: sccache decorate compiler----------- +if not defined SCCACHE_ROOT set SCCACHE_ROOT=D:\sccache if "%WITH_SCCACHE%"=="ON" ( - del D:\sccache\sccache_log.txt cmd /C sccache -V || call :install_sccache sccache --stop-server 2> NUL + del %SCCACHE_ROOT%\sccache_log.txt :: Localy storage on windows - if not exist D:\sccache mkdir D:\sccache - set SCCACHE_DIR=D:\sccache\.cache + if not exist %SCCACHE_ROOT% mkdir %SCCACHE_ROOT% + set SCCACHE_DIR=%SCCACHE_ROOT%\.cache :: Sccache will shut down if a source file takes more than 10 mins to compile set SCCACHE_IDLE_TIMEOUT=0 set SCCACHE_CACHE_SIZE=100G - set SCCACHE_ERROR_LOG=D:\sccache\sccache_log.txt + set SCCACHE_ERROR_LOG=%SCCACHE_ROOT%\sccache_log.txt set SCCACHE_LOG=quiet :: Distributed storage on windows @@ -208,7 +210,7 @@ if "%WITH_SCCACHE%"=="ON" ( echo There is not sccache in this PC, will install sccache. 
echo Download package from https://paddle-ci.gz.bcebos.com/window_requirement/sccache.exe %PYTHON_ROOT%\python.exe -c "import wget;wget.download('https://paddle-ci.gz.bcebos.com/window_requirement/sccache.exe')" -xcopy sccache.exe %PYTHON_ROOT%\Scripts\ /Y +xcopy sccache.exe %PYTHON_ROOT%\ /Y goto:eof rem -------Caching strategy 2: End -------------------------------- diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 8ce9716b169b9c64b82d66b949609ff502775942..12d31aee41e394968d58753f2b54fcce8648a35e 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -293,6 +293,7 @@ from .framework import CUDAPlace # noqa: F401 from .framework import NPUPlace # noqa: F401 from .framework import CUDAPinnedPlace # noqa: F401 from .framework import MLUPlace # noqa: F401 +from .framework import CustomPlace # noqa: F401 from .autograd import grad # noqa: F401 from .autograd import no_grad # noqa: F401 diff --git a/python/paddle/device/__init__.py b/python/paddle/device/__init__.py index d102473fef791124e0605008dd1844507c3b4a61..89e0ae49fc48f73840129826952a01aec07dd3ab 100644 --- a/python/paddle/device/__init__.py +++ b/python/paddle/device/__init__.py @@ -36,7 +36,11 @@ __all__ = [ # noqa 'is_compiled_with_cuda', 'is_compiled_with_rocm', 'is_compiled_with_npu', - 'is_compiled_with_mlu' + 'is_compiled_with_mlu', + 'get_all_device_type', + 'get_all_custom_device_type', + 'get_available_device', + 'get_available_custom_device', ] _cudnn_version = None @@ -225,15 +229,26 @@ def _convert_to_place(device): selected_mlus = os.getenv("FLAGS_selected_mlus", "0").split(",") device_id = int(selected_mlus[0]) place = core.MLUPlace(device_id) + elif device in core.get_all_custom_device_type(): + place = core.CustomPlace(device, 0) else: avaliable_gpu_device = re.match(r'gpu:\d+', lower_device) avaliable_xpu_device = re.match(r'xpu:\d+', lower_device) avaliable_npu_device = re.match(r'npu:\d+', lower_device) avaliable_mlu_device = re.match(r'mlu:\d+', lower_device) if not avaliable_gpu_device and not avaliable_xpu_device and not avaliable_npu_device and not avaliable_mlu_device: - raise ValueError( - "The device must be a string which is like 'cpu', 'gpu', 'gpu:x', 'xpu', 'xpu:x', 'mlu', 'mlu:x', 'npu', 'npu:x' or ipu" - ) + device_info_list = device.split(':', 1) + device_type = device_info_list[0] + if device_type in core.get_all_custom_device_type(): + device_id = device_info_list[1] + device_id = int(device_id) + place = core.CustomPlace(device_type, device_id) + else: + raise ValueError( + "The device must be a string which is like 'cpu', {}". + format(', '.join("'{}', '{}:x'".format(x, x) + for x in ['gpu', 'xpu', 'npu', 'mlu'] + + core.get_all_custom_device_type()))) if avaliable_gpu_device: if not core.is_compiled_with_cuda(): raise ValueError( @@ -338,3 +353,103 @@ def get_device(): raise ValueError("The device specification {} is invalid".format(place)) return device + + +def get_all_device_type(): + """ + Get all available device types. + + Returns: + A list of all available device types. + + Examples: + .. code-block:: python + + import paddle + paddle.device.get_all_device_type() + + # Case 1: paddlepaddle-cpu package installed, and no custom device registerd. + # Output: ['cpu'] + + # Case 2: paddlepaddle-gpu package installed, and no custom device registerd. + # Output: ['cpu', 'gpu'] + + # Case 3: paddlepaddle-cpu package installed, and custom deivce 'CustomCPU' is registerd. 
+ # Output: ['cpu', 'CustomCPU'] + + # Case 4: paddlepaddle-gpu package installed, and custom deivce 'CustomCPU' and 'CustomGPU' is registerd. + # Output: ['cpu', 'gpu', 'CustomCPU', 'CustomGPU'] + """ + return core.get_all_device_type() + + +def get_all_custom_device_type(): + """ + Get all available custom device types. + + Returns: + A list of all available custom device types. + + Examples: + .. code-block:: python + + import paddle + paddle.device.get_all_custom_device_type() + + # Case 1: paddlepaddle-gpu package installed, and no custom device registerd. + # Output: None + + # Case 2: paddlepaddle-gpu package installed, and custom deivce 'CustomCPU' and 'CustomGPU' is registerd. + # Output: ['CustomCPU', 'CustomGPU'] + """ + return core.get_all_custom_device_type() + + +def get_available_device(): + """ + Get all available devices. + + Returns: + A list of all available devices. + + Examples: + .. code-block:: python + + import paddle + paddle.device.get_available_device() + + # Case 1: paddlepaddle-cpu package installed, and no custom device registerd. + # Output: ['cpu'] + + # Case 2: paddlepaddle-gpu package installed, and no custom device registerd. + # Output: ['cpu', 'gpu:0', 'gpu:1'] + + # Case 3: paddlepaddle-cpu package installed, and custom deivce 'CustomCPU' is registerd. + # Output: ['cpu', 'CustomCPU'] + + # Case 4: paddlepaddle-gpu package installed, and custom deivce 'CustomCPU' and 'CustomGPU' is registerd. + # Output: ['cpu', 'gpu:0', 'gpu:1', 'CustomCPU', 'CustomGPU:0', 'CustomGPU:1'] + """ + return core.get_available_device() + + +def get_available_custom_device(): + """ + Get all available custom devices. + + Returns: + A list of all available custom devices. + + Examples: + .. code-block:: python + + import paddle + paddle.device.get_available_custom_device() + + # Case 1: paddlepaddle-gpu package installed, and no custom device registerd. + # Output: None + + # Case 2: paddlepaddle-gpu package installed, and custom deivce 'CustomCPU' and 'CustomGPU' is registerd. + # Output: ['CustomCPU', 'CustomGPU:0', 'CustomGPU:1'] + """ + return core.get_available_custom_device() diff --git a/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py index 59a92930d22a91f752421d1bf0f64e1f38f12e02..bc50bef010941a48c367046221d17c75138753c2 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py @@ -54,14 +54,15 @@ class ParameterServerOptimizer(MetaOptimizerBase): attrs['user_defined_strategy'] = self.user_defined_strategy attrs['trainer'] = TrainerRuntimeConfig(self.user_defined_strategy) attrs['ps_mode'] = attrs['trainer'].mode - + logger.info("ps_mode: {}".format(attrs['ps_mode'])) attrs['role_maker'] = self.role_maker attrs[ 'is_heter_ps_mode'] = self.role_maker._is_heter_parameter_server_mode attrs['is_worker'] = self.role_maker._is_worker() attrs['is_server'] = self.role_maker._is_server() attrs['is_heter_worker'] = self.role_maker._is_heter_worker() - + logger.info("this process is heter? 
{}".format(attrs[ + 'is_heter_worker'])) attrs['use_ps_gpu'] = self.user_defined_strategy.a_sync_configs[ "use_ps_gpu"] attrs['lr_decay_steps'] = self.user_defined_strategy.a_sync_configs[ diff --git a/python/paddle/distributed/passes/ps_trainer_pass.py b/python/paddle/distributed/passes/ps_trainer_pass.py index fff10a2d4684afe51295cc460f8dc3424d13c4f5..3f39db69abdb2930ec40ffb02cb34dce7be6a034 100755 --- a/python/paddle/distributed/passes/ps_trainer_pass.py +++ b/python/paddle/distributed/passes/ps_trainer_pass.py @@ -47,7 +47,7 @@ class AppendSendOpsPass(PassBase): # 该 pass 被多种模式复用 if ps_mode in [DistributedMode.SYNC, DistributedMode.HALF_ASYNC]: dummy_output = program.global_block().create_var( name=framework.generate_control_dev_var_name()) - + logger.info("dummy_output: {}".format(dummy_output)) program.global_block().append_op( type="send", inputs={"X": send_input_vars}, @@ -61,7 +61,7 @@ class AppendSendOpsPass(PassBase): # 该 pass 被多种模式复用 return dummy_output - def _append_barrier_op(self, program, dummys): + def _append_barrier_op(self, program, dummys, trainer_id): program.global_block().append_op( type="send_barrier", inputs={"X": dummys}, @@ -79,19 +79,24 @@ class AppendSendOpsPass(PassBase): # 该 pass 被多种模式复用 send_ctx = get_geo_trainer_send_context(attrs) # geo 模式 else: send_ctx = get_the_one_send_context(attrs) # async、sync 等各种模式 + logger.info("send_ctx: {}".format(send_ctx)) dummys = [] for merged_name, send in send_ctx.items(): if send.is_sparse() and ps_mode != DistributedMode.GEO: continue + logger.info('merged_name, send: {}, {}'.format(merged_name, send)) is_sparse = 1 if send.is_sparse() else 0 is_sparse = 2 if send.is_distributed() else is_sparse dummys.append( self._append_send_op(main_program, send.origin_varnames(), merged_name, is_sparse, send.table_id(), ps_mode)) - + logger.info('ps trainer pass - ps mode: {}'.format(ps_mode)) + logger.info('dummys: {}'.format(dummys)) if ps_mode in [DistributedMode.SYNC, DistributedMode.HALF_ASYNC]: - self._append_barrier_op(main_program, dummys) + logger.info('insert send_barrier_op') + trainer_id = get_role_id(attrs['role_maker']) + self._append_barrier_op(main_program, dummys, trainer_id) @register_pass("distributed_ops_pass") diff --git a/python/paddle/distributed/ps/utils/ps_program_builder.py b/python/paddle/distributed/ps/utils/ps_program_builder.py index d649a74e4d621bbc531ce194242fbbd07b01209a..c6afd0cb03bf3f8d164082d9cbadf8dd7c08254f 100755 --- a/python/paddle/distributed/ps/utils/ps_program_builder.py +++ b/python/paddle/distributed/ps/utils/ps_program_builder.py @@ -97,7 +97,7 @@ class CpuSyncPsProgramBuilder(PsProgramBuilder): def __init__(self, pass_ctx): logger.info("start building cpu-sync-ps program") super(CpuSyncPsProgramBuilder, self).__init__(pass_ctx) - if self.ps_mode == DistributedMode.GEO: + if self.ps_mode != DistributedMode.SYNC: raise ValueError("ps mode: {} not matched {}", format(ps_mode, "CpuSyncPsProgramBuilder")) diff --git a/python/paddle/distributed/ps/utils/public.py b/python/paddle/distributed/ps/utils/public.py index a8587874776bb5f5586dd23ed32c1ee810ad97c0..7743db1057dd66e7467efee0cc0253c083ff335c 100755 --- a/python/paddle/distributed/ps/utils/public.py +++ b/python/paddle/distributed/ps/utils/public.py @@ -83,8 +83,10 @@ class DistributedMode: class TrainerRuntimeConfig(object): def __init__(self, valid_strategy): - + self.mode = None k_steps = valid_strategy.a_sync_configs["k_steps"] + logger.info("ps mode in strategy: {}, {}".format( + valid_strategy.a_sync, 
valid_strategy.a_sync_configs["k_steps"])) if not valid_strategy.a_sync and k_steps == 0: self.mode = DistributedMode.SYNC @@ -94,7 +96,6 @@ class TrainerRuntimeConfig(object): if valid_strategy.a_sync and k_steps > 0: self.mode = DistributedMode.GEO - self.mode = None num_threads = os.getenv("CPU_NUM", "1") self.runtime_configs = {} @@ -161,6 +162,13 @@ def get_dist_env(): } +def get_role_id(role_maker): + try: + return role_maker._role_id() + except Exception: + return role_maker.role_id() + + def get_ps_endpoint(role_maker): try: return role_maker._get_pserver_endpoints()[get_role_id(role_maker)] @@ -184,7 +192,7 @@ def get_trainer_endpoint(role_maker): def get_previous_stage_trainers(role_maker): try: - return role_maker_get_previous_trainers() + return role_maker._get_previous_trainers() except Exception: return role_maker.get_previous_trainers() @@ -229,18 +237,11 @@ def get_sparse_tablenames(program, is_distributed): return list(tablenames) -def get_role_id(role_maker): - try: - return role_maker._role_id() - except Exception: - return role_maker.role_id() - - def get_ps_endpoints(role_maker): try: - return role_maker._get_pserver_endpoints()[get_role_id(role_maker)] + return role_maker._get_pserver_endpoints() except Exception: - return role_maker.get_pserver_endpoints()[get_role_id(role_maker)] + return role_maker.get_pserver_endpoints() def get_trainers(role_maker): @@ -296,8 +297,35 @@ def get_geo_trainer_send_context(context): if context['ps_mode'] != DistributedMode.GEO: raise ValueError("ps mode: {} not matched {}", format(ps_mode, "get_geo_trainer_send_context")) - send_ctx = {} + trainer_id = get_role_id(context['role_maker']) + idx = 0 + + distibuted_varnames = get_sparse_tablenames(context['origin_main_program'], + True) + for merged in context['merged_sparse_pairs']: + param, grad = merged + grad_name = grad.merged_var.name + param_name = param.merged_var.name + is_distributed = True if param_name in distibuted_varnames else False + + var = context['origin_main_program'].global_block().vars[ + grad.merged_var.name] + var_numel = reduce(lambda x, y: x * y, var.shape[1:]) + + sparse_ctx = CommContext(grad_name, [grad_name], ["127.0.0.1:6071"], + [var_numel], [grad_name], trainer_id, True, + True, is_distributed, idx, False) + idx += 1 + send_ctx[sparse_ctx.var_name()] = sparse_ctx + + if len(send_ctx) == 0: + raise ValueError("GeoSGD require sparse parameters in your net.") + + if len(context['tensor_table']) > 0 and context['is_worker']: + name, ctx = _step_ctx(idx, context['role_maker']) + send_ctx[name] = ctx + return send_ctx @@ -1253,6 +1281,60 @@ def find_op_input_output(program, block, op): return input_var_list, output_var_list +def add_heter_send_op(program, heter_program, block, block_var_detail): + def _get_send_op_dict(): + send_op_dict = {} + send_op_list = find_send_op(program) + for op in send_op_list: + input_list, _ = find_op_input_output(program, + program.global_block(), op) + for var in input_list: + send_op_dict[var] = op + return send_op_dict + + send_grad_var_list = [] + send_op_dict = _get_send_op_dict() + table_dict = {} + for persistable_var in block_var_detail["backward"]["persistables"]: + if "@GRAD" not in persistable_var: + continue + if "GRAD" != persistable_var.split("@")[-1]: + continue + if persistable_var not in send_op_dict: + continue + send_op = send_op_dict[persistable_var] + is_sparse = send_op.attr('is_sparse') + table_id = send_op.attr('table_id') + send_varnames = send_op.attr('send_varnames') + 
send_grad_var_list.append(persistable_var) + if table_id not in table_dict: + table_dict[table_id] = {} + table_dict[table_id]['var_list'] = [] + table_dict[table_id]['is_sparse'] = is_sparse + table_dict[table_id]['send_varnames'] = send_varnames + table_dict[table_id]['var_list'].append(persistable_var) + + for table_id in table_dict: + dummy_output = block.create_var( + name=framework.generate_control_dev_var_name()) + send_input_vars = [ + block.vars[union_var] + for union_var in table_dict[table_id]['var_list'] + ] + block.append_op( + type="send", + inputs={"X": send_input_vars}, + outputs={"Out": dummy_output}, + attrs={ + "send_varnames": table_dict[table_id]['send_varnames'], + "is_sparse": is_sparse, + "table_id": table_id, + RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE + }) + + return send_grad_var_list + + def get_vars_name_in_block(block): vars_list = block.vars.keys() vars_name_list = [var_name for var_name in vars_list] @@ -1302,10 +1384,6 @@ def create_backward_block(program, origin_program, bp_ops_list, return heter_block -def debug_program(file, program, is_trainer): - if is_trainer: - with open(file, 'w+') as f: - f.write(str(program)) - else: - with open(file, 'w+') as f: - f.write(str(program)) +def debug_program(file, program): + with open(file, 'w+') as f: + f.write(str(program)) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index db6faa1a1b16578b95db4d81ab5bd66e5a003f75..997075590e5cf97241188b847c0c5b5036ecee59 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -71,7 +71,7 @@ from .param_attr import ParamAttr, WeightNormParamAttr from .data_feeder import DataFeeder from .core import LoDTensor, LoDTensorArray, Scope, _Scope -from .core import CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, NPUPlace, IPUPlace, MLUPlace +from .core import CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, NPUPlace, IPUPlace, MLUPlace, CustomPlace from .incubate import fleet from .transpiler import DistributeTranspiler, \ memory_optimize, release_memory, DistributeTranspilerConfig diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index 26371d0d6ee7353f5660e55a6e381a177f378fd9..3bcefc41d2e781aa904f7ab581af3d72bc97b0d9 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -104,14 +104,14 @@ def check_type(input, input_name, expected_type, op_name, extra_message=''): expected_type += (core.VarBase, ) # TODO(jiabin): uncomment it when we support declarative mode in eager # if _in_eager_mode(): - # expected_type += (core.eager.EagerTensor, ) + # expected_type += (core.eager.Tensor, ) elif isinstance(input, core.VarBase): raise TypeError( "Please use `with fluid.dygraph.guard()` as context or `fluid.enable_dygraph()` to switch to imperative mode firstly. " "Because received '{}' in {} is a imperative Variable.".format( input_name, op_name)) elif hasattr(core, "eager"): - if isinstance(input, core.eager.EagerTensor): + if isinstance(input, core.eager.Tensor): raise TypeError( "Please use `with fluid.dygraph.guard()` as context or `fluid.enable_dygraph()` to switch to imperative mode firstly. 
" "Because received '{}' in {} is a imperative Variable.".format( diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index f4ccd033aa5fc41f67d63802bc1abdc6722adb3a..706ec0d523b938fda0501dfd04f1fc976bf6a26b 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -253,7 +253,7 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase): try: if in_dygraph_mode(): if _in_eager_mode(): - data = core.eager.read_next_eager_tensor_list( + data = core.eager.read_next_tensor_list( self._reader.read_next_list()[0]) else: data = self._reader.read_next_var_list() @@ -449,7 +449,7 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): while self._blocking_queue.size() >= len(self._places): if in_dygraph_mode(): if _in_eager_mode(): - data = core.eager.read_next_eager_tensor_list( + data = core.eager.read_next_tensor_list( self._reader.read_next_list()[0]) else: self._reader.read_next_var_list() @@ -705,7 +705,7 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): if in_dygraph_mode(): if _in_eager_mode(): - data = core.eager.read_next_eager_tensor_list( + data = core.eager.read_next_tensor_list( self._reader.read_next_list()[0]) else: data = self._reader.read_next_var_list() diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index 9234577b8cc23a6bd2ed8986dfdcce0d21eeb3b3..8c2ff140ea4d5531a0ab6e284b1661573d9a2670 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -721,10 +721,9 @@ def to_variable(value, name=None, zero_copy=None, dtype=None): value = value.astype(dtype) if _in_eager_mode(): - return core.eager.EagerTensor(value, - framework._current_expected_place(), - False, zero_copy, name - if name else None, True) + return core.eager.Tensor(value, + framework._current_expected_place(), False, + zero_copy, name if name else None, True) else: py_var = core.VarBase( value=value, diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py index f8800f3037b408b4ad6a8b33beb1282cff185f5e..dc1095849a3d8fa5de689a518934e4dea8dff99f 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py @@ -707,6 +707,8 @@ class ProgramCache(object): def __init__(self): # {hash_id : (concrete_program, partial_layer)} self._caches = collections.OrderedDict() + # trace mostly recent used program + self._recent_key = None def _build_once(self, cache_key): concrete_program = ConcreteProgram.from_func_spec( @@ -722,6 +724,7 @@ class ProgramCache(object): raise ValueError('type(item) should be CacheKey, but received %s' % type_name(item)) item_id = hash(item) + self._recent_key = item_id if item_id not in self._caches: self._caches[item_id] = self._build_once(item) # Note: raise warnings if number of traced program is more than `max_tracing_count` @@ -749,8 +752,8 @@ class ProgramCache(object): def last(self): assert len( self._caches) >= 1, "No valid cached program in ProgramCache." 
- key = next(reversed(self._caches.keys())) - return key, self._caches[key] + assert self._recent_key is not None + return self._recent_key, self._caches[self._recent_key] def __len__(self): return len(self._caches) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py b/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py index 368a01de81efc5dbfc2561f8a0023e0774e12f69..98e76c0f46ffc53abd84f8682b21e0c7ae204e8e 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py @@ -349,7 +349,11 @@ class StaticAnalysisVisitor(object): ret_type = {NodeVarType.type_from_annotation(node.annotation)} # if annotation and value(Constant) are diffent type, we use value type if node.value: - ret_type = self.node_to_wrapper_map[node.value].node_var_type + node_value_type = self.node_to_wrapper_map[ + node.value].node_var_type + if not (node_value_type & + {NodeVarType.UNKNOWN, NodeVarType.STATEMENT}): + ret_type = node_value_type if isinstance(node.target, gast.Name): self.node_to_wrapper_map[node.target].node_var_type = ret_type self.var_env.set_var_type(node.target.id, ret_type) diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 6a65b3bd9c6844c18ea49fd85ef61610cec1f7c2..53dbf1a66b27f35a75b44a0b6444cd8282c5278c 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -342,7 +342,7 @@ class Layer(object): import paddle import numpy as np - # the forward_post_hook change the input of the layer: input = input * 2 + # the forward_pre_hook change the input of the layer: input = input * 2 def forward_pre_hook(layer, input): # user can use layer and input for information statistis tasks diff --git a/python/paddle/fluid/dygraph/math_op_patch.py b/python/paddle/fluid/dygraph/math_op_patch.py index 64c418fabb11f6a82ca328aa74ac540480477fba..d93791a1f083a56f2f9f7b8d1c09e675c490e9e8 100644 --- a/python/paddle/fluid/dygraph/math_op_patch.py +++ b/python/paddle/fluid/dygraph/math_op_patch.py @@ -222,7 +222,7 @@ def monkey_patch_math_varbase(): # 2. 
create varbase for scalar lhs_dtype = self.dtype if _in_eager_mode(): - other_var_should_be = core.eager.EagerTensor + other_var_should_be = core.eager.Tensor else: other_var_should_be = core.VarBase if not isinstance(other_var, other_var_should_be): @@ -343,7 +343,7 @@ def monkey_patch_math_varbase(): if core._in_eager_mode(): local_already_patch = _already_patch_eager_tensor _already_patch_eager_tensor = True - local_tensor = core.eager.EagerTensor + local_tensor = core.eager.Tensor else: local_already_patch = _already_patch_varbase _already_patch_varbase = True diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 418b80c6ee81620ac0beb94839f869e3334626f5..e1857a34f03f514e04e83e9596c9826569e2a90d 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -2233,6 +2233,19 @@ class NCE(layers.Layer): self._inputs['Weight'] = self.weight def forward(self, input, label, sample_weight=None): + if in_dygraph_mode(): + attrs = ('num_total_classes', self._attrs['num_total_classes'], + 'num_neg_samples', self._attrs['num_neg_samples'], 'seed', + self._attrs['seed'], 'sampler', self._attrs['sampler'], + 'is_sparse', self._attrs['is_sparse'], 'remote_prefetch', + self._attrs['remote_prefetch']) + cost, _, _ = _C_ops.nce( + input, label, self.weight, self.bias, + self._inputs['SampleWeight'], self._inputs['CustomDistProbs'], + self._inputs['CustomDistAlias'], + self._inputs['CustomDistAliasProbs'], *attrs) + return cost / (self._num_neg_samples + 1) + check_variable_and_dtype(input, "input", ['float32', 'float64'], "NCE") check_variable_and_dtype(label, "label", ['int64'], "NCE") check_type(sample_weight, 'sample_weight', (Variable, type(None)), diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index f5d569828775e6bcc90ffecb3d820696bf0e56c0..6f0305f4774d6429951ee69a5b3a9db1bed18131 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -150,7 +150,7 @@ def monkey_patch_varbase(): """ if core._in_eager_mode(): - base_tensor = core.eager.EagerTensor + base_tensor = core.eager.Tensor else: base_tensor = core.VarBase assert isinstance(value, (np.ndarray, base_tensor, dict, str)), \ @@ -180,9 +180,9 @@ def monkey_patch_varbase(): "Variable dtype not match, Variable [ {} ] need tensor with dtype {} but load tensor with dtype {}".format( self.name, self_tensor_np.dtype, value_np.dtype) - # NOTE(wuweilong): self could be VarBase or EagerTensor, the subsequent behavior are defined in different files + # NOTE(wuweilong): self could be VarBase or Tensor, the subsequent behavior are defined in different files # if self is VarBase, method value() return Variable that bindded in imperative.cc, get_tensor() bindded in pybind.cc - # if self is EagerTensor, method value() return self that defined in this file, get_tensor() defined in eager_method.cc + # if self is Tensor, method value() return self that defined in this file, get_tensor() defined in eager_method.cc # this Interface behavior will be unifed in the future. self.value().get_tensor().set(value_np, framework._current_expected_place()) @@ -244,8 +244,8 @@ def monkey_patch_varbase(): if grad_tensor is not None: if core._in_eager_mode(): assert isinstance( - grad_tensor, core.eager.EagerTensor - ), "The type of grad_tensor must be paddle.Tensor" + grad_tensor, core.eager. 
+ Tensor), "The type of grad_tensor must be paddle.Tensor" else: assert isinstance( grad_tensor, paddle. @@ -592,8 +592,8 @@ def monkey_patch_varbase(): # [0.79010487, 0.53972793, 0.09495186, 0.44267157, 0.72112119]]) """ if core._in_eager_mode(): - from paddle.tensor.to_string import eager_tensor_to_string - return eager_tensor_to_string(self) + from paddle.tensor.to_string import tensor_to_string + return tensor_to_string(self) else: from paddle.tensor.to_string import to_string return to_string(self) @@ -624,7 +624,7 @@ def monkey_patch_varbase(): "Only Leaf Tensor support the deepcopy at the moment, non-Leaf Tensors contains graph information that does't support deepcopy" ) if core._in_eager_mode(): - new_varbase = core.eager.EagerTensor() + new_varbase = core.eager.Tensor() else: new_varbase = core.VarBase() new_varbase.name = self.name + unique_name.generate("_deepcopy") @@ -808,16 +808,16 @@ def monkey_patch_varbase(): ("__getitem__", __getitem__), ("item", item), ("__setitem__", __setitem__), ("_to", _to)): if core._in_eager_mode(): - setattr(core.eager.EagerTensor, method_name, method) + setattr(core.eager.Tensor, method_name, method) else: setattr(core.VarBase, method_name, method) if core._in_eager_mode(): - setattr(core.eager.EagerTensor, "_grad_ivar", _grad_ivar) - setattr(core.eager.EagerTensor, "_set_grad_ivar", _set_grad_ivar) - setattr(core.eager.EagerTensor, "clear_gradient", clear_gradient) - setattr(core.eager.EagerTensor, "clone", clone) - setattr(core.eager.EagerTensor, "value", value) + setattr(core.eager.Tensor, "_grad_ivar", _grad_ivar) + setattr(core.eager.Tensor, "_set_grad_ivar", _set_grad_ivar) + setattr(core.eager.Tensor, "clear_gradient", clear_gradient) + setattr(core.eager.Tensor, "clone", clone) + setattr(core.eager.Tensor, "value", value) else: setattr(core.VarBase, "__name__", "Tensor") setattr(core.VarBase, "grad", grad) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index bb77f6031f7f99f85925cc805ee9b8ae57fc17df..780b8acc4fde67f4b47589869b258dd99a022125 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1057,7 +1057,7 @@ def _varbase_creator(type=core.VarDesc.VarType.LOD_TENSOR, dtype = convert_np_dtype_to_dtype_(dtype) if _in_eager_mode(): - eager_tensor = core.eager.EagerTensor( + eager_tensor = core.eager.Tensor( dtype if dtype else core.VarDesc.VarType.FP32, list(shape) if shape else [], name, type if type else core.VarDesc.VarType.LOD_TENSOR, True @@ -1076,7 +1076,7 @@ class VariableMetaClass(type): t = type(instance) if in_dygraph_mode(): if _in_eager_mode(): - return issubclass(t, core.eager.EagerTensor) + return issubclass(t, core.eager.Tensor) return issubclass(t, core.VarBase) else: return issubclass(t, Variable) @@ -6412,7 +6412,7 @@ class ParamBase(core.VarBase): if hasattr(core, "eager"): - _core_eager_eagertensor = core.eager.EagerTensor + _core_eager_eagertensor = core.eager.Tensor else: _core_eager_eagertensor = object @@ -6918,7 +6918,7 @@ def _get_paddle_place(place): return place if isinstance(place, (core.Place, core.XPUPlace, core.CPUPlace, core.CUDAPinnedPlace, core.CUDAPlace, core.NPUPlace, - core.IPUPlace, core.MLUPlace)): + core.IPUPlace, core.MLUPlace, core.CustomPlace)): return place if not isinstance(place, str): diff --git a/python/paddle/fluid/layer_helper_base.py b/python/paddle/fluid/layer_helper_base.py index 67fcd901dedc964eedad2e1720a44cfa01037574..9f54a3547d39547e3d5540981d05d862573ea214 100644 --- a/python/paddle/fluid/layer_helper_base.py +++ 
b/python/paddle/fluid/layer_helper_base.py @@ -85,10 +85,9 @@ class LayerHelperBase(object): assert in_dygraph_mode( ), "to_variable could only be called in dygraph mode" if _in_eager_mode(): - return core.eager.EagerTensor(value, - _current_expected_place(), False, - False, name - if name else None, True) + return core.eager.Tensor(value, + _current_expected_place(), False, + False, name if name else None, True) else: py_var = core.VarBase( value=value, diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index dde39b2dfdb6866df3bd92bba5f0c223c0a1a243..727ceca72d1f1cfc0c34dae4e516568052136ba4 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -972,7 +972,7 @@ class DygraphGeneratorLoader(DataLoaderBase): def __next__(self): try: if _in_eager_mode(): - return core.eager.read_next_eager_tensor_list( + return core.eager.read_next_tensor_list( self._reader.read_next_list()[0]) else: return self._reader.read_next_var_list() diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py index f28e99fc00d97ae13689be208bd3b10727f053ef..b186869ee9747fdc2b5c51ecc5051ab6f93f3706 100755 --- a/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py @@ -28,7 +28,7 @@ from paddle.fluid.tests.unittests.ps.ps_dnn_trainer import DnnTrainer class TestPsTrainerPass(PsPassTestBase): def init(self): self.config = {} - self.config['ps_mode_config'] = "../ps/cpu_async_ps_config.yaml" + self.config['ps_mode_config'] = "" self.config['worker_num'] = "1" self.config['server_num'] = "1" self.config['run_minimize'] = "0" @@ -47,23 +47,58 @@ class TestPsTrainerPass(PsPassTestBase): def check(self): pass - def test_ps_optimizer_minimize_cpu(self): + def test_ps_optimizer_minimize_cpu_async(self): + self.init() + self.config['ps_mode_config'] = "../ps/cpu_async_ps_config.yaml" + self.config['run_minimize'] = '1' + + self.config['debug_new_minimize'] = '0' + self.config['log_dir'] = "/async_cpu_log_old_minimize" + remove_path_if_exists(self.config['log_dir']) + self.ps_launch(self.config) + + self.config['debug_new_minimize'] = '1' + self.config['log_dir'] = "/async_cpu_log_new_minimize" + remove_path_if_exists(self.config['log_dir']) + self.ps_launch(self.config) + + self.check() + + def test_ps_optimizer_minimize_cpu_sync(self): + self.init() + self.config['ps_mode_config'] = "../ps/cpu_sync_ps_config.yaml" + self.config['run_minimize'] = '1' + + self.config['debug_new_minimize'] = '0' + self.config['log_dir'] = "/sync_cpu_log_old_minimize" + remove_path_if_exists(self.config['log_dir']) + self.ps_launch(self.config) + + self.config['debug_new_minimize'] = '1' + self.config['log_dir'] = "/sync_cpu_log_new_minimize" + remove_path_if_exists(self.config['log_dir']) + self.ps_launch(self.config) + + self.check() + + def test_ps_optimizer_minimize_cpu_geo(self): self.init() + self.config['ps_mode_config'] = "../ps/cpu_geo_ps_config.yaml" self.config['run_minimize'] = '1' self.config['debug_new_minimize'] = '0' - self.config['log_dir'] = "/cpu_log_old_minimize" + self.config['log_dir'] = "/geo_cpu_log_old_minimize" remove_path_if_exists(self.config['log_dir']) self.ps_launch(self.config) self.config['debug_new_minimize'] = '1' - self.config['log_dir'] = "/cpu_log_new_minimize" + self.config['log_dir'] = "/geo_cpu_log_new_minimize" 
remove_path_if_exists(self.config['log_dir']) self.ps_launch(self.config) self.check() - # heter ps three-stage mode, still to be tested + # heter ps two-stage mode def test_ps_optimizer_minimize_heter(self): self.init() self.config['worker_num'] = "2" diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py index d18c691325094e10dc181ad7778a6ba1ab81a57f..67091f5fabb2ede1b589ba863c86b86607514dbb 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py @@ -214,6 +214,7 @@ class TestDifferentInputSpecCacheProgram(unittest.TestCase): self.assertTrue(np.allclose(x_data + y_data, out_1.numpy())) self.assertTrue(len(foo.program_cache) == 1) self.assertTrue(len(foo.program_cache.concrete_programs()) == 1) + first_program = foo.program_cache.last() # [16, 10] + [10] (numpy) out_2 = foo(to_variable(x_data), y_data) @@ -232,6 +233,11 @@ class TestDifferentInputSpecCacheProgram(unittest.TestCase): # create a new program self.assertTrue(len(foo.program_cache) == 2) + # test the most recently used program + foo(to_variable(x_data), y_data) + recent_program = foo.program_cache.last() + self.assertTrue(first_program == recent_program) + def test_get_concrete_program(self): foo = declarative(foo_func) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py index 57386bd00c9f39a9c00c6f24b79cc226bf6e27dd..567f266cd57b1eb4d16602b9bf7e1ee95d56bf19 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py @@ -306,5 +306,35 @@ class TestListInForLoopWithSubscript(TestListWithoutControlFlow): self.input = np.random.random((3, 4)).astype('float32') +class ListWithCondNet(paddle.nn.Layer): + def __init__(self): + super(ListWithCondNet, self).__init__() + + @paddle.jit.to_static + def forward(self, x, index): + y = paddle.nn.functional.relu(x) + a = [] + + for i in y: + a.append(i) + + if index > 0: + res = a[0] * a[0] + else: + res = a[-1] * a[-1] + + z = a[-1] * res + return z + + +class TestListWithCondGradInferVarType(unittest.TestCase): + def test_to_static(self): + net = ListWithCondNet() + x = paddle.to_tensor([2, 3, 4], dtype='float32') + index = paddle.to_tensor([1]) + res = net(x, index) + self.assertEqual(res[0], 16.)
+ + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_static_analysis.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_static_analysis.py index eb545e5ca26add0be3f61a6025833ddc8b376012..388291a51c22f4fa52fda5b99f30fb879df93447 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_static_analysis.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_static_analysis.py @@ -147,6 +147,7 @@ result_var_type6 = { def func_to_test7(a: int, b: float, c: paddle.Tensor, d: float='diff'): a = True e, f = paddle.shape(c) + g: paddle.Tensor = len(c) result_var_type7 = { @@ -155,7 +156,8 @@ result_var_type7 = { 'c': {NodeVarType.TENSOR}, 'd': {NodeVarType.STRING}, 'e': {NodeVarType.PADDLE_RETURN_TYPES}, - 'f': {NodeVarType.PADDLE_RETURN_TYPES} + 'f': {NodeVarType.PADDLE_RETURN_TYPES}, + 'g': {NodeVarType.TENSOR} } test_funcs = [ diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py index 838678b1c8449b6136dda00dcb3a70c03b3e9c16..e79b33d88d3f18a180b0e376131ce62e56726e4a 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py @@ -98,10 +98,20 @@ class TrtConvertGeluTest(TrtLayerAutoScanTest): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - if attrs[0]['approximate'] == True or self.dims == 1: + valid_version = (7, 0, 0) + compile_version = paddle_infer.get_trt_compile_version() + runtime_version = paddle_infer.get_trt_runtime_version() + self.assertTrue(compile_version == runtime_version) + # Dimension one only runs on Paddle OP + if self.dims == 1: return 0, 3 - else: + if compile_version >= valid_version: return 1, 2 + else: + if attrs[0]['approximate'] == True: + return 0, 3 + else: + return 1, 2 attrs = [ program_config.ops[i].attrs diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py index ddb96c37db780c214d40e8afca7348cff935ce6c..89ce1145d74e01c32e155495fcb4212bed78ab84 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py @@ -52,7 +52,7 @@ class TrtConvertPool2dTest(TrtLayerAutoScanTest): return np.random.random([24, 3, 3, 3]).astype(np.float32) for strides in [[1, 1], [1, 2], [2, 2]]: - for paddings in [[0, 2], [0, 3], [0, 1, 2, 3]]: + for paddings in [[0, 2], [0, 3]]: for pooling_type in ['max', 'avg']: for padding_algotithm in ['EXPLICIT', 'SAME', 'VAILD']: for ksize in [[2, 3], [3, 3]]: @@ -145,44 +145,18 @@ class TrtConvertPool2dTest(TrtLayerAutoScanTest): True), 1e-5 def add_skip_trt_case(self): - def teller1(program_config, predictor_config): - if len(program_config.ops[0].attrs['paddings']) == 4: - return True - return False - - self.add_skip_case(teller1, SkipReasons.TRT_NOT_IMPLEMENTED, - "4-dims paddings are not support for trt now.") - - def teller2(program_config, predictor_config): - if program_config.ops[0].attrs['global_pooling'] == True: - return True - return False - - self.add_skip_case( - teller2, SkipReasons.TRT_NOT_IMPLEMENTED, - "It is not support that global_pooling is true for trt now.") - - def teller3(program_config, predictor_config): - if self.dynamic_shape.min_input_shape 
== {} and program_config.ops[ - 0].attrs['ceil_mode'] == True: - return True - return False - - self.add_skip_case( - teller3, SkipReasons.TRT_NOT_IMPLEMENTED, - "It is not support that ceil_mode is true in static mode for trt now." - ) - - def teller4(program_config, predictor_config): - if self.dynamic_shape.min_input_shape != {} and ( - program_config.ops[0].attrs['strides'] == [1, 2] or - program_config.ops[0].attrs['strides'] == [2, 2]): + def teller(program_config, predictor_config): + if program_config.ops[0].attrs['pooling_type'] == 'avg' and \ + program_config.ops[0].attrs['global_pooling'] == False and \ + program_config.ops[0].attrs['exclusive'] == True and \ + program_config.ops[0].attrs['adaptive'] == False and \ + program_config.ops[0].attrs['ceil_mode'] == True: return True return False self.add_skip_case( - teller4, SkipReasons.TRT_NOT_IMPLEMENTED, - "It is not support that strides is not equal [1, 1] in dynamic mode for trt now." + teller, SkipReasons.TRT_NOT_IMPLEMENTED, + "The results of some cases are Nan, but the results of TensorRT and GPU are the same." ) def test(self): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_group_norm_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_group_norm_op.py index 85bd625413c86d995cdfb515c49c3a87af237a6c..1bcbbc38c9762cb19b9b8b01ac8e1728b11d38e0 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_group_norm_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_group_norm_op.py @@ -28,25 +28,14 @@ class TRTGroupNormTest(InferencePassTest): with fluid.program_guard(self.main_program, self.startup_program): data = fluid.data( name="data", shape=[-1, 512, 12, 12], dtype="float32") - relu_out = fluid.layers.relu(data) - relu6_out = fluid.layers.relu6(relu_out) - tanh_out = fluid.layers.tanh(relu6_out) - conv_out = fluid.layers.conv2d( - input=tanh_out, - num_filters=512, - filter_size=3, - groups=1, - padding=[1, 1], - bias_attr=False, - act=None) - out = self.append_group_norm(conv_out) + out = self.append_group_norm(data) self.feeds = { "data": np.random.random([1, 512, 12, 12]).astype("float32"), } self.enable_trt = True self.trt_parameters = TRTGroupNormTest.TensorRTParam( - 1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False) + 1 << 30, 1, 1, AnalysisConfig.Precision.Float32, False, False) self.dynamic_shape_params = TRTGroupNormTest.DynamicShapeParam({ 'data': [1, 512, 12, 12] }, {'data': [1, 512, 12, 12]}, {'data': [1, 512, 12, 12]}, False) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool_op.py index 26ad45db7a18d6b51150ebe988bd47473fd01c40..d71937f986e515bfffb0713fe11e8547a648628d 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool_op.py @@ -119,6 +119,17 @@ class TensorRTAvgPoolTest(TensorRTPoolTest): self.exclusive = False +class TensorRTAvgCeilPoolTest(TensorRTPoolTest): + def set_extra_config(self): + self.pool_size = 2 + self.pool_type = 'avg' + self.pool_stride = 1 + self.pool_padding = 0 + self.global_pooling = False + self.ceil_mode = True + self.exclusive = False + + class TensorRTGlobalPoolTest(TensorRTPoolTest): def set_extra_config(self): self.pool_size = 2 diff --git a/python/paddle/fluid/tests/unittests/mlu/test_pool2d_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_pool2d_op_mlu.py index 
2d9703117671cb4d994923695a6944061bf99838..fd442c6205e98d26b4797ff2ef4499b376bc8bdd 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_pool2d_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_pool2d_op_mlu.py @@ -25,7 +25,125 @@ from paddle.fluid import Program, program_guard import sys sys.path.append('..') from op_test import OpTest -from test_pool2d_op import pool2D_forward_naive, avg_pool2D_forward_naive, max_pool2D_forward_naive +from test_pool2d_op import pool2D_forward_naive, avg_pool2D_forward_naive, max_pool2D_forward_naive, adaptive_start_index, adaptive_end_index + + +def pool2d_backward_navie(x, + ksize, + strides, + paddings, + global_pool=0, + ceil_mode=False, + exclusive=True, + adaptive=False, + data_format='NCHW', + pool_type="max", + padding_algorithm="EXPLICIT"): + # update paddings + def _get_padding_with_SAME(input_shape, pool_size, pool_stride): + padding = [] + for input_size, filter_size, stride_size in zip(input_shape, pool_size, + pool_stride): + out_size = int((input_size + stride_size - 1) / stride_size) + pad_sum = np.max(( + (out_size - 1) * stride_size + filter_size - input_size, 0)) + pad_0 = int(pad_sum / 2) + pad_1 = int(pad_sum - pad_0) + padding.append(pad_0) + padding.append(pad_1) + return padding + + if isinstance(padding_algorithm, str): + padding_algorithm = padding_algorithm.upper() + if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]: + raise ValueError("Unknown Attr(padding_algorithm): '%s'. " + "It can only be 'SAME' or 'VALID'." % + str(padding_algorithm)) + + if padding_algorithm == "VALID": + paddings = [0, 0, 0, 0] + if ceil_mode != False: + raise ValueError( + "When Attr(pool_padding) is \"VALID\", Attr(ceil_mode)" + " must be False. " + "Received ceil_mode: True.") + elif padding_algorithm == "SAME": + input_data_shape = [] + if data_format == "NCHW": + input_data_shape = x.shape[2:4] + elif data_format == "NHWC": + input_data_shape = x.shape[1:3] + paddings = _get_padding_with_SAME(input_data_shape, ksize, strides) + + assert len(paddings) == 2 or len(paddings) == 4 + is_sys = True if len(paddings) == 2 else False + + if data_format == "NHWC": + x = x.transpose([0, 3, 1, 2]) + + N, C, H, W = x.shape + + if global_pool == 1: + ksize = [H, W] + paddings = [0 for _ in range(len(paddings))] + + pad_h_up = paddings[0] if is_sys else paddings[0] + pad_h_down = paddings[0] if is_sys else paddings[1] + pad_w_left = paddings[1] if is_sys else paddings[2] + pad_w_right = paddings[1] if is_sys else paddings[3] + + if adaptive: + H_out, W_out = ksize + else: + H_out = (H - ksize[0] + pad_h_up + pad_h_down + strides[0] - 1) // strides[0] + 1 \ + if ceil_mode else (H - ksize[0] + pad_h_up + pad_h_down) // strides[0] + 1 + W_out = (W - ksize[1] + pad_w_left + pad_w_right + strides[1] - 1) // strides[1] + 1 \ + if ceil_mode else (W - ksize[1] + pad_w_left + pad_w_right) // strides[1] + 1 + + x_grad = np.zeros_like(x) + for i in range(H_out): + if adaptive: + in_h_start = adaptive_start_index(i, H, ksize[0]) + in_h_end = adaptive_end_index(i, H, ksize[0]) + else: + in_h_start = np.max((i * strides[0] - pad_h_up, 0)) + in_h_end = np.min((i * strides[0] + ksize[0] - pad_h_up, H)) + + for j in range(W_out): + if adaptive: + in_w_start = adaptive_start_index(j, W, ksize[1]) + in_w_end = adaptive_end_index(j, W, ksize[1]) + else: + in_h_start = i * strides[0] - pad_h_up + in_w_start = j * strides[1] - pad_w_left + in_h_end = i * strides[0] + ksize[0] - pad_h_up + in_w_end = j * strides[1] + ksize[1] - pad_w_left + + field_size = 
(in_h_end - in_h_start) * (in_w_end - in_w_start) + in_h_start = np.max((in_h_start, 0)) + in_w_start = np.max((in_w_start, 0)) + in_h_end = np.min((in_h_end, H)) + in_w_end = np.min((in_w_end, W)) + + if pool_type == 'avg': + if (exclusive or adaptive): + field_size = (in_h_end - in_h_start) * ( + in_w_end - in_w_start) + x_grad[:, :, in_h_start:in_h_end, in_w_start: + in_w_end] += 1 / field_size + elif pool_type == 'max': + for n in range(N): + for c in range(C): + idx = np.argmax(x[n, c, in_h_start:in_h_end, in_w_start: + in_w_end].flatten()) + idx_h = idx // (in_w_end - in_w_start) + idx_w = idx % (in_w_end - in_w_start) + x_grad[n, c, in_h_start + idx_h, in_w_start + + idx_w] += 1 + + if data_format == "NHWC": + x_grad = x_grad.transpose([0, 2, 3, 1]) + return x_grad class TestPool2D_Op_Mixin(object): @@ -71,12 +189,25 @@ class TestPool2D_Op_Mixin(object): self.check_output_with_place(self.place) def test_check_grad(self): - if self.dtype == np.float16: - return - - if self.pool_type != "max": - self.check_grad_with_place( - self.place, set(['X']), 'Out', max_relative_error=0.07) + x_grad = pool2d_backward_navie( + self.inputs["X"], + ksize=self.ksize, + strides=self.strides, + paddings=self.paddings, + global_pool=self.global_pool, + ceil_mode=False, + exclusive=self.exclusive, + adaptive=self.adaptive, + data_format=self.data_format, + pool_type=self.pool_type, + padding_algorithm=self.padding_algorithm) + x_grad = x_grad / np.prod(self.outputs['Out'].shape) + self.check_grad_with_place( + self.place, + set(['X']), + 'Out', + max_relative_error=0.06, + user_defined_grads=[x_grad]) def init_data_format(self): self.data_format = "NCHW" @@ -108,7 +239,6 @@ class TestPool2D_Op_Mixin(object): def init_exclusive(self): self.exclusive = True - # Not support adaptive pooling currently def init_adaptive(self): self.adaptive = False @@ -173,7 +303,7 @@ class TestCase5(TestCase2): self.pool2D_forward_naive = max_pool2D_forward_naive -def create_test_fp16_class(parent, check_grad=True): +def create_test_fp16_class(parent): class TestFp16Case(parent): def init_data_type(self): self.dtype = np.float16 @@ -182,19 +312,13 @@ def create_test_fp16_class(parent, check_grad=True): place = core.MLUPlace(0) self.check_output_with_place(place, atol=1e-3) - def test_check_grad(self): - place = core.MLUPlace(0) - if self.pool_type != "max" and check_grad: - self.check_grad_with_place( - place, set(['X']), 'Out', max_relative_error=0.07) - cls_name = "{0}_{1}".format(parent.__name__, "Fp16Op") TestFp16Case.__name__ = cls_name globals()[cls_name] = TestFp16Case create_test_fp16_class(TestPool2D_Op) -create_test_fp16_class(TestCase1, check_grad=False) +create_test_fp16_class(TestCase1) create_test_fp16_class(TestCase2) create_test_fp16_class(TestCase3) create_test_fp16_class(TestCase4) @@ -222,6 +346,24 @@ class TestAvgInclude(TestCase2): self.exclusive = False +class TestAvgPoolAdaptive(TestCase1): + def init_adaptive(self): + self.adaptive = True + + +class TestAvgPoolAdaptiveAsyOutSize(TestCase1): + def init_adaptive(self): + self.adaptive = True + + def init_shape(self): + self.shape = [8, 3, 6, 6] + + def init_test_case(self): + self.ksize = [2, 3] + self.strides = [1, 1] + self.paddings = [0, 0, 0, 0] + + #-------test pool2d with asymmetric padding----- @@ -302,6 +444,19 @@ class TestAvgInclude_AsyPadding(TestCase2): self.shape = [2, 3, 7, 7] +class TestAvgPoolAdaptive_AsyPadding(TestCase1): + def init_adaptive(self): + self.adaptive = True + + def init_test_case(self): + self.ksize = [3, 3] + 
self.strides = [1, 1] + self.paddings = [1, 1, 0, 2] + + def init_shape(self): + self.shape = [2, 3, 7, 7] + + #----------- test channel_last -------------- class TestPool2D_channel_last(TestPool2D_Op): def init_data_format(self): @@ -359,14 +514,6 @@ class TestCase5_Max(TestCase2): def init_pool_type(self): self.pool_type = "max" - def test_check_grad(self): - if self.dtype == np.float16: - return - place = core.MLUPlace(0) - if self.pool_type == "max": - self.check_grad_with_place( - place, set(['X']), 'Out', max_relative_error=1.00) - class TestCase5_channel_last_Max(TestCase5_Max): def init_data_format(self): @@ -381,6 +528,11 @@ class TestAvgInclude_channel_last(TestCase2_channel_last): self.exclusive = False +class TestAvgPoolAdaptive_channel_last(TestCase1_channel_last): + def init_adaptive(self): + self.adaptive = True + + class TestPool2D_AsyPadding_channel_last(TestPool2D_AsyPadding): def init_data_format(self): self.data_format = "NHWC" diff --git a/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py index 882043ef6eb911f6163d516e9929658f38810ade..23ca0cf1f492fade05a81f0de1d6bc262458675c 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py @@ -71,7 +71,7 @@ class TestMatMulV2Op(OpTest): self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') -class TestMatMuklOp2(TestMatMulV2Op): +class TestMatMulOp2(TestMatMulV2Op): """ case 2 """ @@ -83,7 +83,7 @@ class TestMatMuklOp2(TestMatMulV2Op): self.trans_y = True -class TestMatMuklOp3(TestMatMulV2Op): +class TestMatMulOp3(TestMatMulV2Op): """ case 3 """ @@ -95,7 +95,7 @@ class TestMatMuklOp3(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp4(TestMatMulV2Op): +class TestMatMulOp4(TestMatMulV2Op): """ case 4 """ @@ -107,7 +107,7 @@ class TestMatMuklOp4(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp5(TestMatMulV2Op): +class TestMatMulOp5(TestMatMulV2Op): """ case 5 """ @@ -119,7 +119,7 @@ class TestMatMuklOp5(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp6(TestMatMulV2Op): +class TestMatMulOp6(TestMatMulV2Op): """ case 6 """ @@ -131,7 +131,7 @@ class TestMatMuklOp6(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp7(TestMatMulV2Op): +class TestMatMulOp7(TestMatMulV2Op): """ case 7 """ @@ -143,7 +143,7 @@ class TestMatMuklOp7(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp8(TestMatMulV2Op): +class TestMatMulOp8(TestMatMulV2Op): """ case 8 """ @@ -155,7 +155,7 @@ class TestMatMuklOp8(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp9(TestMatMulV2Op): +class TestMatMulOp9(TestMatMulV2Op): """ case 9 """ @@ -167,7 +167,7 @@ class TestMatMuklOp9(TestMatMulV2Op): self.trans_y = True -class TestMatMuklOp10(TestMatMulV2Op): +class TestMatMulOp10(TestMatMulV2Op): """ case 10 """ @@ -179,7 +179,7 @@ class TestMatMuklOp10(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp11(TestMatMulV2Op): +class TestMatMulOp11(TestMatMulV2Op): """ case 11 """ @@ -191,7 +191,7 @@ class TestMatMuklOp11(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp12(TestMatMulV2Op): +class TestMatMulOp12(TestMatMulV2Op): """ case 12 """ @@ -203,7 +203,7 @@ class TestMatMuklOp12(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp13(TestMatMulV2Op): +class TestMatMulOp13(TestMatMulV2Op): """ case 13 """ @@ -215,7 +215,7 @@ class TestMatMuklOp13(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp14(TestMatMulV2Op): 
+class TestMatMulOp14(TestMatMulV2Op): """ case 14_1 """ @@ -227,7 +227,7 @@ class TestMatMuklOp14(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp15(TestMatMulV2Op): +class TestMatMulOp15(TestMatMulV2Op): """ case 14_2 """ @@ -239,7 +239,7 @@ class TestMatMuklOp15(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp16(TestMatMulV2Op): +class TestMatMulOp16(TestMatMulV2Op): """ case 16 : to check the gradient for special case """ @@ -251,7 +251,7 @@ class TestMatMuklOp16(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp17(TestMatMulV2Op): +class TestMatMulOp17(TestMatMulV2Op): """ case 17 : to check the gradient for special case """ @@ -263,7 +263,7 @@ class TestMatMuklOp17(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOpBroadcast1(TestMatMulV2Op): +class TestMatMulOpBroadcast1(TestMatMulV2Op): """ case 14_3 """ @@ -275,7 +275,7 @@ class TestMatMuklOpBroadcast1(TestMatMulV2Op): self.trans_y = True -class TestMatMuklOpBroadcast2(TestMatMulV2Op): +class TestMatMulOpBroadcast2(TestMatMulV2Op): """ case 14_4 """ @@ -310,22 +310,22 @@ def create_test_fp16_class(parent, atol=0.001, max_relative_error=2.5): create_test_fp16_class(TestMatMulV2Op) -create_test_fp16_class(TestMatMuklOp2) -create_test_fp16_class(TestMatMuklOp3) -create_test_fp16_class(TestMatMuklOp4) -create_test_fp16_class(TestMatMuklOp5) -create_test_fp16_class(TestMatMuklOp6) -create_test_fp16_class(TestMatMuklOp7) -create_test_fp16_class(TestMatMuklOp8) -create_test_fp16_class(TestMatMuklOp9) -create_test_fp16_class(TestMatMuklOp10) -create_test_fp16_class(TestMatMuklOp11) -create_test_fp16_class(TestMatMuklOp12) -create_test_fp16_class(TestMatMuklOp13) -create_test_fp16_class(TestMatMuklOp14) -create_test_fp16_class(TestMatMuklOp15) -create_test_fp16_class(TestMatMuklOp16) -create_test_fp16_class(TestMatMuklOp17) +create_test_fp16_class(TestMatMulOp2) +create_test_fp16_class(TestMatMulOp3) +create_test_fp16_class(TestMatMulOp4) +create_test_fp16_class(TestMatMulOp5) +create_test_fp16_class(TestMatMulOp6) +create_test_fp16_class(TestMatMulOp7) +create_test_fp16_class(TestMatMulOp8) +create_test_fp16_class(TestMatMulOp9) +create_test_fp16_class(TestMatMulOp10) +create_test_fp16_class(TestMatMulOp11) +create_test_fp16_class(TestMatMulOp12) +create_test_fp16_class(TestMatMulOp13) +create_test_fp16_class(TestMatMulOp14) +create_test_fp16_class(TestMatMulOp15) +create_test_fp16_class(TestMatMulOp16) +create_test_fp16_class(TestMatMulOp17) class TestMatMulV2API(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 754d7bd54b9f817d73c2f5d705026c9a468f4008..85423df3d382831738c2c64ea845d0661f9cdbb7 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -1658,7 +1658,7 @@ class OpTest(unittest.TestCase): for grad in analytic_grads: if grad.dtype == np.uint16: grad = convert_uint16_to_float(grad) - max_relative_error = 0.03 if max_relative_error < 0.03 else max_relative_error + max_relative_error = 0.04 if max_relative_error < 0.04 else max_relative_error fp32_analytic_grads.append(grad) analytic_grads = fp32_analytic_grads @@ -1666,7 +1666,7 @@ class OpTest(unittest.TestCase): for grad in numeric_grads: if grad.dtype == np.uint16: grad = convert_uint16_to_float(grad) - max_relative_error = 0.03 if max_relative_error < 0.03 else max_relative_error + max_relative_error = 0.04 if max_relative_error < 0.04 else max_relative_error fp32_numeric_grads.append(grad) 
numeric_grads = fp32_numeric_grads diff --git a/python/paddle/fluid/tests/unittests/ps/cpu_async_ps_config.yaml b/python/paddle/fluid/tests/unittests/ps/cpu_async_ps_config.yaml index 669709ea5607e5def58204637202e839090f1197..93a13a67ce6b519fa3c1b544f05ff59072a9f77d 100755 --- a/python/paddle/fluid/tests/unittests/ps/cpu_async_ps_config.yaml +++ b/python/paddle/fluid/tests/unittests/ps/cpu_async_ps_config.yaml @@ -26,7 +26,6 @@ hyper_parameters: fc_sizes: [400, 400, 400] runner: - geo_step: 400 sync_mode: "async" # sync / async / geo / heter thread_num: 16 use_gpu: 0 diff --git a/paddle/scripts/get_pten_kernel_function.sh b/python/paddle/fluid/tests/unittests/ps/cpu_geo_ps_config.yaml old mode 100755 new mode 100644 similarity index 51% rename from paddle/scripts/get_pten_kernel_function.sh rename to python/paddle/fluid/tests/unittests/ps/cpu_geo_ps_config.yaml index 6ae2f1b679e3eafcef5c20376ecd82784d61d4e0..80125ae6c37faa469469f5f67bd9b8796fd079f2 --- a/paddle/scripts/get_pten_kernel_function.sh +++ b/python/paddle/fluid/tests/unittests/ps/cpu_geo_ps_config.yaml @@ -1,5 +1,3 @@ -#!/usr/bin/env bash - # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -14,15 +12,25 @@ # See the License for the specific language governing permissions and # limitations under the License. -#================================================= -# Utils -#================================================= - -set -e +# refer to PaddleRec/models/rank/dnn/benchmark.yaml -EXIT_CODE=0; -tmp_dir=`mktemp -d` +hyper_parameters: + optimizer: + class: Adam + learning_rate: 0.0001 + adam_lazy_mode: True + sparse_inputs_slots: 27 + sparse_feature_number: 1000001 + sparse_feature_dim: 10 + dense_input_dim: 13 + fc_sizes: [400, 400, 400] -PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../../" && pwd )" +runner: + geo_step: 400 + sync_mode: "geo" + thread_num: 16 + use_gpu: 0 + + model_path: "../ps_dnn_model.py" -unset GREP_OPTIONS && find ${PADDLE_ROOT}/paddle/pten/kernels -name "*.c*" | xargs sed -e '/PT_REGISTER_\(GENERAL_\)\?KERNEL(/,/)/!d' | awk 'BEGIN { RS="{" }{ gsub(/\n /,""); print $0 }' | grep PT_REGISTER | awk -F ",|\(" '{gsub(/ /,"");print $2, $3, $4, $5}' | sort -u | awk '{gsub(/pten::/,"");print $0}' | grep -v "_grad" + diff --git a/python/paddle/fluid/tests/unittests/ps/cpu_sync_ps_config.yaml b/python/paddle/fluid/tests/unittests/ps/cpu_sync_ps_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..95685a488cade1219290956dab8339ee641e001f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ps/cpu_sync_ps_config.yaml @@ -0,0 +1,35 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# refer to PaddleRec/models/rank/dnn/benchmark.yaml + +hyper_parameters: + optimizer: + class: Adam + learning_rate: 0.0001 + adam_lazy_mode: True + sparse_inputs_slots: 27 + sparse_feature_number: 1000001 + sparse_feature_dim: 10 + dense_input_dim: 13 + fc_sizes: [400, 400, 400] + +runner: + sync_mode: "sync" + thread_num: 16 + use_gpu: 0 + + model_path: "../ps_dnn_model.py" + + diff --git a/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py b/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py index 8f8ff65af544a1c4ddb4f1548603b418d3bf8bed..d08c1d41c89ec532f6c3124000f9bec38f9b86d7 100755 --- a/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py +++ b/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py @@ -329,9 +329,9 @@ class DnnTrainer(object): sync_mode = self.config.get("runner.sync_mode") inner_optimizer = paddle.optimizer.Adam(learning_rate, lazy_mode=True) + self.role_maker._generate_role() # required if self.config['debug_new_minimize'] == 1: logger.info("entering run_minimize -- new") - self.role_maker._generate_role() # required from paddle.distributed.fleet.meta_optimizers.ps_optimizer import ParameterServerOptimizer ps_optimizer = ParameterServerOptimizer(inner_optimizer) ps_optimizer._set_basic_info(loss, self.role_maker, inner_optimizer, @@ -346,11 +346,16 @@ class DnnTrainer(object): if fleet.is_server(): _main_file = '/' + sync_mode + '_run_minimize' + '_debug:_' + str( self.config['debug_new_minimize']) + '_server_main.prototxt' - debug_program(_main_file, loss.block.program, 0) + debug_program(_main_file, loss.block.program) elif fleet.is_worker(): _main_file = '/' + sync_mode + '_run_minimize' + '_debug:_' + str( self.config['debug_new_minimize']) + '_worker_main.prototxt' - debug_program(_main_file, loss.block.program, 1) + debug_program(_main_file, loss.block.program) + elif self.role_maker._is_heter_worker(): + _main_file = '/' + sync_mode + '_run_minimize' + '_debug:_' + str( + self.config[ + 'debug_new_minimize']) + '_heter_worker_main.prototxt' + debug_program(_main_file, loss.block.program) def run_single_pass(self): self.init_fleet_with_gloo() @@ -395,17 +400,18 @@ class DnnTrainer(object): _main_file = '/' + sync_mode + "_" + str(config[ "applied_pass_name"]) + '_debug:_' + str(self.config[ 'debug_new_pass']) + '_server_main.prototxt' - debug_program(_main_file, _main, 0) + debug_program(_main_file, _main) elif fleet.is_worker(): _main_file = '/' + sync_mode + "_" + str(config[ "applied_pass_name"]) + '_debug:_' + str(self.config[ 'debug_new_pass']) + '_worker_main.prototxt' - debug_program(_main_file, _main, 1) + debug_program(_main_file, _main) if __name__ == "__main__": paddle.enable_static() config = parse_args() + logger.info(">>>>>>>>>> python process started") os.environ["CPU_NUM"] = str(config.get("runner.thread_num")) benchmark_main = DnnTrainer(config) if config['run_single_pass'] == 1: diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py index d6bf768bee7744524d33082b2cda81ea4870e534..252482fa6d270edbc1bec3a0d6023933521d7f7e 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py @@ -109,26 +109,26 @@ class EagerDtypeTestCase(unittest.TestCase): core.VarDesc.VarType.COMPLEX128) -class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): +class EagerVariablePropertiesAndMethodsTestCase(unittest.TestCase): def constructor(self, place): - egr_tensor = core.eager.EagerTensor()
+ egr_tensor = core.eager.Tensor() self.assertEqual(egr_tensor.persistable, False) self.assertTrue("generated" in egr_tensor.name) self.assertEqual(egr_tensor.shape, []) self.assertEqual(egr_tensor.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor.stop_gradient, True) - egr_tensor0 = core.eager.EagerTensor( - core.VarDesc.VarType.FP32, [4, 16, 16, 32], "test_eager_tensor", - core.VarDesc.VarType.LOD_TENSOR, True) + egr_tensor0 = core.eager.Tensor(core.VarDesc.VarType.FP32, + [4, 16, 16, 32], "test_eager_tensor", + core.VarDesc.VarType.LOD_TENSOR, True) self.assertEqual(egr_tensor0.persistable, True) self.assertEqual(egr_tensor0.name, "test_eager_tensor") self.assertEqual(egr_tensor0.shape, [4, 16, 16, 32]) self.assertEqual(egr_tensor0.dtype, core.VarDesc.VarType.FP32) arr0 = np.random.rand(4, 16, 16, 32).astype('float32') - egr_tensor1 = core.eager.EagerTensor(arr0, place, True, False, - "numpy_tensor1", False) + egr_tensor1 = core.eager.Tensor(arr0, place, True, False, + "numpy_tensor1", False) self.assertEqual(egr_tensor1.persistable, True) self.assertEqual(egr_tensor1.name, "numpy_tensor1") self.assertEqual(egr_tensor1.shape, [4, 16, 16, 32]) @@ -138,8 +138,8 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue(np.array_equal(egr_tensor1.numpy(), arr0)) arr1 = np.random.randint(100, size=(4, 16, 16, 32), dtype=np.int64) - egr_tensor2 = core.eager.EagerTensor(arr1, place, False, True, - "numpy_tensor2", True) + egr_tensor2 = core.eager.Tensor(arr1, place, False, True, + "numpy_tensor2", True) self.assertEqual(egr_tensor2.persistable, False) self.assertEqual(egr_tensor2.name, "numpy_tensor2") self.assertEqual(egr_tensor2.shape, [4, 16, 16, 32]) @@ -149,7 +149,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue(np.array_equal(egr_tensor2.numpy(), arr1)) arr2 = np.random.rand(4, 16, 16, 32, 64).astype('float32') - egr_tensor3 = core.eager.EagerTensor(arr2) + egr_tensor3 = core.eager.Tensor(arr2) self.assertEqual(egr_tensor3.persistable, False) self.assertTrue("generated_tensor" in egr_tensor3.name) self.assertEqual(egr_tensor3.shape, [4, 16, 16, 32, 64]) @@ -161,7 +161,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue(np.array_equal(egr_tensor3.numpy(), arr2)) egr_tensor3.stop_gradient = False - egr_tensor4 = core.eager.EagerTensor(egr_tensor3) + egr_tensor4 = core.eager.Tensor(egr_tensor3) self.assertEqual(egr_tensor4.persistable, False) self.assertTrue("generated_tensor" in egr_tensor4.name) self.assertEqual(egr_tensor4.shape, egr_tensor3.shape) @@ -174,7 +174,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): np.array_equal(egr_tensor4.numpy(), egr_tensor3.numpy())) arr4 = np.random.rand(4, 16, 16, 32).astype('float32') - egr_tensor5 = core.eager.EagerTensor(arr4, place) + egr_tensor5 = core.eager.Tensor(arr4, place) self.assertEqual(egr_tensor5.persistable, False) self.assertTrue("generated_tensor" in egr_tensor5.name) self.assertEqual(egr_tensor5.shape, [4, 16, 16, 32]) @@ -183,7 +183,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue(egr_tensor5.place._equals(place)) self.assertTrue(np.array_equal(egr_tensor5.numpy(), arr4)) - egr_tensor6 = core.eager.EagerTensor(egr_tensor5, core.CPUPlace()) + egr_tensor6 = core.eager.Tensor(egr_tensor5, core.CPUPlace()) self.assertEqual(egr_tensor6.persistable, False) self.assertTrue("generated_tensor" in egr_tensor6.name) self.assertEqual(egr_tensor6.shape, [4, 16, 16, 32]) @@ 
-193,7 +193,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue( np.array_equal(egr_tensor6.numpy(), egr_tensor5.numpy())) - egr_tensor7 = core.eager.EagerTensor(arr4, place, True) + egr_tensor7 = core.eager.Tensor(arr4, place, True) self.assertEqual(egr_tensor7.persistable, True) self.assertTrue("generated_tensor" in egr_tensor7.name) self.assertEqual(egr_tensor7.shape, [4, 16, 16, 32]) @@ -202,7 +202,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue(egr_tensor7.place._equals(place)) self.assertTrue(np.array_equal(egr_tensor7.numpy(), arr4)) - egr_tensor8 = core.eager.EagerTensor(egr_tensor6, place, "egr_tensor8") + egr_tensor8 = core.eager.Tensor(egr_tensor6, place, "egr_tensor8") self.assertEqual(egr_tensor8.persistable, False) self.assertEqual(egr_tensor8.name, "egr_tensor8") self.assertEqual(egr_tensor8.shape, [4, 16, 16, 32]) @@ -212,7 +212,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue( np.array_equal(egr_tensor8.numpy(), egr_tensor5.numpy())) - egr_tensor9 = core.eager.EagerTensor(arr4, place, True, True) + egr_tensor9 = core.eager.Tensor(arr4, place, True, True) self.assertEqual(egr_tensor9.persistable, True) self.assertTrue("generated_tensor" in egr_tensor9.name) self.assertEqual(egr_tensor9.shape, [4, 16, 16, 32]) @@ -224,7 +224,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): x = np.random.rand(3, 3).astype('float32') t = paddle.fluid.Tensor() t.set(x, paddle.fluid.CPUPlace()) - egr_tensor10 = core.eager.EagerTensor(t, place) + egr_tensor10 = core.eager.Tensor(t, place) self.assertEqual(egr_tensor10.persistable, False) self.assertTrue("generated_tensor" in egr_tensor10.name) self.assertEqual(egr_tensor10.shape, [3, 3]) @@ -233,7 +233,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue(egr_tensor10.place._equals(place)) self.assertTrue(np.array_equal(egr_tensor10.numpy(), x)) - egr_tensor11 = core.eager.EagerTensor(t, place, "framework_constructed") + egr_tensor11 = core.eager.Tensor(t, place, "framework_constructed") self.assertEqual(egr_tensor11.persistable, False) self.assertTrue("framework_constructed" in egr_tensor11.name) self.assertEqual(egr_tensor11.shape, [3, 3]) @@ -242,7 +242,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue(egr_tensor11.place._equals(place)) self.assertTrue(np.array_equal(egr_tensor11.numpy(), x)) - egr_tensor12 = core.eager.EagerTensor(t) + egr_tensor12 = core.eager.Tensor(t) self.assertEqual(egr_tensor12.persistable, False) self.assertTrue("generated_tensor" in egr_tensor12.name) self.assertEqual(egr_tensor12.shape, [3, 3]) @@ -290,10 +290,10 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.constructor(p) def constructor_with_kwargs(self, place): - # init EagerTensor by Python array + # init Tensor by Python array arr = np.random.rand(4, 16, 16, 32).astype('float32') - egr_tensor0 = core.eager.EagerTensor(value=arr) + egr_tensor0 = core.eager.Tensor(value=arr) self.assertEqual(egr_tensor0.persistable, False) self.assertTrue("generated" in egr_tensor0.name) self.assertEqual(egr_tensor0.shape, [4, 16, 16, 32]) @@ -303,7 +303,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertEqual(egr_tensor0.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor0.stop_gradient, True) - egr_tensor1 = core.eager.EagerTensor(value=arr, place=place) + egr_tensor1 = core.eager.Tensor(value=arr, place=place) 
self.assertEqual(egr_tensor1.persistable, False) self.assertTrue("generated" in egr_tensor1.name) self.assertEqual(egr_tensor1.shape, [4, 16, 16, 32]) @@ -311,7 +311,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertEqual(egr_tensor1.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor1.stop_gradient, True) - egr_tensor2 = core.eager.EagerTensor(arr, place=place) + egr_tensor2 = core.eager.Tensor(arr, place=place) self.assertEqual(egr_tensor2.persistable, False) self.assertTrue("generated" in egr_tensor2.name) self.assertEqual(egr_tensor2.shape, [4, 16, 16, 32]) @@ -319,7 +319,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertEqual(egr_tensor2.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor2.stop_gradient, True) - egr_tensor3 = core.eager.EagerTensor( + egr_tensor3 = core.eager.Tensor( arr, place=place, name="new_eager_tensor") self.assertEqual(egr_tensor3.persistable, False) self.assertTrue("new_eager_tensor" in egr_tensor3.name) @@ -328,7 +328,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertEqual(egr_tensor3.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor3.stop_gradient, True) - egr_tensor4 = core.eager.EagerTensor( + egr_tensor4 = core.eager.Tensor( arr, place=place, persistable=True, name="new_eager_tensor") self.assertEqual(egr_tensor4.persistable, True) self.assertTrue("new_eager_tensor" in egr_tensor4.name) @@ -337,7 +337,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertEqual(egr_tensor4.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor4.stop_gradient, True) - egr_tensor5 = core.eager.EagerTensor( + egr_tensor5 = core.eager.Tensor( arr, core.CPUPlace(), persistable=True, @@ -350,7 +350,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertEqual(egr_tensor5.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor5.stop_gradient, True) - egr_tensor6 = core.eager.EagerTensor( + egr_tensor6 = core.eager.Tensor( arr, place=core.CPUPlace(), persistable=True, @@ -363,7 +363,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertEqual(egr_tensor6.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor6.stop_gradient, True) - egr_tensor7 = core.eager.EagerTensor( + egr_tensor7 = core.eager.Tensor( arr, place=place, persistable=True, @@ -376,7 +376,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertEqual(egr_tensor7.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor7.stop_gradient, True) - egr_tensor8 = core.eager.EagerTensor( + egr_tensor8 = core.eager.Tensor( arr, place=place, persistable=True, @@ -390,7 +390,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertEqual(egr_tensor8.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor8.stop_gradient, False) - egr_tensor9 = core.eager.EagerTensor( + egr_tensor9 = core.eager.Tensor( arr, place, True, True, "new_eager_tensor", stop_gradient=False) self.assertEqual(egr_tensor9.persistable, True) self.assertTrue("new_eager_tensor" in egr_tensor9.name) @@ -399,7 +399,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertEqual(egr_tensor9.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor9.stop_gradient, False) - egr_tensor10 = core.eager.EagerTensor( + egr_tensor10 = core.eager.Tensor( arr, place, True, @@ -413,7 +413,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): 
self.assertEqual(egr_tensor10.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor10.stop_gradient, False) - egr_tensor11 = core.eager.EagerTensor( + egr_tensor11 = core.eager.Tensor( arr, place, True, @@ -427,7 +427,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertEqual(egr_tensor11.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor11.stop_gradient, False) - egr_tensor12 = core.eager.EagerTensor( + egr_tensor12 = core.eager.Tensor( arr, place, persistable=True, @@ -441,7 +441,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertEqual(egr_tensor12.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor12.stop_gradient, False) - egr_tensor13 = core.eager.EagerTensor( + egr_tensor13 = core.eager.Tensor( value=arr, place=place, persistable=True, @@ -456,7 +456,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertEqual(egr_tensor13.stop_gradient, False) # special case - egr_tensor14 = core.eager.EagerTensor( + egr_tensor14 = core.eager.Tensor( dtype=core.VarDesc.VarType.FP32, dims=[4, 16, 16, 32], name="special_eager_tensor", @@ -467,8 +467,8 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertEqual(egr_tensor14.shape, [4, 16, 16, 32]) self.assertEqual(egr_tensor14.dtype, core.VarDesc.VarType.FP32) - # init EagerTensor by EagerTensor - egr_tensor15 = core.eager.EagerTensor(value=egr_tensor4) + # init Tensor by Tensor + egr_tensor15 = core.eager.Tensor(value=egr_tensor4) self.assertEqual(egr_tensor15.persistable, True) self.assertTrue("generated" in egr_tensor15.name) self.assertEqual(egr_tensor15.shape, egr_tensor4.shape) @@ -480,7 +480,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue( np.array_equal(egr_tensor15.numpy(), egr_tensor4.numpy())) - egr_tensor16 = core.eager.EagerTensor( + egr_tensor16 = core.eager.Tensor( value=egr_tensor4, name="new_eager_tensor") self.assertEqual(egr_tensor16.persistable, True) self.assertTrue("new_eager_tensor" in egr_tensor16.name) @@ -493,7 +493,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue( np.array_equal(egr_tensor16.numpy(), egr_tensor4.numpy())) - egr_tensor17 = core.eager.EagerTensor( + egr_tensor17 = core.eager.Tensor( value=egr_tensor4, place=place, name="new_eager_tensor", ) @@ -506,7 +506,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue( np.array_equal(egr_tensor17.numpy(), egr_tensor4.numpy())) - egr_tensor18 = core.eager.EagerTensor( + egr_tensor18 = core.eager.Tensor( egr_tensor4, place=place, name="new_eager_tensor", ) @@ -519,7 +519,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue( np.array_equal(egr_tensor18.numpy(), egr_tensor4.numpy())) - egr_tensor19 = core.eager.EagerTensor( + egr_tensor19 = core.eager.Tensor( egr_tensor4, place, name="new_eager_tensor", ) @@ -536,7 +536,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): x = np.random.rand(3, 3).astype('float32') t = paddle.fluid.Tensor() t.set(x, paddle.fluid.CPUPlace()) - egr_tensor20 = core.eager.EagerTensor(value=t) + egr_tensor20 = core.eager.Tensor(value=t) self.assertEqual(egr_tensor20.persistable, False) self.assertTrue("generated_tensor" in egr_tensor20.name) self.assertEqual(egr_tensor20.shape, [3, 3]) @@ -547,7 +547,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): paddle.fluid.framework._current_expected_place())) 
self.assertTrue(np.array_equal(egr_tensor20.numpy(), x)) - egr_tensor21 = core.eager.EagerTensor(value=t, place=place) + egr_tensor21 = core.eager.Tensor(value=t, place=place) self.assertEqual(egr_tensor21.persistable, False) self.assertTrue("generated_tensor" in egr_tensor21.name) self.assertEqual(egr_tensor21.shape, [3, 3]) @@ -556,7 +556,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue(egr_tensor21.place._equals(place)) self.assertTrue(np.array_equal(egr_tensor21.numpy(), x)) - egr_tensor22 = core.eager.EagerTensor(t, place=place) + egr_tensor22 = core.eager.Tensor(t, place=place) self.assertEqual(egr_tensor22.persistable, False) self.assertTrue("generated_tensor" in egr_tensor22.name) self.assertEqual(egr_tensor22.shape, [3, 3]) @@ -565,8 +565,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue(egr_tensor22.place._equals(place)) self.assertTrue(np.array_equal(egr_tensor22.numpy(), x)) - egr_tensor23 = core.eager.EagerTensor( - t, place, name="from_framework_tensor") + egr_tensor23 = core.eager.Tensor(t, place, name="from_framework_tensor") self.assertEqual(egr_tensor23.persistable, False) self.assertTrue("from_framework_tensor" in egr_tensor23.name) self.assertEqual(egr_tensor23.shape, [3, 3]) @@ -575,7 +574,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue(egr_tensor23.place._equals(place)) self.assertTrue(np.array_equal(egr_tensor23.numpy(), x)) - egr_tensor24 = core.eager.EagerTensor( + egr_tensor24 = core.eager.Tensor( value=t, place=place, name="from_framework_tensor") self.assertEqual(egr_tensor24.persistable, False) self.assertTrue("from_framework_tensor" in egr_tensor24.name) @@ -587,7 +586,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): # Bad usage # SyntaxError: positional argument follows keyword argument - # egr_tensor25 = core.eager.EagerTensor(value=t, place) + # egr_tensor25 = core.eager.Tensor(value=t, place) def test_constructor_with_kwargs(self): print("Test_constructor_with_kwargs") @@ -655,7 +654,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): tensor2 = None tensor = paddle.to_tensor(arr, core.VarDesc.VarType.FP32, core.CPUPlace()) - tensor3 = core.eager.EagerTensor() + tensor3 = core.eager.Tensor() if core.is_compiled_with_cuda(): tensor2 = paddle.to_tensor(arr2, core.VarDesc.VarType.FP32, core.CUDAPlace(0)) @@ -683,7 +682,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): tensor2 = None tensor = paddle.to_tensor(arr, core.VarDesc.VarType.FP32, core.CPUPlace()) - tensor3 = core.eager.EagerTensor() + tensor3 = core.eager.Tensor() if core.is_compiled_with_cuda(): tensor2 = paddle.to_tensor(arr2, core.VarDesc.VarType.FP32, core.CUDAPlace(0)) @@ -748,7 +747,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): with _test_eager_guard(): arr = np.random.rand(4, 16, 16, 32).astype('float64') - egr_tensor0 = core.eager.EagerTensor(value=arr) + egr_tensor0 = core.eager.Tensor(value=arr) self.assertEqual(egr_tensor0.persistable, False) self.assertTrue("generated" in egr_tensor0.name) self.assertEqual(egr_tensor0.shape, [4, 16, 16, 32]) @@ -766,7 +765,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): def test_set_value(self): with _test_eager_guard(): ori_arr = np.random.rand(4, 16, 16, 32).astype('float32') - egr_tensor = core.eager.EagerTensor(value=ori_arr) + egr_tensor = core.eager.Tensor(value=ori_arr) self.assertEqual(egr_tensor.stop_gradient, True) 
self.assertEqual(egr_tensor.shape, [4, 16, 16, 32]) self.assertTrue(np.array_equal(egr_tensor.numpy(), ori_arr)) @@ -859,7 +858,7 @@ class EagerParamBaseUsageTestCase(unittest.TestCase): def test_backward_with_single_tensor(self): with _test_eager_guard(): arr4 = np.random.rand(4, 16, 16, 32).astype('float32') - egr_tensor12 = core.eager.EagerTensor(arr4, core.CPUPlace()) + egr_tensor12 = core.eager.Tensor(arr4, core.CPUPlace()) egr_tensor12.retain_grads() arr = np.ones([4, 16, 16, 32]).astype('float32') self.assertEqual(egr_tensor12.persistable, False) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index 92d3dd7b6054b685cb5b560c20ebf2e249f640fe..a36b10f58ffaa503b6ccca580843f07b4bbfc2ac 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -203,7 +203,7 @@ class TestImperative(unittest.TestCase): with fluid.dygraph.guard(): if fluid.framework._in_eager_mode(): var_base = paddle.to_tensor(np.array([3, 4, 5])) - self.assertTrue(isinstance(var_base, core.eager.EagerTensor)) + self.assertTrue(isinstance(var_base, core.eager.Tensor)) else: var_base = paddle.to_tensor(np.array([3, 4, 5])) self.assertTrue(isinstance(var_base, core.VarBase)) @@ -221,13 +221,13 @@ class TestImperative(unittest.TestCase): t.set(x, fluid.CPUPlace()) if _in_eager_mode(): # TODO(jiabin): Support Kwargs and uncomment these tests - # egr_tmp = fluid.core.eager.EagerTensor(value=x, place=fluid.core.CPUPlace()) - egr_tmp2 = fluid.core.eager.EagerTensor(y, fluid.core.CPUPlace()) + # egr_tmp = fluid.core.eager.Tensor(value=x, place=fluid.core.CPUPlace()) + egr_tmp2 = fluid.core.eager.Tensor(y, fluid.core.CPUPlace()) egr_tmp3 = paddle.to_tensor(x) - egr_tmp4 = fluid.core.eager.EagerTensor(y) - # egr_tmp5 = fluid.core.eager.EagerTensor(value=x) + egr_tmp4 = fluid.core.eager.Tensor(y) + # egr_tmp5 = fluid.core.eager.Tensor(value=x) # TODO(jiabin): Support it when we merge LoDTensor with DenseTensor - egr_tmp6 = fluid.core.eager.EagerTensor(t) + egr_tmp6 = fluid.core.eager.Tensor(t) # self.assertTrue(np.array_equal(x, egr_tmp.numpy())) self.assertTrue(np.array_equal(y, egr_tmp2.numpy())) @@ -953,8 +953,7 @@ class TestMetaclass(unittest.TestCase): self.assertNotEqual(type(MyLayer).__name__, 'pybind11_type') if core._in_eager_mode(): self.assertEqual( - type(paddle.fluid.core.eager.EagerTensor).__name__, - 'pybind11_type') + type(paddle.fluid.core.eager.Tensor).__name__, 'pybind11_type') else: self.assertEqual( type(paddle.fluid.core.VarBase).__name__, 'pybind11_type') diff --git a/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py b/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py index 317353684317f6fa0e8cf37cda58f2041e70befd..4c457e9345c5d35aef1d221b1f744e4f93367eec 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -25,14 +25,15 @@ import paddle.fluid.core as core import paddle.fluid.dygraph.base as base from test_imperative_lod_tensor_to_selected_rows import SimpleNet +from paddle.fluid.framework import _test_eager_guard -call_forward_hook = False +call_forward_post_hook = False call_forward_pre_hook = False -def forward_hook(layer, input, output): - global call_forward_hook - call_forward_hook = True +def forward_post_hook(layer, input, output): + global call_forward_post_hook + call_forward_post_hook = True def forward_pre_hook(layer, input): @@ -40,7 +41,7 @@ def forward_pre_hook(layer, input): call_forward_pre_hook = True -def forward_hook1(layer, input, output): +def forward_post_hook1(layer, input, output): return output * 2 @@ -50,8 +51,8 @@ def forward_pre_hook1(layer, input): class Test_Forward_Hook(unittest.TestCase): - # test forward_pre_hook and forward_hook that have return value - def test_forward_hook_return_value(self): + # test forward_pre_hook and forward_post_hook that have return value + def func_forward_hook_return_value(self): seed = 90 places = [fluid.CPUPlace()] @@ -104,23 +105,23 @@ class Test_Forward_Hook(unittest.TestCase): self.assertTrue( np.array_equal(outs_pre_hook.numpy(), outs_origin.numpy())) - # register forward_hook - forward_hook_handle1 = simplenet.register_forward_post_hook( - forward_hook1) + # register forward_post_hook + forward_post_hook_handle1 = simplenet.register_forward_post_hook( + forward_post_hook1) outs_forward_hook = simplenet(input, y) self.assertTrue( np.array_equal(outs_forward_hook.numpy(), outs_origin.numpy() * 2)) - # remove forward_hook - forward_hook_handle1.remove() + # remove forward_post_hook + forward_post_hook_handle1.remove() outs_forward_hook = simplenet(input, y) self.assertTrue( np.array_equal(outs_forward_hook.numpy(), outs_origin.numpy())) - # test forward_pre_hook and forward_hook that don't have return value - def test_forward_hook(self): + # test forward_pre_hook and forward_post_hook that don't have return value + def func_forward_hook(self): seed = 90 places = [fluid.CPUPlace()] @@ -133,7 +134,7 @@ class Test_Forward_Hook(unittest.TestCase): fluid.default_main_program().random_seed = seed fluid.set_flags({'FLAGS_sort_sum_gradient': True}) - global call_forward_hook + global call_forward_post_hook global call_forward_pre_hook input_word = np.array( @@ -158,38 +159,45 @@ class Test_Forward_Hook(unittest.TestCase): # origin, don't register any hook outs_origin = simplenet(input, y) - self.assertFalse(call_forward_hook) + self.assertFalse(call_forward_post_hook) self.assertFalse(call_forward_pre_hook) - # register forward_hook and forward_pre_hook - forward_hook_handle = simplenet.register_forward_post_hook( - forward_hook) + # register forward_post_hook and forward_pre_hook + forward_post_hook_handle = simplenet.register_forward_post_hook( + forward_post_hook) forward_pre_hook_handle = simplenet.register_forward_pre_hook( forward_pre_hook) outs_hook = simplenet(input, y) - self.assertTrue(call_forward_hook) + self.assertTrue(call_forward_post_hook) self.assertTrue(call_forward_pre_hook) outs_hook = simplenet(input, y) - self.assertTrue(call_forward_hook) + self.assertTrue(call_forward_post_hook) self.assertTrue(call_forward_pre_hook) - # remove forward_hook - forward_hook_handle.remove() - call_forward_hook = False + # remove forward_post_hook + forward_post_hook_handle.remove() + call_forward_post_hook = False call_forward_pre_hook = False outs_remove_forward_hook = simplenet(input, y) - self.assertFalse(call_forward_hook) 
+ self.assertFalse(call_forward_post_hook) self.assertTrue(call_forward_pre_hook) # remove forward_pre_hook forward_pre_hook_handle.remove() - call_forward_hook = False + call_forward_post_hook = False call_forward_pre_hook = False outs_remove_hook = simplenet(input, y) - self.assertFalse(call_forward_hook) + self.assertFalse(call_forward_post_hook) self.assertFalse(call_forward_pre_hook) + def test_forward_hook_return_value(self): + with _test_eager_guard(): + self.func_forward_hook() + self.func_forward_hook_return_value() + self.func_forward_hook() + self.func_forward_hook_return_value() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py b/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py index 7b8d31ff030e503f872b9afd923ce4c6252a026a..1881f1bbbd4c330c522a6304ea3fe004fafbeb3b 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py @@ -41,7 +41,7 @@ class TestImperativeNumpyBridge(unittest.TestCase): data_np[0][0] = -1 self.assertEqual(data_np[0][0], -1) if _in_eager_mode(): - # eager_mode, var2 is EagerTensor, is not subscriptable + # eager_mode, var2 is Tensor, is not subscriptable # TODO(wuweilong): to support slice in eager mode later self.assertNotEqual(var2.numpy()[0][0], -1) else: diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 1bd5e08e28ef0f151b8c78b2537a08c66dd20e22..36038d656b7736afc94da32c29c56ce61b338cb4 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -34,6 +34,7 @@ from test_imperative_base import new_program_scope from paddle.fluid.dygraph import nn from paddle.fluid.dygraph import base from paddle.fluid.dygraph import to_variable +from paddle.fluid.framework import _test_eager_guard class LayerTest(unittest.TestCase): @@ -98,6 +99,14 @@ class TestLayer(LayerTest): return ret with self.dynamic_graph(): + with _test_eager_guard(): + inp = np.ones([3, 3], dtype='float32') + x = base.to_variable(inp) + custom = CustomLayer(input_size=3, linear1_size=2) + ret = custom(x, do_linear2=False) + self.assertTrue(np.array_equal(ret.numpy().shape, [3, 2])) + ret = custom(x, do_linear2=True) + self.assertTrue(np.array_equal(ret.numpy().shape, [3, 1])) inp = np.ones([3, 3], dtype='float32') x = base.to_variable(inp) custom = CustomLayer(input_size=3, linear1_size=2) @@ -121,6 +130,15 @@ class TestLayer(LayerTest): static_ret, static_ret2 = self.get_static_graph_result( feed={'data': inp}, fetch_list=[ret, ret2]) with self.dynamic_graph(): + with _test_eager_guard(): + t = base.to_variable(inp) + dropout = nn.Dropout(p=0.35, seed=1, is_test=False) + dy_eager_ret = dropout(t) + dy_eager_ret2 = fluid.layers.dropout( + t, dropout_prob=0.35, seed=1, is_test=False) + dy_eager_ret_value = dy_eager_ret.numpy() + dy_eager_ret2_value = dy_eager_ret2.numpy() + t = base.to_variable(inp) dropout = nn.Dropout(p=0.35, seed=1, is_test=False) dy_ret = dropout(t) @@ -129,6 +147,9 @@ class TestLayer(LayerTest): dy_ret_value = dy_ret.numpy() dy_ret2_value = dy_ret2.numpy() + self.assertTrue(np.array_equal(dy_eager_ret_value, dy_eager_ret2_value)) + self.assertTrue(np.array_equal(static_ret, dy_eager_ret_value)) + self.assertTrue(np.array_equal(static_ret, static_ret2)) self.assertTrue(np.array_equal(dy_ret_value, dy_ret2_value)) self.assertTrue(np.array_equal(static_ret, 
dy_ret_value)) @@ -147,12 +168,22 @@ class TestLayer(LayerTest): static_ret = self.get_static_graph_result( feed={'data': inp}, fetch_list=[ret])[0] with self.dynamic_graph(): + with _test_eager_guard(): + t = base.to_variable(inp) + linear = nn.Linear( + 32, + 4, + bias_attr=fluid.initializer.ConstantInitializer(value=1)) + dy_eager_ret = linear(t) + dy_eager_ret_value = dy_eager_ret.numpy() + t = base.to_variable(inp) linear = nn.Linear( 32, 4, bias_attr=fluid.initializer.ConstantInitializer(value=1)) dy_ret = linear(t) dy_ret_value = dy_ret.numpy() + self.assertTrue(np.array_equal(static_ret, dy_eager_ret_value)) self.assertTrue(np.array_equal(static_ret, dy_ret_value)) with self.static_graph(): @@ -193,11 +224,18 @@ class TestLayer(LayerTest): static_ret = self.get_static_graph_result( feed={'data': inp}, fetch_list=[ret])[0] with self.dynamic_graph(): + with _test_eager_guard(): + t = base.to_variable(inp) + flatten = nn.Flatten() + dy_eager_ret = flatten(t) + dy_eager_ret_value = dy_eager_ret.numpy() + t = base.to_variable(inp) flatten = nn.Flatten() dy_ret = flatten(t) dy_ret_value = dy_ret.numpy() + self.assertTrue(np.array_equal(static_ret, dy_eager_ret_value)) self.assertTrue(np.array_equal(static_ret, dy_ret_value)) with self.static_graph(): @@ -253,13 +291,35 @@ class TestLayer(LayerTest): static_ret2 = self.get_static_graph_result( feed={'data': inp}, fetch_list=[ret])[0] with self.dynamic_graph(): + with _test_eager_guard(): + lm = nn.LayerNorm( + normalized_shape=[32, 32], + bias_attr=fluid.initializer.ConstantInitializer(value=1), + act='sigmoid') + dy_eager_ret = lm(base.to_variable(inp)) + dy_eager_ret_value = dy_eager_ret.numpy() + lm = nn.LayerNorm( normalized_shape=[32, 32], bias_attr=fluid.initializer.ConstantInitializer(value=1), act='sigmoid') dy_ret = lm(base.to_variable(inp)) dy_ret_value = dy_ret.numpy() + with self.dynamic_graph(): + with _test_eager_guard(): + lm = nn.LayerNorm( + normalized_shape=[32, 32], + shift=False, + scale=False, + param_attr=fluid.initializer.ConstantInitializer(value=1), + bias_attr=fluid.initializer.ConstantInitializer(value=1), + act='sigmoid') + lm(base.to_variable(inp)) + + self.assertFalse(hasattr(lm, "_scale_w")) + self.assertFalse(hasattr(lm, "_bias_w")) + lm = nn.LayerNorm( normalized_shape=[32, 32], shift=False, @@ -273,9 +333,18 @@ class TestLayer(LayerTest): self.assertFalse(hasattr(lm, "_bias_w")) self.assertTrue(np.array_equal(static_ret, static_ret2)) + self.assertTrue(np.array_equal(dy_eager_ret_value, static_ret2)) self.assertTrue(np.array_equal(dy_ret_value, static_ret2)) with self.dynamic_graph(): + with _test_eager_guard(): + lm = nn.LayerNorm( + normalized_shape=[16, 32], + bias_attr=fluid.initializer.ConstantInitializer(value=1), + act='sigmoid') + with self.assertRaises(ValueError): + lm(base.to_variable(inp)) + lm = nn.LayerNorm( normalized_shape=[16, 32], bias_attr=fluid.initializer.ConstantInitializer(value=1), @@ -295,11 +364,18 @@ class TestLayer(LayerTest): fetch_list=[ret])[0] with self.dynamic_graph(): + with _test_eager_guard(): + t = np.ones([3, 3, 5, 5], dtype='float32') + my_syncbn = paddle.nn.SyncBatchNorm(3) + dy_eager_ret = my_syncbn(base.to_variable(t)) + dy_eager_ret_value = dy_eager_ret.numpy() + t = np.ones([3, 3, 5, 5], dtype='float32') my_syncbn = paddle.nn.SyncBatchNorm(3) dy_ret = my_syncbn(base.to_variable(t)) dy_ret_value = dy_ret.numpy() self.assertTrue(np.array_equal(static_ret, dy_ret_value)) + self.assertTrue(np.array_equal(static_ret, dy_eager_ret_value)) def test_relu(self): with 
self.static_graph(): @@ -310,11 +386,17 @@ class TestLayer(LayerTest): [3, 3], dtype='float32')}, fetch_list=[ret])[0] with self.dynamic_graph(): + with _test_eager_guard(): + t = np.ones([3, 3], dtype='float32') + dy_eager_ret = layers.relu(base.to_variable(t)) + dy_eager_ret_value = dy_eager_ret.numpy() + t = np.ones([3, 3], dtype='float32') dy_ret = layers.relu(base.to_variable(t)) dy_ret_value = dy_ret.numpy() self.assertTrue(np.allclose(static_ret, dy_ret_value)) + self.assertTrue(np.allclose(static_ret, dy_eager_ret_value)) def test_matmul(self): with self.static_graph(): @@ -331,12 +413,20 @@ class TestLayer(LayerTest): fetch_list=[ret])[0] with self.dynamic_graph(): + with _test_eager_guard(): + t = np.ones([3, 3], dtype='float32') + t2 = np.ones([3, 3], dtype='float32') + dy_eager_ret = layers.matmul( + base.to_variable(t), base.to_variable(t2)) + dy_eager_ret_value = dy_eager_ret.numpy() + t = np.ones([3, 3], dtype='float32') t2 = np.ones([3, 3], dtype='float32') dy_ret = layers.matmul(base.to_variable(t), base.to_variable(t2)) dy_ret_value = dy_ret.numpy() self.assertTrue(np.allclose(static_ret, dy_ret_value)) + self.assertTrue(np.allclose(static_ret, dy_eager_ret_value)) def test_conv2d(self): with self.static_graph(): @@ -358,6 +448,13 @@ class TestLayer(LayerTest): fetch_list=[ret])[0] with self.dynamic_graph(): + with _test_eager_guard(): + images = np.ones([2, 3, 5, 5], dtype='float32') + conv2d = nn.Conv2D( + num_channels=3, num_filters=3, filter_size=[2, 2]) + dy_eager_ret = conv2d(base.to_variable(images)) + dy_eager_ret_value = dy_eager_ret.numpy() + images = np.ones([2, 3, 5, 5], dtype='float32') conv2d = nn.Conv2D( num_channels=3, num_filters=3, filter_size=[2, 2]) @@ -365,6 +462,16 @@ class TestLayer(LayerTest): dy_ret_value = dy_ret.numpy() with self.dynamic_graph(): + with _test_eager_guard(): + images = np.ones([2, 3, 5, 5], dtype='float32') + conv2d = nn.Conv2D( + num_channels=3, + num_filters=3, + filter_size=[2, 2], + bias_attr=False) + dy_ret = conv2d(base.to_variable(images)) + self.assertTrue(conv2d.bias is None) + images = np.ones([2, 3, 5, 5], dtype='float32') conv2d = nn.Conv2D( num_channels=3, @@ -396,9 +503,49 @@ class TestLayer(LayerTest): self.assertRaises(TypeError, test_type) self.assertTrue(np.allclose(static_ret, dy_ret_value)) + self.assertTrue(np.allclose(static_ret, dy_eager_ret_value)) self.assertTrue(np.allclose(static_ret, static_ret2)) with self.dynamic_graph(): + with _test_eager_guard(): + images = np.ones([2, 3, 5, 5], dtype='float32') + custom_weight = np.random.randn(3, 3, 2, 2).astype("float32") + weight_attr = fluid.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer( + custom_weight)) + conv2d1 = nn.Conv2D( + num_channels=3, num_filters=3, filter_size=[2, 2]) + conv2d2 = nn.Conv2D( + num_channels=3, + num_filters=3, + filter_size=[2, 2], + param_attr=weight_attr) + dy_ret1 = conv2d1(base.to_variable(images)) + dy_ret2 = conv2d2(base.to_variable(images)) + self.assertFalse( + np.array_equal(dy_ret1.numpy(), dy_ret2.numpy())) + + conv2d1_weight_np = conv2d1.weight.numpy() + conv2d1_bias = conv2d1.bias + self.assertFalse( + np.array_equal(conv2d1_weight_np, conv2d2.weight.numpy())) + conv2d2.weight.set_value(conv2d1_weight_np) + self.assertTrue( + np.array_equal(conv2d1_weight_np, conv2d2.weight.numpy())) + conv2d2.bias.set_value(conv2d1_bias) + dy_ret1 = conv2d1(base.to_variable(images)) + dy_ret2 = conv2d2(base.to_variable(images)) + self.assertTrue( + np.array_equal(dy_ret1.numpy(), dy_ret2.numpy())) + + conv2d2.weight 
= conv2d1.weight + conv2d2.bias = conv2d1.bias + self.assertTrue( + np.array_equal(conv2d1.weight.numpy(), + conv2d2.weight.numpy())) + self.assertTrue( + np.array_equal(conv2d1.bias.numpy(), conv2d2.bias.numpy())) + images = np.ones([2, 3, 5, 5], dtype='float32') custom_weight = np.random.randn(3, 3, 2, 2).astype("float32") weight_attr = fluid.ParamAttr( @@ -467,6 +614,14 @@ class TestLayer(LayerTest): fetch_list=[updated_hidden, reset_hidden_pre, gate]) with self.dynamic_graph(): + with _test_eager_guard(): + gru = nn.GRUUnit(size=D * 3) + dy_eager_ret = gru( + base.to_variable(input), base.to_variable(hidden_input)) + dy_eager_ret_value = [] + for i in range(len(static_ret)): + dy_eager_ret_value.append(dy_eager_ret[i].numpy()) + gru = nn.GRUUnit(size=D * 3) dy_ret = gru( base.to_variable(input), base.to_variable(hidden_input)) @@ -477,8 +632,40 @@ class TestLayer(LayerTest): for i in range(len(static_ret)): self.assertTrue(np.allclose(static_ret[i], static_ret2[i])) self.assertTrue(np.allclose(static_ret[i], dy_ret_value[i])) + self.assertTrue(np.allclose(static_ret[i], dy_eager_ret_value[i])) with self.dynamic_graph(): + with _test_eager_guard(): + custom_weight = np.random.randn(D, D * 3).astype("float32") + weight_attr = fluid.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer( + custom_weight)) + gru1 = nn.GRUUnit(size=D * 3) + gru2 = nn.GRUUnit(size=D * 3, param_attr=weight_attr) + dy_ret1 = gru1( + base.to_variable(input), base.to_variable(hidden_input)) + dy_ret2 = gru2( + base.to_variable(input), base.to_variable(hidden_input)) + self.assertFalse( + np.array_equal(gru1.weight.numpy(), gru2.weight.numpy())) + for o1, o2 in zip(dy_ret1, dy_ret2): + self.assertFalse(np.array_equal(o1.numpy(), o2.numpy())) + gru2.weight.set_value(gru1.weight.numpy()) + gru2.bias.set_value(gru1.bias) + dy_ret1 = gru1( + base.to_variable(input), base.to_variable(hidden_input)) + dy_ret2 = gru2( + base.to_variable(input), base.to_variable(hidden_input)) + for o1, o2 in zip(dy_ret1, dy_ret2): + self.assertTrue(np.array_equal(o1.numpy(), o2.numpy())) + + gru2.weight = gru1.weight + gru2.bias = gru1.bias + self.assertTrue( + np.array_equal(gru1.weight.numpy(), gru2.weight.numpy())) + self.assertTrue( + np.array_equal(gru1.bias.numpy(), gru2.bias.numpy())) + custom_weight = np.random.randn(D, D * 3).astype("float32") weight_attr = fluid.ParamAttr( initializer=fluid.initializer.NumpyArrayInitializer( @@ -543,19 +730,37 @@ class TestLayer(LayerTest): fetch_list=[ret])[0] with self.dynamic_graph(): + with _test_eager_guard(): + ret = layers.elementwise_add(to_variable(n), to_variable(n2)) + ret = layers.elementwise_pow(ret, to_variable(n3)) + ret = layers.elementwise_div(ret, to_variable(n4)) + ret = layers.elementwise_sub(ret, to_variable(n5)) + dy_eager_ret = layers.elementwise_mul(ret, to_variable(n6)) + dy_eager_ret_value = dy_eager_ret.numpy() + ret = layers.elementwise_add(to_variable(n), to_variable(n2)) ret = layers.elementwise_pow(ret, to_variable(n3)) ret = layers.elementwise_div(ret, to_variable(n4)) ret = layers.elementwise_sub(ret, to_variable(n5)) dy_ret = layers.elementwise_mul(ret, to_variable(n6)) dy_ret_value = dy_ret.numpy() + self.assertTrue(np.allclose(static_ret, dy_ret_value)) + self.assertTrue(np.allclose(static_ret, dy_eager_ret_value)) def test_elementwise_minmax(self): n = np.ones([3, 3], dtype='float32') n2 = np.ones([3, 3], dtype='float32') * 2 with self.dynamic_graph(): + with _test_eager_guard(): + min_eager_ret = layers.elementwise_min( + to_variable(n), 
to_variable(n2)) + max_eager_ret = layers.elementwise_max( + to_variable(n), to_variable(n2)) + min_eager_ret_value = min_eager_ret.numpy() + max_eager_ret_value = max_eager_ret.numpy() + min_ret = layers.elementwise_min(to_variable(n), to_variable(n2)) max_ret = layers.elementwise_max(to_variable(n), to_variable(n2)) min_ret_value = min_ret.numpy() @@ -563,6 +768,8 @@ class TestLayer(LayerTest): self.assertTrue(np.allclose(n, min_ret_value)) self.assertTrue(np.allclose(n2, max_ret_value)) + self.assertTrue(np.allclose(n, min_eager_ret_value)) + self.assertTrue(np.allclose(n2, max_eager_ret_value)) def test_sequence_conv(self): inp_np = np.arange(12).reshape([3, 4]).astype('float32') @@ -633,6 +840,16 @@ class TestLayer(LayerTest): static_rlt2 = self.get_static_graph_result( feed={'pixel': inp_np}, fetch_list=[out])[0] with self.dynamic_graph(): + with _test_eager_guard(): + conv2d_transpose = nn.Conv2DTranspose( + num_channels=3, + num_filters=10, + filter_size=27, + act='sigmoid', + bias_attr=fluid.initializer.ConstantInitializer(value=1)) + dy_eager_rlt = conv2d_transpose(base.to_variable(inp_np)) + dy_eager_rlt_value = dy_eager_rlt.numpy() + conv2d_transpose = nn.Conv2DTranspose( num_channels=3, num_filters=10, @@ -643,8 +860,48 @@ class TestLayer(LayerTest): dy_rlt_value = dy_rlt.numpy() self.assertTrue(np.allclose(static_rlt2, static_rlt)) self.assertTrue(np.allclose(dy_rlt_value, static_rlt2)) + self.assertTrue(np.allclose(dy_eager_rlt_value, static_rlt2)) with self.dynamic_graph(): + with _test_eager_guard(): + images = np.ones([2, 3, 5, 5], dtype='float32') + custom_weight = np.random.randn(3, 3, 2, 2).astype("float32") + weight_attr = fluid.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer( + custom_weight)) + conv2d1 = nn.Conv2DTranspose( + num_channels=3, num_filters=3, filter_size=[2, 2]) + conv2d2 = nn.Conv2DTranspose( + num_channels=3, + num_filters=3, + filter_size=[2, 2], + param_attr=weight_attr) + dy_ret1 = conv2d1(base.to_variable(images)) + dy_ret2 = conv2d2(base.to_variable(images)) + self.assertFalse( + np.array_equal(dy_ret1.numpy(), dy_ret2.numpy())) + + conv2d1_weight_np = conv2d1.weight.numpy() + conv2d1_bias = conv2d1.bias + self.assertFalse( + np.array_equal(conv2d1_weight_np, conv2d2.weight.numpy())) + conv2d2.weight.set_value(conv2d1_weight_np) + self.assertTrue( + np.array_equal(conv2d1_weight_np, conv2d2.weight.numpy())) + conv2d2.bias.set_value(conv2d1_bias) + dy_ret1 = conv2d1(base.to_variable(images)) + dy_ret2 = conv2d2(base.to_variable(images)) + self.assertTrue( + np.array_equal(dy_ret1.numpy(), dy_ret2.numpy())) + + conv2d2.weight = conv2d1.weight + conv2d2.bias = conv2d1.bias + self.assertTrue( + np.array_equal(conv2d1.weight.numpy(), + conv2d2.weight.numpy())) + self.assertTrue( + np.array_equal(conv2d1.bias.numpy(), conv2d2.bias.numpy())) + images = np.ones([2, 3, 5, 5], dtype='float32') custom_weight = np.random.randn(3, 3, 2, 2).astype("float32") weight_attr = fluid.ParamAttr( @@ -750,6 +1007,17 @@ class TestLayer(LayerTest): feed={'x': inp_np_x, 'y': inp_np_y}, fetch_list=[out])[0] with self.dynamic_graph(): + with _test_eager_guard(): + btp = nn.BilinearTensorProduct( + 3, + 3, + 6, + bias_attr=fluid.initializer.ConstantInitializer(value=1), + act='sigmoid') + dy_eager_rlt = btp( + base.to_variable(inp_np_x), base.to_variable(inp_np_y)) + dy_eager_rlt_value = dy_eager_rlt.numpy() + btp = nn.BilinearTensorProduct( 3, 3, @@ -758,11 +1026,19 @@ class TestLayer(LayerTest): act='sigmoid') dy_rlt = btp(base.to_variable(inp_np_x), 
base.to_variable(inp_np_y)) dy_rlt_value = dy_rlt.numpy() + with self.dynamic_graph(): + with _test_eager_guard(): + btp2 = nn.BilinearTensorProduct(3, 3, 6, act='sigmoid') + dy_eager_rlt2 = btp2( + base.to_variable(inp_np_x), base.to_variable(inp_np_y)) + dy_eager_rlt2_value = dy_eager_rlt2.numpy() + btp2 = nn.BilinearTensorProduct(3, 3, 6, act='sigmoid') dy_rlt2 = btp2( base.to_variable(inp_np_x), base.to_variable(inp_np_y)) dy_rlt2_value = dy_rlt2.numpy() + with self.static_graph(): data_x2 = layers.data( name='x', @@ -782,10 +1058,42 @@ class TestLayer(LayerTest): 'y': inp_np_y}, fetch_list=[out2])[0] self.assertTrue(np.array_equal(dy_rlt2_value, static_rlt3)) + self.assertTrue(np.array_equal(dy_eager_rlt2_value, static_rlt3)) self.assertTrue(np.array_equal(static_rlt2, static_rlt)) self.assertTrue(np.array_equal(dy_rlt_value, static_rlt)) + self.assertTrue(np.array_equal(dy_eager_rlt_value, static_rlt)) with self.dynamic_graph(): + with _test_eager_guard(): + custom_weight = np.random.randn(6, 3, 3).astype("float32") + weight_attr = fluid.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer( + custom_weight)) + btp1 = nn.BilinearTensorProduct(3, 3, 6, act='sigmoid') + btp2 = nn.BilinearTensorProduct( + 3, 3, 6, act='sigmoid', param_attr=weight_attr) + dy_rlt1 = btp1( + base.to_variable(inp_np_x), base.to_variable(inp_np_y)) + dy_rlt2 = btp2( + base.to_variable(inp_np_x), base.to_variable(inp_np_y)) + self.assertFalse( + np.array_equal(dy_rlt1.numpy(), dy_rlt2.numpy())) + btp2.weight.set_value(btp1.weight.numpy()) + btp2.bias.set_value(btp1.bias) + dy_rlt1 = btp1( + base.to_variable(inp_np_x), base.to_variable(inp_np_y)) + dy_rlt2 = btp2( + base.to_variable(inp_np_x), base.to_variable(inp_np_y)) + self.assertTrue( + np.array_equal(dy_rlt1.numpy(), dy_rlt2.numpy())) + + btp2.weight = btp1.weight + btp2.bias = btp1.bias + self.assertTrue( + np.array_equal(btp1.weight.numpy(), btp2.weight.numpy())) + self.assertTrue( + np.array_equal(btp1.bias.numpy(), btp2.bias.numpy())) + custom_weight = np.random.randn(6, 3, 3).astype("float32") weight_attr = fluid.ParamAttr( initializer=fluid.initializer.NumpyArrayInitializer( @@ -842,6 +1150,15 @@ class TestLayer(LayerTest): feed={"input": inp_np}, fetch_list=[out])[0] with self.dynamic_graph(): + with _test_eager_guard(): + prelu = nn.PRelu( + mode=mode, + channel=inp_np.shape[1], + input_shape=inp_np.shape, + param_attr=ParamAttr(initializer=Constant(1.0))) + dy_eager_rlt = prelu(base.to_variable(inp_np)) + dy_eager_rlt_value = dy_eager_rlt.numpy() + prelu = nn.PRelu( mode=mode, channel=inp_np.shape[1], @@ -852,8 +1169,40 @@ class TestLayer(LayerTest): self.assertTrue(np.allclose(static_rlt2, static_rlt)) self.assertTrue(np.allclose(dy_rlt_value, static_rlt)) + self.assertTrue(np.allclose(dy_eager_rlt_value, static_rlt)) with self.dynamic_graph(): + with _test_eager_guard(): + inp_np = np.random.randn(5, 200, 100, 100).astype("float32") + inp = base.to_variable(inp_np) + prelu1 = nn.PRelu( + mode=mode, + channel=inp_np.shape[1], + input_shape=inp_np.shape, + param_attr=ParamAttr(initializer=Constant(2.0))) + prelu2 = nn.PRelu( + mode=mode, + channel=inp_np.shape[1], + input_shape=inp_np.shape, + param_attr=ParamAttr(initializer=Constant(1.0))) + dy_rlt1 = prelu1(inp) + dy_rlt2 = prelu2(inp) + self.assertFalse( + np.array_equal(prelu1.weight.numpy(), prelu2.weight.numpy( + ))) + self.assertFalse( + np.array_equal(dy_rlt1.numpy(), dy_rlt2.numpy())) + prelu2.weight.set_value(prelu1.weight.numpy()) + dy_rlt1 = prelu1(inp) + dy_rlt2 = 
prelu2(inp) + self.assertTrue( + np.array_equal(dy_rlt1.numpy(), dy_rlt2.numpy())) + + prelu2.weight = prelu1.weight + self.assertTrue( + np.array_equal(prelu1.weight.numpy(), prelu2.weight.numpy( + ))) + inp_np = np.random.randn(5, 200, 100, 100).astype("float32") inp = base.to_variable(inp_np) prelu1 = nn.PRelu( @@ -905,6 +1254,14 @@ class TestLayer(LayerTest): static_rlt2 = self.get_static_graph_result( feed={'word': inp_word}, fetch_list=[emb_rlt])[0] with self.dynamic_graph(): + with _test_eager_guard(): + emb2 = nn.Embedding( + size=[dict_size, 32], + param_attr='eager_emb.w', + is_sparse=False) + dy_eager_rlt = emb2(base.to_variable(inp_word)) + dy_eager_rlt_value = dy_eager_rlt.numpy() + emb2 = nn.Embedding( size=[dict_size, 32], param_attr='emb.w', is_sparse=False) dy_rlt = emb2(base.to_variable(inp_word)) @@ -912,8 +1269,34 @@ class TestLayer(LayerTest): self.assertTrue(np.allclose(static_rlt2, static_rlt)) self.assertTrue(np.allclose(dy_rlt_value, static_rlt)) + self.assertTrue(np.allclose(dy_eager_rlt_value, static_rlt)) with self.dynamic_graph(): + with _test_eager_guard(): + custom_weight = np.random.randn(dict_size, 32).astype("float32") + weight_attr = fluid.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer( + custom_weight)) + emb1 = nn.Embedding(size=[dict_size, 32], is_sparse=False) + emb2 = nn.Embedding( + size=[dict_size, 32], + param_attr=weight_attr, + is_sparse=False) + rep1 = emb1(base.to_variable(inp_word)) + rep2 = emb2(base.to_variable(inp_word)) + self.assertFalse( + np.array_equal(emb1.weight.numpy(), custom_weight)) + self.assertTrue( + np.array_equal(emb2.weight.numpy(), custom_weight)) + self.assertFalse(np.array_equal(rep1.numpy(), rep2.numpy())) + emb2.weight.set_value(emb1.weight.numpy()) + rep2 = emb2(base.to_variable(inp_word)) + self.assertTrue(np.array_equal(rep1.numpy(), rep2.numpy())) + + emb2.weight = emb1.weight + self.assertTrue( + np.array_equal(emb1.weight.numpy(), emb2.weight.numpy())) + custom_weight = np.random.randn(dict_size, 32).astype("float32") weight_attr = fluid.ParamAttr( initializer=fluid.initializer.NumpyArrayInitializer( @@ -978,6 +1361,7 @@ class TestLayer(LayerTest): feed_dict['word_{0}'.format(i)] = inp_word[i] static_rlt = self.get_static_graph_result( feed=feed_dict, fetch_list=[nce_loss])[0] + with self.static_graph(): words = [] for i in range(window_size): @@ -1018,6 +1402,41 @@ class TestLayer(LayerTest): feed=feed_dict, fetch_list=[nce_loss2])[0] with self.dynamic_graph(): + with _test_eager_guard(): + words = [] + for i in range(window_size): + words.append(base.to_variable(inp_word[i])) + sample_weights = layers.fill_constant( + shape=[5, 1], dtype='float32', value=1) + emb = nn.Embedding( + size=[dict_size, 32], + param_attr='eager_emb.w', + is_sparse=False) + + embs3 = [] + for i in range(window_size): + if i == label_word: + continue + + emb_rlt = emb(words[i]) + embs3.append(emb_rlt) + + embs3 = layers.concat( + input=embs3, axis=fluid.dygraph.to_variable(np.array([1]))) + nce = nn.NCE(num_total_classes=dict_size, + dim=embs3.shape[1], + num_neg_samples=2, + sampler="custom_dist", + custom_dist=nid_freq_arr.tolist(), + seed=seed, + param_attr='eager_nce.w', + bias_attr='eager_nce.b', + sample_weight=sample_weights) + + wl = fluid.layers.unsqueeze(words[label_word], axes=[0]) + dy_eager_rlt = nce(embs3, wl) + dy_eager_rlt_value = dy_eager_rlt.numpy() + words = [] for i in range(window_size): words.append(base.to_variable(inp_word[i])) @@ -1052,8 +1471,75 @@ class TestLayer(LayerTest): 
self.assertTrue(np.allclose(static_rlt2, static_rlt)) self.assertTrue(np.allclose(dy_rlt_value, static_rlt)) + self.assertTrue(np.allclose(dy_eager_rlt_value, static_rlt)) with self.dynamic_graph(): + with _test_eager_guard(): + custom_weight = np.random.randn(dict_size, + 128).astype("float32") + weight_attr = fluid.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer( + custom_weight)) + words = [] + for i in range(window_size): + words.append(base.to_variable(inp_word[i])) + sample_weights = layers.fill_constant( + shape=fluid.dygraph.to_variable(np.array([5, 1])), + dtype='float32', + value=1) + emb = nn.Embedding( + size=[dict_size, 32], + param_attr='eager_emb.w', + is_sparse=False) + + embs3 = [] + for i in range(window_size): + if i == label_word: + continue + + emb_rlt = emb(words[i]) + embs3.append(emb_rlt) + + embs3 = layers.concat(input=embs3, axis=1) + nce1 = nn.NCE(num_total_classes=dict_size, + dim=embs3.shape[1], + num_neg_samples=2, + sampler="custom_dist", + custom_dist=nid_freq_arr.tolist(), + seed=seed, + param_attr='eager_nce1.w', + bias_attr='eager_nce1.b', + sample_weight=sample_weights) + + nce2 = nn.NCE(num_total_classes=dict_size, + dim=embs3.shape[1], + num_neg_samples=2, + sampler="custom_dist", + custom_dist=nid_freq_arr.tolist(), + seed=seed, + param_attr=weight_attr, + bias_attr='eager_nce2.b', + sample_weight=sample_weights) + + wl = fluid.layers.unsqueeze(words[label_word], axes=[0]) + nce1_loss = nce1(embs3, wl) + nce2_loss = nce2(embs3, wl) + self.assertFalse( + np.array_equal(nce1_loss.numpy(), nce2_loss.numpy())) + nce2.weight.set_value(nce1.weight.numpy()) + nce2.bias.set_value(nce1.bias) + nce1_loss = nce1(embs3, wl) + nce2_loss = nce2(embs3, wl) + self.assertTrue( + np.array_equal(nce1_loss.numpy(), nce2_loss.numpy())) + + nce2.weight = nce1.weight + nce2.bias = nce1.bias + self.assertTrue( + np.array_equal(nce1.weight.numpy(), nce2.weight.numpy())) + self.assertTrue( + np.array_equal(nce1.bias.numpy(), nce2.bias.numpy())) + custom_weight = np.random.randn(dict_size, 128).astype("float32") weight_attr = fluid.ParamAttr( initializer=fluid.initializer.NumpyArrayInitializer( @@ -1118,6 +1604,17 @@ class TestLayer(LayerTest): def test_one_hot(self): with self.dynamic_graph(): + with _test_eager_guard(): + label = fluid.dygraph.to_variable( + np.array([[1], [1], [3], [0]])) + one_hot_label1 = fluid.layers.one_hot(input=label, depth=4) + one_hot_label2 = fluid.layers.one_hot( + input=label, + depth=fluid.dygraph.to_variable(np.array([4]))) + self.assertTrue( + np.array_equal(one_hot_label1.numpy(), + one_hot_label2.numpy())) + label = fluid.dygraph.to_variable(np.array([[1], [1], [3], [0]])) one_hot_label1 = fluid.layers.one_hot(input=label, depth=4) one_hot_label2 = fluid.layers.one_hot( @@ -1127,6 +1624,16 @@ class TestLayer(LayerTest): def test_split(self): with self.dynamic_graph(): + with _test_eager_guard(): + input = fluid.dygraph.to_variable(np.random.random((3, 8, 5))) + x0, x1 = fluid.layers.split(input, num_or_sections=2, dim=1) + x00, x11 = fluid.layers.split( + input, + num_or_sections=2, + dim=fluid.dygraph.to_variable(np.array([1]))) + self.assertTrue(np.array_equal(x0.numpy(), x00.numpy())) + self.assertTrue(np.array_equal(x1.numpy(), x11.numpy())) + input = fluid.dygraph.to_variable(np.random.random((3, 8, 5))) x0, x1 = fluid.layers.split(input, num_or_sections=2, dim=1) x00, x11 = fluid.layers.split( @@ -1138,6 +1645,17 @@ class TestLayer(LayerTest): def test_topk(self): with self.dynamic_graph(): + with _test_eager_guard(): + 
input = fluid.dygraph.to_variable(np.random.random((13, 11))) + top5_values1, top5_indices1 = layers.topk(input, k=5) + top5_values2, top5_indices2 = layers.topk( + input, k=fluid.dygraph.to_variable(np.array([5]))) + self.assertTrue( + np.array_equal(top5_values1.numpy(), top5_values2.numpy())) + self.assertTrue( + np.array_equal(top5_indices1.numpy(), top5_indices2.numpy( + ))) + input = fluid.dygraph.to_variable(np.random.random((13, 11))) top5_values1, top5_indices1 = layers.topk(input, k=5) top5_values2, top5_indices2 = layers.topk( @@ -1168,15 +1686,61 @@ class TestLayer(LayerTest): fetch_list=[ret])[0] with self.dynamic_graph(): + with _test_eager_guard(): + images = np.ones([2, 3, 6, 6, 6], dtype='float32') + conv3d = nn.Conv3D(num_channels=3, num_filters=3, filter_size=2) + dy_eager_ret = conv3d(base.to_variable(images)) + dy_eager_rlt_value = dy_eager_ret.numpy() + images = np.ones([2, 3, 6, 6, 6], dtype='float32') conv3d = nn.Conv3D(num_channels=3, num_filters=3, filter_size=2) dy_ret = conv3d(base.to_variable(images)) dy_rlt_value = dy_ret.numpy() self.assertTrue(np.allclose(static_ret, dy_rlt_value)) + self.assertTrue(np.allclose(static_ret, dy_eager_rlt_value)) self.assertTrue(np.allclose(static_ret, static_ret2)) with self.dynamic_graph(): + with _test_eager_guard(): + images = np.ones([2, 3, 6, 6, 6], dtype='float32') + custom_weight = np.random.randn(3, 3, 2, 2, 2).astype("float32") + weight_attr = fluid.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer( + custom_weight)) + conv3d1 = nn.Conv3D( + num_channels=3, num_filters=3, filter_size=2) + conv3d2 = nn.Conv3D( + num_channels=3, + num_filters=3, + filter_size=2, + param_attr=weight_attr) + dy_ret1 = conv3d1(base.to_variable(images)) + dy_ret2 = conv3d2(base.to_variable(images)) + self.assertFalse( + np.array_equal(dy_ret1.numpy(), dy_ret2.numpy())) + + conv3d1_weight_np = conv3d1.weight.numpy() + conv3d1_bias = conv3d1.bias + self.assertFalse( + np.array_equal(conv3d1_weight_np, conv3d2.weight.numpy())) + conv3d2.weight.set_value(conv3d1_weight_np) + self.assertTrue( + np.array_equal(conv3d1_weight_np, conv3d2.weight.numpy())) + conv3d1.bias.set_value(conv3d1_bias) + dy_ret1 = conv3d1(base.to_variable(images)) + dy_ret2 = conv3d2(base.to_variable(images)) + self.assertTrue( + np.array_equal(dy_ret1.numpy(), dy_ret2.numpy())) + + conv3d2.weight = conv3d1.weight + conv3d2.bias = conv3d1.bias + self.assertTrue( + np.array_equal(conv3d1.weight.numpy(), + conv3d2.weight.numpy())) + self.assertTrue( + np.array_equal(conv3d1.bias.numpy(), conv3d2.bias.numpy())) + images = np.ones([2, 3, 6, 6, 6], dtype='float32') custom_weight = np.random.randn(3, 3, 2, 2, 2).astype("float32") weight_attr = fluid.ParamAttr( @@ -1309,6 +1873,7 @@ class TestLayer(LayerTest): with_lod=True)[0] with self.dynamic_graph(): + # TODO(wuweilong): Add with _test_eager_guard(): groupNorm = nn.GroupNorm( channels=shape[1], groups=2, @@ -1347,17 +1912,29 @@ class TestLayer(LayerTest): feed={'X': input}, fetch_list=[ret])[0] with self.dynamic_graph(): + with _test_eager_guard(): + instanceNorm = nn.InstanceNorm(num_channels=shape[1]) + dy_eager_ret = instanceNorm(base.to_variable(input)) + dy_eager_rlt_value = dy_eager_ret.numpy() + instanceNorm = nn.InstanceNorm(num_channels=shape[1]) dy_ret = instanceNorm(base.to_variable(input)) dy_rlt_value = dy_ret.numpy() with self.dynamic_graph(): + with _test_eager_guard(): + instanceNorm = nn.InstanceNorm(num_channels=shape[1]) + dy_eager_ret = instanceNorm(base.to_variable(input)) + 
dy_eager_rlt_value2 = dy_eager_ret.numpy() + instanceNorm = nn.InstanceNorm(num_channels=shape[1]) dy_ret = instanceNorm(base.to_variable(input)) dy_rlt_value2 = dy_ret.numpy() self.assertTrue(np.allclose(static_ret, dy_rlt_value)) self.assertTrue(np.allclose(static_ret, dy_rlt_value2)) + self.assertTrue(np.allclose(static_ret, dy_eager_rlt_value)) + self.assertTrue(np.allclose(static_ret, dy_eager_rlt_value2)) self.assertTrue(np.allclose(static_ret, static_ret2)) with self.static_graph(): @@ -1420,11 +1997,17 @@ class TestLayer(LayerTest): with_lod=True)[0] with self.dynamic_graph(): + with _test_eager_guard(): + spectralNorm = nn.SpectralNorm(shape, dim=1, power_iters=2) + dy_eager_ret = spectralNorm(base.to_variable(input)) + dy_eager_rlt_value = dy_eager_ret.numpy() + spectralNorm = nn.SpectralNorm(shape, dim=1, power_iters=2) dy_ret = spectralNorm(base.to_variable(input)) dy_rlt_value = dy_ret.numpy() self.assertTrue(np.allclose(static_ret, dy_rlt_value)) + self.assertTrue(np.allclose(static_ret, dy_eager_rlt_value)) self.assertTrue(np.allclose(static_ret, static_ret2)) def test_tree_conv(self): @@ -1492,6 +2075,13 @@ class TestLayer(LayerTest): with_lod=False)[0] with self.dynamic_graph(): + with _test_eager_guard(): + treeConv = nn.TreeConv( + feature_size=5, output_size=6, num_filters=1, max_depth=2) + dy_eager_ret = treeConv( + base.to_variable(vectors), base.to_variable(adj)) + dy_eager_rlt_value = dy_eager_ret.numpy() + treeConv = nn.TreeConv( feature_size=5, output_size=6, num_filters=1, max_depth=2) dy_ret = treeConv(base.to_variable(vectors), base.to_variable(adj)) @@ -1499,8 +2089,51 @@ class TestLayer(LayerTest): self.assertTrue(np.allclose(static_ret, static_ret2)) self.assertTrue(np.allclose(static_ret, dy_rlt_value)) + self.assertTrue(np.allclose(static_ret, dy_eager_rlt_value)) with self.dynamic_graph(): + with _test_eager_guard(): + custom_weight = np.random.randn(5, 3, 6, 1).astype("float32") + weight_attr = fluid.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer( + custom_weight)) + treeConv1 = nn.TreeConv( + feature_size=5, + output_size=6, + num_filters=1, + max_depth=2, + bias_attr='eager_tc1_b') + treeConv2 = nn.TreeConv( + feature_size=5, + output_size=6, + num_filters=1, + max_depth=2, + param_attr=weight_attr, + bias_attr='eager_tc2_b') + dy_ret1 = treeConv1( + base.to_variable(vectors), base.to_variable(adj)) + dy_ret2 = treeConv2( + base.to_variable(vectors), base.to_variable(adj)) + self.assertFalse( + np.array_equal(dy_ret1.numpy(), dy_ret2.numpy())) + treeConv2.weight.set_value(treeConv1.weight.numpy()) + treeConv2.bias.set_value(treeConv1.bias) + dy_ret1 = treeConv1( + base.to_variable(vectors), base.to_variable(adj)) + dy_ret2 = treeConv2( + base.to_variable(vectors), base.to_variable(adj)) + self.assertTrue( + np.array_equal(dy_ret1.numpy(), dy_ret2.numpy())) + + treeConv2.weight = treeConv1.weight + treeConv2.bias = treeConv1.bias + self.assertTrue( + np.array_equal(treeConv1.weight.numpy(), + treeConv2.weight.numpy())) + self.assertTrue( + np.array_equal(treeConv1.bias.numpy(), + treeConv2.bias.numpy())) + custom_weight = np.random.randn(5, 3, 6, 1).astype("float32") weight_attr = fluid.ParamAttr( initializer=fluid.initializer.NumpyArrayInitializer( @@ -1557,14 +2190,69 @@ class TestLayer(LayerTest): static_rlt2 = self.get_static_graph_result( feed={'pixel': input_array}, fetch_list=[out])[0] with self.dynamic_graph(): + with _test_eager_guard(): + conv3d_transpose = nn.Conv3DTranspose( + num_channels=3, + num_filters=12, + 
filter_size=12, + use_cudnn=False) + dy_eager_rlt = conv3d_transpose(base.to_variable(input_array)) + dy_eager_rlt_value = dy_eager_rlt.numpy() + conv3d_transpose = nn.Conv3DTranspose( num_channels=3, num_filters=12, filter_size=12, use_cudnn=False) dy_rlt = conv3d_transpose(base.to_variable(input_array)) dy_rlt_value = dy_rlt.numpy() self.assertTrue(np.allclose(static_rlt2, static_rlt)) self.assertTrue(np.allclose(dy_rlt_value, static_rlt)) + self.assertTrue(np.allclose(dy_eager_rlt_value, static_rlt)) with self.dynamic_graph(): + with _test_eager_guard(): + images = np.ones([2, 3, 6, 6, 6], dtype='float32') + custom_weight = np.random.randn(3, 3, 2, 2, 2).astype("float32") + weight_attr = fluid.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer( + custom_weight)) + conv3d1 = nn.Conv3DTranspose( + num_channels=3, + num_filters=3, + filter_size=2, + bias_attr='eager_conv3d1_b', + use_cudnn=False) + conv3d2 = nn.Conv3DTranspose( + num_channels=3, + num_filters=3, + filter_size=2, + param_attr=weight_attr, + bias_attr='eager_conv3d2_b', + use_cudnn=False) + dy_ret1 = conv3d1(base.to_variable(images)) + dy_ret2 = conv3d2(base.to_variable(images)) + self.assertFalse( + np.array_equal(dy_ret1.numpy(), dy_ret2.numpy())) + + conv3d1_weight_np = conv3d1.weight.numpy() + conv3d1_bias = conv3d1.bias + self.assertFalse( + np.array_equal(conv3d1_weight_np, conv3d2.weight.numpy())) + conv3d2.weight.set_value(conv3d1_weight_np) + self.assertTrue( + np.array_equal(conv3d1_weight_np, conv3d2.weight.numpy())) + conv3d1.bias.set_value(conv3d1_bias) + dy_ret1 = conv3d1(base.to_variable(images)) + dy_ret2 = conv3d2(base.to_variable(images)) + self.assertTrue( + np.array_equal(dy_ret1.numpy(), dy_ret2.numpy())) + + conv3d2.weight = conv3d1.weight + conv3d2.bias = conv3d1.bias + self.assertTrue( + np.array_equal(conv3d1.weight.numpy(), + conv3d2.weight.numpy())) + self.assertTrue( + np.array_equal(conv3d1.bias.numpy(), conv3d2.bias.numpy())) + images = np.ones([2, 3, 6, 6, 6], dtype='float32') custom_weight = np.random.randn(3, 3, 2, 2, 2).astype("float32") weight_attr = fluid.ParamAttr( @@ -1614,6 +2302,20 @@ class TestLayer(LayerTest): stack_rlt2 = np.stack(array_rlt2, axis=0) with self.dynamic_graph(): + with _test_eager_guard(): + eager_eye_tensor = layers.eye(num_rows=3, num_columns=2) + eager_eye_tensor_rlt1 = layers.eye(num_rows=3, + num_columns=2, + batch_shape=[3]) + eager_eye_tensor_rlt2 = layers.eye(num_rows=3, + num_columns=2, + batch_shape=[4, 3]) + eager_diag_tensor = layers.eye(20) + eager_eye_tensor_value = eager_eye_tensor.numpy() + eager_eye_tensor_rlt1_value = eager_eye_tensor_rlt1.numpy() + eager_eye_tensor_rlt2_value = eager_eye_tensor_rlt2.numpy() + eager_diag_tensor_value = eager_diag_tensor.numpy() + eye_tensor = layers.eye(num_rows=3, num_columns=2) eye_tensor_rlt1 = layers.eye(num_rows=3, num_columns=2, @@ -1626,6 +2328,12 @@ class TestLayer(LayerTest): eye_tensor_rlt1_value = eye_tensor_rlt1.numpy() eye_tensor_rlt2_value = eye_tensor_rlt2.numpy() diag_tensor_value = diag_tensor.numpy() + + self.assertTrue(np.allclose(eager_eye_tensor_value, np_eye)) + self.assertTrue(np.allclose(eager_eye_tensor_rlt1_value, stack_rlt1)) + self.assertTrue(np.allclose(eager_eye_tensor_rlt2_value, stack_rlt2)) + self.assertTrue(np.allclose(eager_diag_tensor_value, np.eye(20))) + self.assertTrue(np.allclose(eye_tensor_value, np_eye)) self.assertTrue(np.allclose(eye_tensor_rlt1_value, stack_rlt1)) self.assertTrue(np.allclose(eye_tensor_rlt2_value, stack_rlt2)) @@ -1655,6 +2363,7 @@ class 
TestLayer(LayerTest): static_ret = self.get_static_graph_result(feed={}, fetch_list=out) with self.dynamic_graph(): + # TODO(wuweilong): Add with _test_eager_guard(): i = layers.fill_constant(shape=[1], dtype='int64', value=0) ten = layers.fill_constant(shape=[1], dtype='int64', value=10) @@ -1687,6 +2396,14 @@ class TestLayer(LayerTest): feed={"a": value_a, "b": value_b}, fetch_list=[cond])[0] with self.dynamic_graph(): + with _test_eager_guard(): + da = base.to_variable(value_a) + db = base.to_variable(value_b) + dcond = layers.less_than(x=da, y=db) + + for i in range(len(static_ret)): + self.assertTrue(dcond.numpy()[i] == static_ret[i]) + da = base.to_variable(value_a) db = base.to_variable(value_b) dcond = layers.less_than(x=da, y=db) @@ -1703,6 +2420,14 @@ class TestLayer(LayerTest): feed={"a1": value_a, "b1": value_b}, fetch_list=[cond1])[0] with self.dynamic_graph(): + with _test_eager_guard(): + da1 = base.to_variable(value_a) + db1 = base.to_variable(value_b) + dcond1 = layers.less_equal(x=da1, y=db1) + + for i in range(len(static_ret1)): + self.assertTrue(dcond1.numpy()[i] == static_ret1[i]) + da1 = base.to_variable(value_a) db1 = base.to_variable(value_b) dcond1 = layers.less_equal(x=da1, y=db1) @@ -1719,6 +2444,14 @@ class TestLayer(LayerTest): feed={"a2": value_a, "b2": value_b}, fetch_list=[cond2])[0] with self.dynamic_graph(): + with _test_eager_guard(): + da2 = base.to_variable(value_a) + db2 = base.to_variable(value_b) + dcond2 = layers.greater_than(x=da2, y=db2) + + for i in range(len(static_ret2)): + self.assertTrue(dcond2.numpy()[i] == static_ret2[i]) + da2 = base.to_variable(value_a) db2 = base.to_variable(value_b) dcond2 = layers.greater_than(x=da2, y=db2) @@ -1735,6 +2468,14 @@ class TestLayer(LayerTest): feed={"a3": value_a, "b3": value_b}, fetch_list=[cond3])[0] with self.dynamic_graph(): + with _test_eager_guard(): + da3 = base.to_variable(value_a) + db3 = base.to_variable(value_b) + dcond3 = layers.greater_equal(x=da3, y=db3) + + for i in range(len(static_ret3)): + self.assertTrue(dcond3.numpy()[i] == static_ret3[i]) + da3 = base.to_variable(value_a) db3 = base.to_variable(value_b) dcond3 = layers.greater_equal(x=da3, y=db3) @@ -1751,6 +2492,14 @@ class TestLayer(LayerTest): feed={"a4": value_a, "b4": value_b}, fetch_list=[cond4])[0] with self.dynamic_graph(): + with _test_eager_guard(): + da4 = base.to_variable(value_a) + db4 = base.to_variable(value_b) + dcond4 = layers.equal(x=da4, y=db4) + + for i in range(len(static_ret4)): + self.assertTrue(dcond4.numpy()[i] == static_ret4[i]) + da4 = base.to_variable(value_a) db4 = base.to_variable(value_b) dcond4 = layers.equal(x=da4, y=db4) @@ -1767,6 +2516,14 @@ class TestLayer(LayerTest): feed={"a5": value_a, "b5": value_b}, fetch_list=[cond5])[0] with self.dynamic_graph(): + with _test_eager_guard(): + da5 = base.to_variable(value_a) + db5 = base.to_variable(value_b) + dcond5 = layers.equal(x=da5, y=db5) + + for i in range(len(static_ret5)): + self.assertTrue(dcond5.numpy()[i] == static_ret5[i]) + da5 = base.to_variable(value_a) db5 = base.to_variable(value_b) dcond5 = layers.equal(x=da5, y=db5) @@ -1795,6 +2552,23 @@ class TestLayer(LayerTest): static_res = ret[0] with self.dynamic_graph(): + with _test_eager_guard(): + a = fluid.dygraph.to_variable(np.array([0.1]).astype('float32')) + b = fluid.dygraph.to_variable( + np.array([0.23]).astype('float32')) + out = layers.cond(a < b, lambda: less_than_branch(a, b), + lambda: greater_equal_branch(a, b)) + out2 = layers.cond(a >= b, lambda: greater_equal_branch(a, b), + 
lambda: less_than_branch(a, b)) + eager_dynamic_res = out.numpy() + eager_dynamic_res2 = out2.numpy() + self.assertTrue( + np.array_equal(eager_dynamic_res, eager_dynamic_res2)) + with self.assertRaises(TypeError): + layers.cond(a < b, 'str', 'str') + with self.assertRaises(TypeError): + layers.cond(a >= b, 'str', 'str') + a = fluid.dygraph.to_variable(np.array([0.1]).astype('float32')) b = fluid.dygraph.to_variable(np.array([0.23]).astype('float32')) out = layers.cond(a < b, lambda: less_than_branch(a, b), @@ -1810,6 +2584,7 @@ class TestLayer(LayerTest): layers.cond(a >= b, 'str', 'str') self.assertTrue(np.array_equal(static_res, dynamic_res)) + self.assertTrue(np.array_equal(static_res, eager_dynamic_res)) def test_case(self): def fn_1(): @@ -1840,6 +2615,23 @@ class TestLayer(LayerTest): static_res1, static_res2 = exe.run(fetch_list=[out_1, out_2]) with self.dynamic_graph(): + with _test_eager_guard(): + x = layers.fill_constant(shape=[1], dtype='float32', value=0.3) + y = layers.fill_constant(shape=[1], dtype='float32', value=0.1) + z = layers.fill_constant(shape=[1], dtype='float32', value=0.2) + + pred_1 = layers.less_than(z, x) # true: 0.2 < 0.3 + pred_2 = layers.less_than(x, y) # false: 0.3 < 0.1 + pred_3 = layers.equal(x, y) # false: 0.3 == 0.1 + + out_1 = layers.case( + pred_fn_pairs=[(pred_1, fn_1), (pred_2, fn_2)], + default=fn_3) + out_2 = layers.case( + pred_fn_pairs=[(pred_2, fn_2), (pred_3, fn_3)]) + eager_dynamic_res1 = out_1.numpy() + eager_dynamic_res2 = out_2.numpy() + x = layers.fill_constant(shape=[1], dtype='float32', value=0.3) y = layers.fill_constant(shape=[1], dtype='float32', value=0.1) z = layers.fill_constant(shape=[1], dtype='float32', value=0.2) @@ -1856,6 +2648,8 @@ class TestLayer(LayerTest): self.assertTrue(np.array_equal(static_res1, dynamic_res1)) self.assertTrue(np.array_equal(static_res2, dynamic_res2)) + self.assertTrue(np.array_equal(static_res1, eager_dynamic_res1)) + self.assertTrue(np.array_equal(static_res2, eager_dynamic_res2)) def test_switch_case(self): def fn_1(): @@ -1891,6 +2685,29 @@ class TestLayer(LayerTest): fetch_list=[out_1, out_2, out_3]) with self.dynamic_graph(): + with _test_eager_guard(): + index_1 = layers.fill_constant( + shape=[1], dtype='int32', value=1) + index_2 = layers.fill_constant( + shape=[1], dtype='int32', value=2) + + out_1 = layers.switch_case( + branch_index=index_1, + branch_fns={1: fn_1, + 2: fn_2}, + default=fn_3) + out_2 = layers.switch_case( + branch_index=index_2, + branch_fns=[(1, fn_1), (2, fn_2)], + default=fn_3) + out_3 = layers.switch_case( + branch_index=index_2, + branch_fns=[(0, fn_1), (4, fn_2), (7, fn_3)]) + + eager_dynamic_res1 = out_1.numpy() + eager_dynamic_res2 = out_2.numpy() + eager_dynamic_res3 = out_3.numpy() + index_1 = layers.fill_constant(shape=[1], dtype='int32', value=1) index_2 = layers.fill_constant(shape=[1], dtype='int32', value=2) @@ -1914,6 +2731,9 @@ class TestLayer(LayerTest): self.assertTrue(np.array_equal(static_res1, dynamic_res1)) self.assertTrue(np.array_equal(static_res2, dynamic_res2)) self.assertTrue(np.array_equal(static_res3, dynamic_res3)) + self.assertTrue(np.array_equal(static_res1, eager_dynamic_res1)) + self.assertTrue(np.array_equal(static_res2, eager_dynamic_res2)) + self.assertTrue(np.array_equal(static_res3, eager_dynamic_res3)) def test_crop_tensor(self): with self.static_graph(): @@ -3281,6 +4101,14 @@ class TestBook(LayerTest): fetch_list=[output])[0] with self.dynamic_graph(): + with _test_eager_guard(): + x_dy = base.to_variable(x_np) + rois_dy = 
base.to_variable(rois_np) + rois_num_dy = base.to_variable(rois_num_np) + dy_eager_res = layers.roi_pool( + x_dy, rois_dy, 4, 4, 0.5, rois_num=rois_num_dy) + dy_eager_res_value = dy_eager_res[0].numpy() + x_dy = base.to_variable(x_np) rois_dy = base.to_variable(rois_np) rois_num_dy = base.to_variable(rois_num_np) @@ -3288,6 +4116,7 @@ class TestBook(LayerTest): x_dy, rois_dy, 4, 4, 0.5, rois_num=rois_num_dy) dy_res_value = dy_res[0].numpy() self.assertTrue(np.array_equal(static_res, dy_res_value)) + self.assertTrue(np.array_equal(static_res, dy_eager_res_value)) def test_sequence_enumerate(self): # TODO(minqiyang): dygraph do not support lod now @@ -3312,12 +4141,21 @@ class TestBook(LayerTest): fetch_list=[output])[0] with self.dynamic_graph(): + with _test_eager_guard(): + x_dy = base.to_variable(x_np) + rois_dy = base.to_variable(rois_np) + rois_num_dy = base.to_variable(rois_num_np) + dy_eager_res = layers.roi_align( + x_dy, rois_dy, 4, 4, 0.5, 2, rois_num=rois_num_dy) + dy_eager_res_value = dy_eager_res.numpy() + x_dy = base.to_variable(x_np) rois_dy = base.to_variable(rois_np) rois_num_dy = base.to_variable(rois_num_np) dy_res = layers.roi_align( x_dy, rois_dy, 4, 4, 0.5, 2, rois_num=rois_num_dy) dy_res_value = dy_res.numpy() + self.assertTrue(np.array_equal(static_res, dy_eager_res_value)) self.assertTrue(np.array_equal(static_res, dy_res_value)) def test_dice_loss(self): @@ -3338,11 +4176,18 @@ class TestBook(LayerTest): fetch_list=[output])[0] with self.dynamic_graph(): + with _test_eager_guard(): + input_ = base.to_variable(input_np) + label_ = base.to_variable(label_np) + dy_eager_res = layers.dice_loss(input_, label_, eps) + dy_eager_res_value = dy_eager_res.numpy() + input_ = base.to_variable(input_np) label_ = base.to_variable(label_np) dy_res = layers.dice_loss(input_, label_, eps) dy_res_value = dy_res.numpy() self.assertTrue(np.array_equal(static_res, dy_res_value)) + self.assertTrue(np.array_equal(static_res, dy_eager_res_value)) def test_roi_perspective_transform(self): # TODO(minqiyang): dygraph do not support lod now diff --git a/python/paddle/fluid/tests/unittests/test_linalg_cond.py b/python/paddle/fluid/tests/unittests/test_linalg_cond.py index d13bdd676b48e38844e78469a2c36156b272f5e4..9e3edd82681bca1c7f29046a7761543ca7550d50 100644 --- a/python/paddle/fluid/tests/unittests/test_linalg_cond.py +++ b/python/paddle/fluid/tests/unittests/test_linalg_cond.py @@ -47,6 +47,7 @@ def test_dygraph_assert_true(self, x_list, p_list): def gen_input(): + np.random.seed(2021) # generate square matrix or batches of square matrices input_1 = np.random.rand(5, 5).astype('float32') input_2 = np.random.rand(3, 6, 6).astype('float64') diff --git a/python/paddle/fluid/tests/unittests/test_linalg_pinv_op.py b/python/paddle/fluid/tests/unittests/test_linalg_pinv_op.py index 7ff6ebc0437b4c6b2e34492e91289bc11646a9ad..8d0a34009d6e589ec6cd14700faa869d63da31b2 100644 --- a/python/paddle/fluid/tests/unittests/test_linalg_pinv_op.py +++ b/python/paddle/fluid/tests/unittests/test_linalg_pinv_op.py @@ -36,6 +36,7 @@ class LinalgPinvTestCase(unittest.TestCase): def generate_input(self): self._input_shape = (5, 5) + np.random.seed(123) self._input_data = np.random.random(self._input_shape).astype( self.dtype) @@ -102,6 +103,7 @@ class LinalgPinvTestCase(unittest.TestCase): class LinalgPinvTestCase1(LinalgPinvTestCase): def generate_input(self): self._input_shape = (4, 5) + np.random.seed(123) self._input_data = np.random.random(self._input_shape).astype( self.dtype) @@ -109,6 +111,7 @@ class 
LinalgPinvTestCase1(LinalgPinvTestCase): class LinalgPinvTestCase2(LinalgPinvTestCase): def generate_input(self): self._input_shape = (5, 4) + np.random.seed(123) self._input_data = np.random.random(self._input_shape).astype( self.dtype) @@ -116,6 +119,7 @@ class LinalgPinvTestCase2(LinalgPinvTestCase): class LinalgPinvTestCaseBatch1(LinalgPinvTestCase): def generate_input(self): self._input_shape = (3, 5, 5) + np.random.seed(123) self._input_data = np.random.random(self._input_shape).astype( self.dtype) @@ -123,6 +127,7 @@ class LinalgPinvTestCaseBatch1(LinalgPinvTestCase): class LinalgPinvTestCaseBatch2(LinalgPinvTestCase): def generate_input(self): self._input_shape = (3, 4, 5) + np.random.seed(123) self._input_data = np.random.random(self._input_shape).astype( self.dtype) @@ -130,6 +135,7 @@ class LinalgPinvTestCaseBatch2(LinalgPinvTestCase): class LinalgPinvTestCaseBatch3(LinalgPinvTestCase): def generate_input(self): self._input_shape = (3, 5, 4) + np.random.seed(123) self._input_data = np.random.random(self._input_shape).astype( self.dtype) @@ -137,6 +143,7 @@ class LinalgPinvTestCaseBatch3(LinalgPinvTestCase): class LinalgPinvTestCaseBatch4(LinalgPinvTestCase): def generate_input(self): self._input_shape = (3, 6, 5, 4) + np.random.seed(123) self._input_data = np.random.random(self._input_shape).astype( self.dtype) @@ -144,6 +151,7 @@ class LinalgPinvTestCaseBatch4(LinalgPinvTestCase): class LinalgPinvTestCaseBatchBig(LinalgPinvTestCase): def generate_input(self): self._input_shape = (2, 200, 300) + np.random.seed(123) self._input_data = np.random.random(self._input_shape).astype( self.dtype) @@ -151,6 +159,7 @@ class LinalgPinvTestCaseBatchBig(LinalgPinvTestCase): class LinalgPinvTestCaseFP32(LinalgPinvTestCase): def generate_input(self): self._input_shape = (3, 5, 5) + np.random.seed(123) self._input_data = np.random.random(self._input_shape).astype( self.dtype) @@ -163,6 +172,7 @@ class LinalgPinvTestCaseFP32(LinalgPinvTestCase): class LinalgPinvTestCaseRcond(LinalgPinvTestCase): def generate_input(self): self._input_shape = (3, 5, 5) + np.random.seed(123) self._input_data = np.random.random(self._input_shape).astype( self.dtype) @@ -175,6 +185,7 @@ class LinalgPinvTestCaseRcond(LinalgPinvTestCase): class LinalgPinvTestCaseHermitian1(LinalgPinvTestCase): def generate_input(self): self._input_shape = (5, 5) + np.random.seed(123) x = np.random.random(self._input_shape).astype(self.dtype) + \ 1J * np.random.random(self._input_shape).astype(self.dtype) self._input_data = x + x.transpose().conj() @@ -188,6 +199,7 @@ class LinalgPinvTestCaseHermitian1(LinalgPinvTestCase): class LinalgPinvTestCaseHermitian2(LinalgPinvTestCase): def generate_input(self): self._input_shape = (3, 5, 5) + np.random.seed(123) x = np.random.random(self._input_shape).astype(self.dtype) + \ 1J * np.random.random(self._input_shape).astype(self.dtype) self._input_data = x + x.transpose((0, 2, 1)).conj() @@ -201,6 +213,7 @@ class LinalgPinvTestCaseHermitian2(LinalgPinvTestCase): class LinalgPinvTestCaseHermitian3(LinalgPinvTestCase): def generate_input(self): self._input_shape = (3, 5, 5) + np.random.seed(123) x = np.random.random(self._input_shape).astype(self.dtype) + \ 1J * np.random.random(self._input_shape).astype(self.dtype) self._input_data = x + x.transpose((0, 2, 1)).conj() @@ -214,6 +227,7 @@ class LinalgPinvTestCaseHermitian3(LinalgPinvTestCase): class LinalgPinvTestCaseHermitian4(LinalgPinvTestCase): def generate_input(self): self._input_shape = (5, 5) + np.random.seed(123) x = 
np.random.random(self._input_shape).astype(self.dtype) self._input_data = x + x.transpose() @@ -226,6 +240,7 @@ class LinalgPinvTestCaseHermitian4(LinalgPinvTestCase): class LinalgPinvTestCaseHermitian5(LinalgPinvTestCase): def generate_input(self): self._input_shape = (3, 5, 5) + np.random.seed(123) x = np.random.random(self._input_shape).astype(self.dtype) self._input_data = x + x.transpose((0, 2, 1)) @@ -238,6 +253,7 @@ class LinalgPinvTestCaseHermitian5(LinalgPinvTestCase): class LinalgPinvTestCaseHermitianFP32(LinalgPinvTestCase): def generate_input(self): self._input_shape = (3, 5, 5) + np.random.seed(123) x = np.random.random(self._input_shape).astype(self.dtype) self._input_data = x + x.transpose((0, 2, 1)) diff --git a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py index efcc0e4cfe323294df88167a6100f019cef67005..ed1495c6352bb979058d1dca015171f013fd38d9 100644 --- a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py @@ -16,7 +16,8 @@ from __future__ import print_function import unittest import numpy as np -from op_test import OpTest +from op_test import OpTest, convert_float_to_uint16, get_numeric_gradient +from paddle.fluid.tests.unittests.testsuite import create_op import paddle.fluid.core as core import paddle @@ -73,17 +74,32 @@ class TestMatMulV2Op(OpTest): self.init_kernel_type() self.config() self.op_type = "matmul_v2" - x = np.random.random(self.x_shape).astype(self.dtype) - y = np.random.random(self.y_shape).astype(self.dtype) - # -0.1 ~ 0.1 - x = -0.1 + 0.2 * x - y = -0.1 + 0.2 * y + if self.is_bfloat16_op(): + x = np.random.random(self.x_shape).astype(np.float32) + y = np.random.random(self.y_shape).astype(np.float32) + else: + x = np.random.random(self.x_shape).astype(self.dtype) + y = np.random.random(self.y_shape).astype(self.dtype) + # -0.1 ~ 0.1 + x = -0.1 + 0.2 * x + y = -0.1 + 0.2 * y result = reference_matmul(x, y, self.trans_x, self.trans_y) - result = result.astype(self.dtype) - self.inputs = { - 'X': x, - 'Y': y, - } + if self.is_bfloat16_op(): + result = result.astype(np.float32) + self.inputs = { + 'X': convert_float_to_uint16(x), + 'Y': convert_float_to_uint16(y), + } + self.inputs_fp32 = { + 'X': x, + 'Y': y, + } + else: + result = result.astype(self.dtype) + self.inputs = { + 'X': x, + 'Y': y, + } self.attrs = {'trans_x': self.trans_x, 'trans_y': self.trans_y} self.outputs = {'Out': result} @@ -97,7 +113,7 @@ class TestMatMulV2Op(OpTest): self.check_grad(['X', 'Y'], 'Out') -class TestMatMuklOp2(TestMatMulV2Op): +class TestMatMulOp2(TestMatMulV2Op): """ case 2 """ @@ -109,7 +125,7 @@ class TestMatMuklOp2(TestMatMulV2Op): self.trans_y = True -class TestMatMuklOp3(TestMatMulV2Op): +class TestMatMulOp3(TestMatMulV2Op): """ case 3 """ @@ -121,7 +137,7 @@ class TestMatMuklOp3(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp4(TestMatMulV2Op): +class TestMatMulOp4(TestMatMulV2Op): """ case 4 """ @@ -133,7 +149,7 @@ class TestMatMuklOp4(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp5(TestMatMulV2Op): +class TestMatMulOp5(TestMatMulV2Op): """ case 5 """ @@ -145,7 +161,7 @@ class TestMatMuklOp5(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp6(TestMatMulV2Op): +class TestMatMulOp6(TestMatMulV2Op): """ case 6 """ @@ -157,7 +173,7 @@ class TestMatMuklOp6(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp7(TestMatMulV2Op): +class TestMatMulOp7(TestMatMulV2Op): """ case 7 """ @@ -169,7 +185,7 @@ 
class TestMatMuklOp7(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp8(TestMatMulV2Op): +class TestMatMulOp8(TestMatMulV2Op): """ case 8 """ @@ -181,7 +197,7 @@ class TestMatMuklOp8(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp9(TestMatMulV2Op): +class TestMatMulOp9(TestMatMulV2Op): """ case 9 """ @@ -193,7 +209,7 @@ class TestMatMuklOp9(TestMatMulV2Op): self.trans_y = True -class TestMatMuklOp10(TestMatMulV2Op): +class TestMatMulOp10(TestMatMulV2Op): """ case 10 """ @@ -205,7 +221,7 @@ class TestMatMuklOp10(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp11(TestMatMulV2Op): +class TestMatMulOp11(TestMatMulV2Op): """ case 11 """ @@ -217,7 +233,7 @@ class TestMatMuklOp11(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp12(TestMatMulV2Op): +class TestMatMulOp12(TestMatMulV2Op): """ case 12 """ @@ -229,7 +245,7 @@ class TestMatMuklOp12(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp13(TestMatMulV2Op): +class TestMatMulOp13(TestMatMulV2Op): """ case 13 """ @@ -241,7 +257,7 @@ class TestMatMuklOp13(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp14(TestMatMulV2Op): +class TestMatMulOp14(TestMatMulV2Op): """ case 14_1 """ @@ -253,7 +269,7 @@ class TestMatMuklOp14(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp15(TestMatMulV2Op): +class TestMatMulOp15(TestMatMulV2Op): """ case 14_2 """ @@ -265,7 +281,7 @@ class TestMatMuklOp15(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp16(TestMatMulV2Op): +class TestMatMulOp16(TestMatMulV2Op): """ case 16 : to check the gradient for special case """ @@ -277,7 +293,7 @@ class TestMatMuklOp16(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp17(TestMatMulV2Op): +class TestMatMulOp17(TestMatMulV2Op): """ case 17 : to check the gradient for special case """ @@ -289,7 +305,7 @@ class TestMatMuklOp17(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOpBroadcast1(TestMatMulV2Op): +class TestMatMulOpBroadcast1(TestMatMulV2Op): """ case 14_3 """ @@ -301,7 +317,7 @@ class TestMatMuklOpBroadcast1(TestMatMulV2Op): self.trans_y = True -class TestMatMuklOpBroadcast2(TestMatMulV2Op): +class TestMatMulOpBroadcast2(TestMatMulV2Op): """ case 14_4 """ @@ -343,22 +359,90 @@ def create_test_fp16_class(parent, atol=0.001, max_relative_error=1.0): create_test_fp16_class(TestMatMulV2Op) -create_test_fp16_class(TestMatMuklOp2) -create_test_fp16_class(TestMatMuklOp3) -create_test_fp16_class(TestMatMuklOp4) -create_test_fp16_class(TestMatMuklOp5) -create_test_fp16_class(TestMatMuklOp6) -create_test_fp16_class(TestMatMuklOp7) -create_test_fp16_class(TestMatMuklOp8) -create_test_fp16_class(TestMatMuklOp9) -create_test_fp16_class(TestMatMuklOp10) -create_test_fp16_class(TestMatMuklOp11) -create_test_fp16_class(TestMatMuklOp12) -create_test_fp16_class(TestMatMuklOp13) -create_test_fp16_class(TestMatMuklOp14) -create_test_fp16_class(TestMatMuklOp15) -create_test_fp16_class(TestMatMuklOp16) -create_test_fp16_class(TestMatMuklOp17) +create_test_fp16_class(TestMatMulOp2) +create_test_fp16_class(TestMatMulOp3) +create_test_fp16_class(TestMatMulOp4) +create_test_fp16_class(TestMatMulOp5) +create_test_fp16_class(TestMatMulOp6) +create_test_fp16_class(TestMatMulOp7) +create_test_fp16_class(TestMatMulOp8) +create_test_fp16_class(TestMatMulOp9) +create_test_fp16_class(TestMatMulOp10) +create_test_fp16_class(TestMatMulOp11) +create_test_fp16_class(TestMatMulOp12) +create_test_fp16_class(TestMatMulOp13) +create_test_fp16_class(TestMatMulOp14) +create_test_fp16_class(TestMatMulOp15) 
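# --- Illustrative aside, not part of the patch ---------------------------------
# The bf16 branch added to TestMatMulV2Op.setUp above feeds the operator uint16
# data produced by convert_float_to_uint16 while keeping the raw float32 arrays
# in self.inputs_fp32 for the numeric-gradient checks used by the bf16 cases
# registered below. A minimal, hypothetical sketch of how such a
# float32 -> bfloat16-bits conversion is commonly done (plain truncation of the
# IEEE float32 bit pattern; the real helper may round instead of truncate):
import numpy as np

def float32_to_bfloat16_bits(x):
    x = np.ascontiguousarray(x, dtype=np.float32)
    # bfloat16 keeps the sign, exponent and top 7 mantissa bits of float32,
    # i.e. the upper 16 bits of its 32-bit representation
    return (x.view(np.uint32) >> 16).astype(np.uint16)
# --------------------------------------------------------------------------------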
+create_test_fp16_class(TestMatMulOp16) +create_test_fp16_class(TestMatMulOp17) + +#--------------------test matmul bf16-------------------- + + +def create_test_bf16_class(parent, atol=0.01): + @unittest.skipIf( + not core.is_compiled_with_cuda() or core.cudnn_version() < 8100, + "core is not compiled with CUDA and cudnn version need larger than 8.1.0" + ) + class TestMatMulOpBf16Case(parent): + def get_numeric_grad(self, place, check_name): + scope = core.Scope() + self._check_grad_helper() + op = create_op(scope, self.op_type, self.inputs, self.outputs, + self.attrs) + return get_numeric_gradient(place, scope, op, self.inputs_fp32, + check_name, ['Out']) + + def init_kernel_type(self): + self.dtype = np.uint16 + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=atol) + + def test_check_grad_x(self): + place = core.CUDAPlace(0) + numeric_grads = self.get_numeric_grad(place, 'X') + self.check_grad_with_place( + place, ['X'], + 'Out', + no_grad_set=set(['Y']), + user_defined_grads=[numeric_grads]) + + def test_check_grad_y(self): + place = core.CUDAPlace(0) + numeric_grads = self.get_numeric_grad(place, 'Y') + self.check_grad_with_place( + place, ['Y'], + 'Out', + no_grad_set=set(['X']), + user_defined_grads=[numeric_grads]) + + def test_check_grad(self): + pass + + cls_name = "{0}_{1}".format(parent.__name__, "Bf16") + TestMatMulOpBf16Case.__name__ = cls_name + globals()[cls_name] = TestMatMulOpBf16Case + + +create_test_bf16_class(TestMatMulV2Op) +create_test_bf16_class(TestMatMulOp2) +create_test_bf16_class(TestMatMulOp3) +create_test_bf16_class(TestMatMulOp4) +create_test_bf16_class(TestMatMulOp5) +create_test_bf16_class(TestMatMulOp6) +create_test_bf16_class(TestMatMulOp7) +create_test_bf16_class(TestMatMulOp8) +create_test_bf16_class(TestMatMulOp9) +create_test_bf16_class(TestMatMulOp10) +create_test_bf16_class(TestMatMulOp11) +create_test_bf16_class(TestMatMulOp12) +create_test_bf16_class(TestMatMulOp13) +create_test_bf16_class(TestMatMulOp14) +create_test_bf16_class(TestMatMulOp15) +create_test_bf16_class(TestMatMulOp16) +create_test_bf16_class(TestMatMulOp17) class TestMatMulV2API(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py b/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py index 4eae44846efc701d90a5a4ad03c6e0e29dad77c7..1ffcb3442812dcf5a6d6357e1b87dfdfb6d6e839 100644 --- a/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py +++ b/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py @@ -83,8 +83,8 @@ type_dict_str_to_numpy = { } xpu_test_op_white_list = [] -xpu_test_type_white_list = [] -xpu_test_op_type_white_list = ['float64'] +xpu_test_type_white_list = ['float64'] +xpu_test_op_type_white_list = [] xpu_test_device_op_white_list = [] xpu_test_device_op_type_white_list = [] @@ -186,7 +186,7 @@ def get_xpu_op_support_types(op_name, dev_id=0): paddle.bfloat16]) else: support_type_str_list.append(type_dict_paddle_to_str[stype]) - type_white_list = get_op_type_white_list() + type_white_list = get_type_white_list() return [ stype for stype in support_type_str_list if stype not in type_white_list ] diff --git a/python/paddle/fluid/tests/unittests/xpu/test_argsort_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_argsort_op_xpu.py index 5c77d6304302c982b11a1710c000dc5570e33f23..4290c0abf122ada6c8611a7738cc8ff108506fa4 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_argsort_op_xpu.py +++ 
b/python/paddle/fluid/tests/unittests/xpu/test_argsort_op_xpu.py @@ -18,220 +18,79 @@ import numpy as np import unittest import sys sys.path.append("..") + +import paddle from op_test import OpTest from op_test_xpu import XPUOpTest -import paddle -import paddle.fluid as fluid -import paddle.fluid.core as core - -from paddle.fluid import ParamAttr -from paddle.fluid.framework import Program, grad_var_name -from paddle.fluid.executor import Executor -from paddle.fluid.backward import append_backward +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper paddle.enable_static() -class TestArgsortOp(XPUOpTest): - def setUp(self): - self.set_xpu() - self.op_type = "argsort" - self.place = paddle.XPUPlace(0) - self.init_dtype() - self.init_inputshape() - self.init_axis() - self.init_direction() - - self.x = np.random.random(self.input_shape).astype(self.dtype) - self.inputs = {"X": self.x} - self.attrs = {"axis": self.axis, "descending": self.descending} - self.get_output() - self.outputs = {"Out": self.sorted_x, "Indices": self.indices} - - def get_output(self): - if self.descending: - self.indices = np.flip( - np.argsort( - self.x, kind='heapsort', axis=self.axis), self.axis) - self.sorted_x = np.flip( - np.sort( - self.x, kind='heapsort', axis=self.axis), self.axis) - else: - self.indices = np.argsort(self.x, kind='heapsort', axis=self.axis) - self.sorted_x = np.sort(self.x, kind='heapsort', axis=self.axis) - - def set_xpu(self): - self.__class__.use_xpu = True - self.__class__.no_need_check_grad = True - - def init_inputshape(self): - self.input_shape = (2, 2, 2, 3, 3) - - def init_dtype(self): - self.dtype = 'float32' - - def init_axis(self): - self.axis = -1 - - def test_check_output(self): - self.check_output_with_place(self.place) - - def init_direction(self): - self.descending = False - - -class TestArgsortOpAxis0XPU(TestArgsortOp): - def init_axis(self): - self.axis = 0 - - -class TestArgsortOpAxis1XPU(TestArgsortOp): - def init_axis(self): - self.axis = 1 - - -class TestArgsortOpAxis2XPU(TestArgsortOp): - def init_axis(self): - self.axis = 2 - - -class TestArgsortOpAxisNeg1XPU(TestArgsortOp): - def init_axis(self): - self.axis = -1 - - -class TestArgsortOpAxisNeg2XPU(TestArgsortOp): - def init_axis(self): - self.axis = -2 - - -class TestArgsortOpDescendingAxisXPU(TestArgsortOp): - def init_direction(self): - self.descending = True - - -class TestArgsortOpDescendingAxis0XPU(TestArgsortOpAxis0XPU): - def init_direction(self): - self.descending = True - - -class TestArgsortOpDescendingAxis1XPU(TestArgsortOpAxis1XPU): - def init_direction(self): - self.descending = True - - -class TestArgsortOpDescendingAxis2XPU(TestArgsortOpAxis2XPU): - def init_direction(self): - self.descending = True - - -class TestArgsortOpDescendingAxisNeg1XPU(TestArgsortOpAxisNeg1XPU): - def init_direction(self): - self.descending = True - - -class TestArgsortOpDescendingAxisNeg2XPU(TestArgsortOpAxisNeg2XPU): - def init_direction(self): - self.descending = True - - -class TestArgsortOpAxis0XPUINT64(TestArgsortOp): - def setUp(self): - self.set_xpu() - self.op_type = "argsort" - self.place = paddle.XPUPlace(0) - self.init_dtype() - self.init_inputshape() - self.init_axis() - self.init_direction() - - self.x = np.random.randint( - low=-1000, high=1000, size=self.input_shape).astype(self.dtype) - self.inputs = {"X": self.x} - self.attrs = {"axis": self.axis, "descending": self.descending} - self.get_output() - self.outputs = {"Out": self.sorted_x, "Indices": self.indices} - - 
def init_axis(self): - self.axis = 0 - - def init_dtype(self): - self.dtype = 'int64' - - -class TestArgsortOpAxis1XPUINT64(TestArgsortOpAxis0XPUINT64): - def init_axis(self): - self.axis = 1 - - -class TestArgsortOpAxis2XPUINT64(TestArgsortOpAxis0XPUINT64): - def init_axis(self): - self.axis = 2 - - -class TestArgsortOpAxisNeg1XPUINT64(TestArgsortOpAxis0XPUINT64): - def init_axis(self): - self.axis = -1 - - -class TestArgsortOpAxisNeg2XPUINT64(TestArgsortOpAxis0XPUINT64): - def init_axis(self): - self.axis = -2 - - -class TestArgsortOpDescendingAxisXPUINT64(TestArgsortOpAxis0XPUINT64): - def init_direction(self): - self.descending = True - - -class TestArgsortOpDescendingAxis0XPUINT64(TestArgsortOpAxis0XPUINT64): - def init_direction(self): - self.descending = True - - -class TestArgsortOpDescendingAxis1XPUINT64(TestArgsortOpAxis1XPUINT64): - def init_direction(self): - self.descending = True - - -class TestArgsortOpDescendingAxis2XPUINT64(TestArgsortOpAxis2XPUINT64): - def init_direction(self): - self.descending = True - - -class TestArgsortOpDescendingAxisNeg1XPUINT64(TestArgsortOpAxisNeg1XPUINT64): - def init_direction(self): - self.descending = True - - -class TestArgsortOpDescendingAxisNeg2XPUINT64(TestArgsortOpAxisNeg2XPUINT64): - def init_direction(self): - self.descending = True - - -class TestArgsortOpAxis0XPUINT(TestArgsortOp): - def setUp(self): - self.set_xpu() - self.op_type = "argsort" - self.place = paddle.XPUPlace(0) - self.init_dtype() - self.init_inputshape() - self.init_axis() - self.init_direction() - - self.x = np.random.randint( - low=-1000, high=1000, size=self.input_shape).astype(self.dtype) - self.inputs = {"X": self.x} - self.attrs = {"axis": self.axis, "descending": self.descending} - self.get_output() - self.outputs = {"Out": self.sorted_x, "Indices": self.indices} - - def init_axis(self): - self.axis = 0 - - def init_dtype(self): - self.dtype = 'int' - +class XPUTestArgsortOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'argsort' + self.use_dynamic_create_class = True + + def dynamic_create_class(self): + base_class = self.TestArgsortOp + classes = [] + for descending in [True, False]: + for axis in [0, 1, 2, -1, -2]: + class_name = 'XPUTestArgsortOp_axis_' + str(axis) + '_' + str( + descending) + attr_dict = {'init_axis': axis, 'init_descending': descending} + classes.append([class_name, attr_dict]) + return base_class, classes + + class TestArgsortOp(XPUOpTest): + def setUp(self): + self.set_xpu() + self.op_type = "argsort" + self.place = paddle.XPUPlace(0) + self.dtype = self.in_type + self.input_shape = (2, 2, 2, 3, 3) + self.axis = -1 if not hasattr(self, 'init_axis') else self.init_axis + self.descending = False if not hasattr( + self, 'init_descending') else self.init_descending + + if self.dtype == np.float32: + self.x = np.random.random(self.input_shape).astype(self.dtype) + else: + self.x = np.random.randint( + low=-1000, high=1000, + size=self.input_shape).astype(self.dtype) + + self.inputs = {"X": self.x} + self.attrs = {"axis": self.axis, "descending": self.descending} + self.get_output() + self.outputs = {"Out": self.sorted_x, "Indices": self.indices} + + def get_output(self): + if self.descending: + self.indices = np.flip( + np.argsort( + self.x, kind='heapsort', axis=self.axis), + self.axis) + self.sorted_x = np.flip( + np.sort( + self.x, kind='heapsort', axis=self.axis), self.axis) + else: + self.indices = np.argsort( + self.x, kind='heapsort', axis=self.axis) + self.sorted_x = np.sort(self.x, kind='heapsort', axis=self.axis) + + 
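# --- Illustrative aside, not part of the patch ---------------------------------
# XPUTestArgsortOp above sets use_dynamic_create_class = True and has
# dynamic_create_class return a base class plus [class_name, attr_dict] pairs;
# the create_test_class helper imported from get_test_cover_info can then stamp
# out one concrete test class per (axis, descending) combination and per
# supported dtype. A minimal, hypothetical sketch of that stamping idea using
# type():
def stamp_out_test_classes(base_class, classes, namespace):
    for class_name, attr_dict in classes:
        # each generated subclass carries overrides such as init_axis /
        # init_descending, which TestArgsortOp.setUp reads back via hasattr()
        namespace[class_name] = type(class_name, (base_class,), dict(attr_dict))
# --------------------------------------------------------------------------------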
def set_xpu(self): + self.__class__.use_xpu = True + self.__class__.no_need_check_grad = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + +support_types = get_xpu_op_support_types('argsort') +for stype in support_types: + create_test_class(globals(), XPUTestArgsortOp, stype) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_huber_loss_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_huber_loss_op_xpu.py index 0cd98d2daea2c432032da9cb9da0b977dd29ead8..30c91f87a245274f9144be5be35e8965448c2646 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_huber_loss_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_huber_loss_op_xpu.py @@ -18,11 +18,13 @@ import unittest import numpy as np import sys sys.path.append("..") -from op_test import OpTest -from op_test_xpu import XPUOpTest + import paddle import paddle.fluid as fluid -from paddle.fluid import compiler, Program, program_guard + +from op_test import OpTest +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper paddle.enable_static() @@ -35,76 +37,81 @@ def huber_loss_forward(val, delta): return delta * (abs_val - 0.5 * delta) -class TestHuberLossOp(XPUOpTest): - def setUp(self): - self.set_xpu() - self.op_type = 'huber_loss' - self.place = paddle.XPUPlace(0) - - self.init_dtype() - - self.set_inputs() - self.set_attrs() - self.set_outputs() +class XPUTestHuberLossOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'huber_loss' + self.use_dynamic_create_class = False - def set_inputs(self): - shape = self.set_shape() - x = np.random.uniform(0, 1., shape).astype(self.dtype) - y = np.random.uniform(0, 1., shape).astype(self.dtype) - self.inputs = { - 'X': OpTest.np_dtype_to_fluid_dtype(x), - 'Y': OpTest.np_dtype_to_fluid_dtype(y) - } + class TestHuberLossOp(XPUOpTest): + def setUp(self): + self.set_xpu() + self.op_type = 'huber_loss' + self.place = paddle.XPUPlace(0) - def set_attrs(self): - self.attrs = {'delta': 0.5} + self.init_dtype() + self.set_inputs() + self.set_attrs() + self.set_outputs() - def set_outputs(self): - delta = self.attrs['delta'] - shape = self.set_shape() - residual = self.inputs['Y'] - self.inputs['X'] - loss = np.vectorize(huber_loss_forward)(residual, - delta).astype(self.dtype) - self.outputs = {'Residual': residual, 'Out': loss.reshape(shape)} + def set_inputs(self): + shape = self.set_shape() + x = np.random.uniform(0, 1., shape).astype(self.dtype) + y = np.random.uniform(0, 1., shape).astype(self.dtype) + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(x), + 'Y': OpTest.np_dtype_to_fluid_dtype(y) + } - def set_shape(self): - return (100, 1) + def set_attrs(self): + self.attrs = {'delta': 0.5} - def set_xpu(self): - self.__class__.use_xpu = True + def set_outputs(self): + delta = self.attrs['delta'] + shape = self.set_shape() + residual = self.inputs['Y'] - self.inputs['X'] + loss = np.vectorize(huber_loss_forward)(residual, + delta).astype(self.dtype) + self.outputs = {'Residual': residual, 'Out': loss.reshape(shape)} - def init_dtype(self): - self.dtype = np.float32 + def set_shape(self): + return (100, 1) - def test_check_output(self): - self.check_output_with_place(self.place) + def set_xpu(self): + self.__class__.use_xpu = True - def test_check_grad_normal(self): - self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') + def init_dtype(self): + self.dtype = self.in_type - def test_check_grad_ingore_x(self): - 
self.check_grad_with_place( - self.place, ['Y'], 'Out', no_grad_set=set("residual")) + def test_check_output(self): + self.check_output_with_place(self.place) - def test_check_grad_ingore_y(self): - self.check_grad_with_place( - self.place, ['X'], 'Out', no_grad_set=set('residual')) + def test_check_grad_normal(self): + self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') + def test_check_grad_ingore_x(self): + self.check_grad_with_place( + self.place, ['Y'], 'Out', no_grad_set=set("residual")) -def TestHuberLossOp1(TestHuberLossOp): - def set_shape(self): - return (64) + def test_check_grad_ingore_y(self): + self.check_grad_with_place( + self.place, ['X'], 'Out', no_grad_set=set('residual')) + class TestHuberLossOp1(TestHuberLossOp): + def set_shape(self): + return (640) -def TestHuberLossOp2(TestHuberLossOp): - def set_shape(self): - return (6, 6) + class TestHuberLossOp2(TestHuberLossOp): + def set_shape(self): + return (10, 10) + class TestHuberLossOp3(TestHuberLossOp): + def set_shape(self): + return (10, 10, 1) -def TestHuberLossOp3(TestHuberLossOp): - def set_shape(self): - return (6, 6, 1) +support_types = get_xpu_op_support_types('huber_loss') +for stype in support_types: + create_test_class(globals(), XPUTestHuberLossOp, stype) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py index 435026220c2b59a0f8df73f071673dab044e8348..45d60c8538e092f4c5d97f6525870af33a6ad9d5 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py @@ -97,7 +97,7 @@ class TestMatMulV2Op(XPUOpTest): self.check_grad_with_place(place, ['X', 'Y'], 'Out') -class TestMatMuklOp2(TestMatMulV2Op): +class TestMatMulOp2(TestMatMulV2Op): """ case 2 """ @@ -109,7 +109,7 @@ class TestMatMuklOp2(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp3(TestMatMulV2Op): +class TestMatMulOp3(TestMatMulV2Op): """ case 3 """ @@ -121,7 +121,7 @@ class TestMatMuklOp3(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp4(TestMatMulV2Op): +class TestMatMulOp4(TestMatMulV2Op): """ case 4 """ @@ -133,7 +133,7 @@ class TestMatMuklOp4(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp5(TestMatMulV2Op): +class TestMatMulOp5(TestMatMulV2Op): """ case 5 """ @@ -145,7 +145,7 @@ class TestMatMuklOp5(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp6(TestMatMulV2Op): +class TestMatMulOp6(TestMatMulV2Op): """ case 6 """ @@ -157,7 +157,7 @@ class TestMatMuklOp6(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp7(TestMatMulV2Op): +class TestMatMulOp7(TestMatMulV2Op): """ case 7 """ @@ -169,7 +169,7 @@ class TestMatMuklOp7(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp8(TestMatMulV2Op): +class TestMatMulOp8(TestMatMulV2Op): """ case 8 """ @@ -181,7 +181,7 @@ class TestMatMuklOp8(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp9(TestMatMulV2Op): +class TestMatMulOp9(TestMatMulV2Op): """ case 9 """ @@ -193,7 +193,7 @@ class TestMatMuklOp9(TestMatMulV2Op): self.trans_y = True -class TestMatMuklOp10(TestMatMulV2Op): +class TestMatMulOp10(TestMatMulV2Op): """ case 10 """ @@ -205,7 +205,7 @@ class TestMatMuklOp10(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp11(TestMatMulV2Op): +class TestMatMulOp11(TestMatMulV2Op): """ case 11 """ @@ -217,7 +217,7 @@ class TestMatMuklOp11(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp12(TestMatMulV2Op): 
+class TestMatMulOp12(TestMatMulV2Op): """ case 12 """ @@ -229,7 +229,7 @@ class TestMatMuklOp12(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp13(TestMatMulV2Op): +class TestMatMulOp13(TestMatMulV2Op): """ case 13 """ @@ -241,7 +241,7 @@ class TestMatMuklOp13(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp14(TestMatMulV2Op): +class TestMatMulOp14(TestMatMulV2Op): """ case 14_1 """ @@ -253,7 +253,7 @@ class TestMatMuklOp14(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp15(TestMatMulV2Op): +class TestMatMulOp15(TestMatMulV2Op): """ case 14_2 """ @@ -265,7 +265,7 @@ class TestMatMuklOp15(TestMatMulV2Op): self.trans_y = True -class TestMatMuklOp16(TestMatMulV2Op): +class TestMatMulOp16(TestMatMulV2Op): """ case 16 : to check the big data """ @@ -277,7 +277,7 @@ class TestMatMuklOp16(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp17(TestMatMulV2Op): +class TestMatMulOp17(TestMatMulV2Op): """ case 17 : to check the gradient for special case """ @@ -289,7 +289,7 @@ class TestMatMuklOp17(TestMatMulV2Op): self.trans_y = False -# class TestMatMuklOpBroadcast1(TestMatMulV2Op): +# class TestMatMulOpBroadcast1(TestMatMulV2Op): # """ # case 14_3 # """ @@ -300,7 +300,7 @@ class TestMatMuklOp17(TestMatMulV2Op): # self.trans_x = True # self.trans_y = True -# class TestMatMuklOpBroadcast2(TestMatMulV2Op): +# class TestMatMulOpBroadcast2(TestMatMulV2Op): # """ # case 14_4 # """ diff --git a/python/paddle/fluid/tests/unittests/xpu/test_prior_box_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_prior_box_op_xpu.py index 44137f4718743ccfe5290b0a53d7dd41312653a8..0830237d5a89d8397db129421158f143c79582fc 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_prior_box_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_prior_box_op_xpu.py @@ -14,188 +14,196 @@ from __future__ import print_function -import unittest +import math import numpy as np import sys +import unittest sys.path.append("..") -import math + import paddle -from op_test import OpTest + from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper paddle.enable_static() -class TestPriorBoxOp(XPUOpTest): - def set_data(self): - self.init_test_params() - self.init_test_input() - self.init_test_output() - self.inputs = {'Input': self.input, 'Image': self.image} - - self.attrs = { - 'min_sizes': self.min_sizes, - 'aspect_ratios': self.aspect_ratios, - 'variances': self.variances, - 'flip': self.flip, - 'clip': self.clip, - 'min_max_aspect_ratios_order': self.min_max_aspect_ratios_order, - 'step_w': self.step_w, - 'step_h': self.step_h, - 'offset': self.offset - } - if len(self.max_sizes) > 0: - self.attrs['max_sizes'] = self.max_sizes - - self.outputs = {'Boxes': self.out_boxes, 'Variances': self.out_var} - - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad(self): - pass - - def setUp(self): - self.op_type = "prior_box" - self.use_xpu = True - self.set_data() - - def set_max_sizes(self): - max_sizes = [5, 10] - self.max_sizes = np.array(max_sizes).astype('float32').tolist() - - def set_min_max_aspect_ratios_order(self): - self.min_max_aspect_ratios_order = False - - def init_test_params(self): - self.layer_w = 32 - self.layer_h = 32 - - self.image_w = 40 - self.image_h = 40 - - self.step_w = float(self.image_w) / float(self.layer_w) - self.step_h = float(self.image_h) / float(self.layer_h) - - self.input_channels = 2 - self.image_channels = 3 - 
self.batch_size = 10 - - self.min_sizes = [2, 4] - self.min_sizes = np.array(self.min_sizes).astype('float32').tolist() - self.set_max_sizes() - self.aspect_ratios = [2.0, 3.0] - self.flip = True - self.set_min_max_aspect_ratios_order() - self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0] - self.aspect_ratios = np.array( - self.aspect_ratios, dtype=np.float).flatten() - self.variances = [0.1, 0.1, 0.2, 0.2] - self.variances = np.array(self.variances, dtype=np.float).flatten() - - self.clip = True - self.num_priors = len(self.real_aspect_ratios) * len(self.min_sizes) - if len(self.max_sizes) > 0: - self.num_priors += len(self.max_sizes) - self.offset = 0.5 - - def init_test_input(self): - self.image = np.random.random( - (self.batch_size, self.image_channels, self.image_w, - self.image_h)).astype('float32') - - self.input = np.random.random( - (self.batch_size, self.input_channels, self.layer_w, - self.layer_h)).astype('float32') - - def init_test_output(self): - out_dim = (self.layer_h, self.layer_w, self.num_priors, 4) - out_boxes = np.zeros(out_dim).astype('float32') - out_var = np.zeros(out_dim).astype('float32') - - idx = 0 - for h in range(self.layer_h): - for w in range(self.layer_w): - c_x = (w + self.offset) * self.step_w - c_y = (h + self.offset) * self.step_h - idx = 0 - for s in range(len(self.min_sizes)): - min_size = self.min_sizes[s] - if not self.min_max_aspect_ratios_order: - # rest of priors - for r in range(len(self.real_aspect_ratios)): - ar = self.real_aspect_ratios[r] - c_w = min_size * math.sqrt(ar) / 2 - c_h = (min_size / math.sqrt(ar)) / 2 - out_boxes[h, w, idx, :] = [ - (c_x - c_w) / self.image_w, (c_y - c_h) / - self.image_h, (c_x + c_w) / self.image_w, - (c_y + c_h) / self.image_h - ] - idx += 1 - - if len(self.max_sizes) > 0: - max_size = self.max_sizes[s] - # second prior: aspect_ratio = 1, - c_w = c_h = math.sqrt(min_size * max_size) / 2 - out_boxes[h, w, idx, :] = [ - (c_x - c_w) / self.image_w, (c_y - c_h) / - self.image_h, (c_x + c_w) / self.image_w, - (c_y + c_h) / self.image_h - ] - idx += 1 - else: - c_w = c_h = min_size / 2. - out_boxes[h, w, idx, :] = [(c_x - c_w) / self.image_w, - (c_y - c_h) / self.image_h, - (c_x + c_w) / self.image_w, - (c_y + c_h) / self.image_h] - idx += 1 - if len(self.max_sizes) > 0: - max_size = self.max_sizes[s] - # second prior: aspect_ratio = 1, - c_w = c_h = math.sqrt(min_size * max_size) / 2 - out_boxes[h, w, idx, :] = [ - (c_x - c_w) / self.image_w, (c_y - c_h) / - self.image_h, (c_x + c_w) / self.image_w, - (c_y + c_h) / self.image_h - ] - idx += 1 - - # rest of priors - for r in range(len(self.real_aspect_ratios)): - ar = self.real_aspect_ratios[r] - if abs(ar - 1.) 
< 1e-6: - continue - c_w = min_size * math.sqrt(ar) / 2 - c_h = (min_size / math.sqrt(ar)) / 2 +class XPUTestPriorBoxOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'prior_box' + self.use_dynamic_create_class = False + + class TestPriorBoxOp(XPUOpTest): + def setUp(self): + self.op_type = "prior_box" + self.use_xpu = True + self.dtype = self.in_type + self.set_data() + + def set_data(self): + self.init_test_params() + self.init_test_input() + self.init_test_output() + self.inputs = {'Input': self.input, 'Image': self.image} + + self.attrs = { + 'min_sizes': self.min_sizes, + 'aspect_ratios': self.aspect_ratios, + 'variances': self.variances, + 'flip': self.flip, + 'clip': self.clip, + 'min_max_aspect_ratios_order': self.min_max_aspect_ratios_order, + 'step_w': self.step_w, + 'step_h': self.step_h, + 'offset': self.offset + } + if len(self.max_sizes) > 0: + self.attrs['max_sizes'] = self.max_sizes + + self.outputs = {'Boxes': self.out_boxes, 'Variances': self.out_var} + + def test_check_output(self): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def set_max_sizes(self): + max_sizes = [5, 10] + self.max_sizes = np.array(max_sizes).astype('float32').tolist() + + def set_min_max_aspect_ratios_order(self): + self.min_max_aspect_ratios_order = False + + def init_test_params(self): + self.layer_w = 32 + self.layer_h = 32 + + self.image_w = 40 + self.image_h = 40 + + self.step_w = float(self.image_w) / float(self.layer_w) + self.step_h = float(self.image_h) / float(self.layer_h) + + self.input_channels = 2 + self.image_channels = 3 + self.batch_size = 10 + + self.min_sizes = [2, 4] + self.min_sizes = np.array(self.min_sizes).astype('float32').tolist() + self.set_max_sizes() + self.aspect_ratios = [2.0, 3.0] + self.flip = True + self.set_min_max_aspect_ratios_order() + self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0] + self.aspect_ratios = np.array( + self.aspect_ratios, dtype=np.float).flatten() + self.variances = [0.1, 0.1, 0.2, 0.2] + self.variances = np.array(self.variances, dtype=np.float).flatten() + + self.clip = True + self.num_priors = len(self.real_aspect_ratios) * len(self.min_sizes) + if len(self.max_sizes) > 0: + self.num_priors += len(self.max_sizes) + self.offset = 0.5 + + def init_test_input(self): + self.image = np.random.random( + (self.batch_size, self.image_channels, self.image_w, + self.image_h)).astype(self.dtype) + + self.input = np.random.random( + (self.batch_size, self.input_channels, self.layer_w, + self.layer_h)).astype(self.dtype) + + def init_test_output(self): + out_dim = (self.layer_h, self.layer_w, self.num_priors, 4) + out_boxes = np.zeros(out_dim).astype(self.dtype) + out_var = np.zeros(out_dim).astype(self.dtype) + + idx = 0 + for h in range(self.layer_h): + for w in range(self.layer_w): + c_x = (w + self.offset) * self.step_w + c_y = (h + self.offset) * self.step_h + idx = 0 + for s in range(len(self.min_sizes)): + min_size = self.min_sizes[s] + if not self.min_max_aspect_ratios_order: + # rest of priors + for r in range(len(self.real_aspect_ratios)): + ar = self.real_aspect_ratios[r] + c_w = min_size * math.sqrt(ar) / 2 + c_h = (min_size / math.sqrt(ar)) / 2 + out_boxes[h, w, idx, :] = [ + (c_x - c_w) / self.image_w, (c_y - c_h) / + self.image_h, (c_x + c_w) / self.image_w, + (c_y + c_h) / self.image_h + ] + idx += 1 + + if len(self.max_sizes) > 0: + max_size = self.max_sizes[s] + # second prior: aspect_ratio = 1, + c_w = c_h = math.sqrt(min_size * max_size) / 2 + out_boxes[h, w, idx, :] = [ + (c_x - c_w) / 
self.image_w, (c_y - c_h) / + self.image_h, (c_x + c_w) / self.image_w, + (c_y + c_h) / self.image_h + ] + idx += 1 + else: + c_w = c_h = min_size / 2. out_boxes[h, w, idx, :] = [ (c_x - c_w) / self.image_w, (c_y - c_h) / self.image_h, (c_x + c_w) / self.image_w, (c_y + c_h) / self.image_h ] idx += 1 - - # clip the prior's coordidate such that it is within[0, 1] - if self.clip: - out_boxes = np.clip(out_boxes, 0.0, 1.0) - # set the variance. - out_var = np.tile(self.variances, (self.layer_h, self.layer_w, - self.num_priors, 1)) - self.out_boxes = out_boxes.astype('float32') - self.out_var = out_var.astype('float32') - - -class TestPriorBoxOpWithoutMaxSize(TestPriorBoxOp): - def set_max_sizes(self): - self.max_sizes = [] - - -class TestPriorBoxOpWithSpecifiedOutOrder(TestPriorBoxOp): - def set_min_max_aspect_ratios_order(self): - self.min_max_aspect_ratios_order = True - + if len(self.max_sizes) > 0: + max_size = self.max_sizes[s] + # second prior: aspect_ratio = 1, + c_w = c_h = math.sqrt(min_size * max_size) / 2 + out_boxes[h, w, idx, :] = [ + (c_x - c_w) / self.image_w, (c_y - c_h) / + self.image_h, (c_x + c_w) / self.image_w, + (c_y + c_h) / self.image_h + ] + idx += 1 + + # rest of priors + for r in range(len(self.real_aspect_ratios)): + ar = self.real_aspect_ratios[r] + if abs(ar - 1.) < 1e-6: + continue + c_w = min_size * math.sqrt(ar) / 2 + c_h = (min_size / math.sqrt(ar)) / 2 + out_boxes[h, w, idx, :] = [ + (c_x - c_w) / self.image_w, (c_y - c_h) / + self.image_h, (c_x + c_w) / self.image_w, + (c_y + c_h) / self.image_h + ] + idx += 1 + + # clip the prior's coordidate such that it is within[0, 1] + if self.clip: + out_boxes = np.clip(out_boxes, 0.0, 1.0) + # set the variance. + out_var = np.tile(self.variances, (self.layer_h, self.layer_w, + self.num_priors, 1)) + self.out_boxes = out_boxes.astype(self.dtype) + self.out_var = out_var.astype(self.dtype) + + class TestPriorBoxOpWithoutMaxSize(TestPriorBoxOp): + def set_max_sizes(self): + self.max_sizes = [] + + class TestPriorBoxOpWithSpecifiedOutOrder(TestPriorBoxOp): + def set_min_max_aspect_ratios_order(self): + self.min_max_aspect_ratios_order = True + + +support_types = get_xpu_op_support_types('prior_box') +for stype in support_types: + create_test_class(globals(), XPUTestPriorBoxOp, stype) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index a0503322806e5825ca720740e93c07ecf6cb51fb..72e8e73ce7c2e51b9f7d1e38dba1098149ffcf89 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -29,6 +29,7 @@ from ..fluid.core import CUDAPlace # noqa: F401 from ..fluid.core import CUDAPinnedPlace # noqa: F401 from ..fluid.core import NPUPlace # noqa: F401 from ..fluid.core import MLUPlace # noqa: F401 +from ..fluid.core import CustomPlace # noqa: F401 from ..fluid.core import VarBase # noqa: F401 from paddle.fluid import core # noqa: F401 diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 711fd1e94cae9eff403de685f152d05a8fb52a31..8dc040325934f42eca30960fcd70abdfe87a11c9 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1676,7 +1676,7 @@ def cross_entropy(input, if label_max >= input.shape[axis]: raise ValueError("label should not out of bound, but got{}". 
format(label_max)) - if core.is_compiled_with_npu(): + if core.is_compiled_with_npu() or core.is_compiled_with_mlu(): _, _, out = _C_ops.softmax_with_cross_entropy( input, label, 'soft_label', soft_label, 'ignore_index', ignore_index, 'numeric_stable_mode', True, 'axis', axis, diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py index 79bacc0dfb6a7e714b292ded6f99889a43a3690b..9d55b8d1d2f12ac9a83cac33de014462173987e5 100644 --- a/python/paddle/optimizer/lr.py +++ b/python/paddle/optimizer/lr.py @@ -1358,7 +1358,7 @@ class ReduceOnPlateau(LRScheduler): self.last_epoch = epoch if _in_eager_mode(): - tmp = core.eager.EagerTensor + tmp = core.eager.Tensor else: tmp = Tensor # loss must be float, numpy.ndarray or 1-D Tensor with shape [1] diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index cd1faf64f3ea5cdddadcaa85cd68520b255d1db4..dd56b391d10ff8dc47abaa0dc963b49d4e7961a9 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -106,9 +106,10 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): if place is None: place = _current_expected_place() elif not isinstance(place, (core.Place, core.CPUPlace, core.CUDAPinnedPlace, - core.CUDAPlace, core.NPUPlace, core.XPUPlace)): + core.CUDAPlace, core.NPUPlace, core.XPUPlace, + core.CustomPlace)): raise ValueError( - "'place' must be any of paddle.Place, paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace, paddle.NPUPlace, paddle.XPUPlace" + "'place' must be any of paddle.Place, paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace, paddle.NPUPlace, paddle.XPUPlace, paddle.CustomPlace" ) #Todo(zhouwei): Support allocate tensor on any other specified card @@ -168,8 +169,7 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): # TOOD(jiabin): Support kwargs in eager tensor constructor if _in_eager_mode() and isinstance(data, np.ndarray): - return core.eager.EagerTensor(data, place, False, False, None, - stop_gradient) + return core.eager.Tensor(data, place, False, False, None, stop_gradient) else: return paddle.Tensor( value=data, diff --git a/python/paddle/tensor/to_string.py b/python/paddle/tensor/to_string.py index af0f33f97ab4f59e79ce4d247d0e648147613283..0e76d92ca73ef35ede331d19683cbd6e22013141 100644 --- a/python/paddle/tensor/to_string.py +++ b/python/paddle/tensor/to_string.py @@ -263,7 +263,7 @@ def to_string(var, prefix='Tensor'): data=data) -def eager_tensor_to_string(tensor, prefix='Tensor'): +def tensor_to_string(tensor, prefix='Tensor'): indent = len(prefix) + 1 _template = "{prefix}(shape={shape}, dtype={dtype}, place={place}, stop_gradient={stop_gradient},\n{indent}{data})" diff --git a/python/paddle/tests/hapi_mnist_bf16_static.py b/python/paddle/tests/hapi_mnist_bf16_static.py new file mode 100644 index 0000000000000000000000000000000000000000..7eb4d61a21ee12c5357e8af911017b1523d78dba --- /dev/null +++ b/python/paddle/tests/hapi_mnist_bf16_static.py @@ -0,0 +1,126 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle + +from paddle import Model, set_device +from paddle.static import InputSpec as Input +from paddle.metric import Accuracy +from paddle.vision.datasets import MNIST +from paddle.vision.models import LeNet +import paddle.static.amp as amp +import random +from paddle import callbacks +import argparse +import ast + +SEED = 2 +paddle.seed(SEED) +paddle.framework.random._manual_program_seed(SEED) +np.random.seed(SEED) +random.seed(SEED) + +paddle.enable_static() +set_device('cpu') + + +def parse_args(): + parser = argparse.ArgumentParser("Lenet BF16 train static script") + parser.add_argument( + '-bf16', + '--bf16', + type=ast.literal_eval, + default=False, + help="whether use bf16") + args = parser.parse_args() + return args + + +class MnistDataset(MNIST): + def __init__(self, mode, return_label=True): + super(MnistDataset, self).__init__(mode=mode) + self.return_label = return_label + + def __getitem__(self, idx): + img = np.reshape(self.images[idx], [1, 28, 28]) + if self.return_label: + return img, np.array(self.labels[idx]).astype('int64') + return img, + + def __len__(self): + return len(self.images) + + +def compute_accuracy(pred, gt): + pred = np.argmax(pred, -1) + gt = np.array(gt) + + correct = pred[:, np.newaxis] == gt + + return np.sum(correct) / correct.shape[0] + + +def main(args): + print('download training data and load training data') + train_dataset = MnistDataset(mode='train', ) + val_dataset = MnistDataset(mode='test', ) + test_dataset = MnistDataset(mode='test', return_label=False) + + im_shape = (-1, 1, 28, 28) + batch_size = 64 + + inputs = [Input(im_shape, 'float32', 'image')] + labels = [Input([None, 1], 'int64', 'label')] + + model = Model(LeNet(), inputs, labels) + optim = paddle.optimizer.SGD(learning_rate=0.001) + if args.bf16: + optim = amp.bf16.decorate_bf16( + optim, + amp_lists=amp.bf16.AutoMixedPrecisionListsBF16( + custom_bf16_list={ + 'matmul_v2', 'pool2d', 'relu', 'scale', 'elementwise_add', + 'reshape2', 'slice', 'reduce_mean', 'conv2d' + }, )) + + # Configuration model + model.prepare(optim, paddle.nn.CrossEntropyLoss(), Accuracy()) + # Training model # + if args.bf16: + print('Training BF16') + else: + print('Training FP32') + model.fit(train_dataset, epochs=2, batch_size=batch_size, verbose=1) + eval_result = model.evaluate(val_dataset, batch_size=batch_size, verbose=1) + + output = model.predict( + test_dataset, batch_size=batch_size, stack_outputs=True) + + np.testing.assert_equal(output[0].shape[0], len(test_dataset)) + + acc = compute_accuracy(output[0], val_dataset.labels) + + print("acc", acc) + print("eval_result['acc']", eval_result['acc']) + + np.testing.assert_allclose(acc, eval_result['acc']) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 8b8b001739f3f8d652b6814c135225ec76f2743f..66411d00f1517c18c7eea820980305e5a70ec2e8 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -1,5 +1,5 @@ - api : add - args : (const Tensor& x, const Tensor& y) + args : (Tensor x, Tensor y) output : Tensor infer_meta : func : ElementwiseInferMeta @@ -7,7 +7,7 @@ func : add - api : cast - args : (const Tensor& x, DataType out_dtype) + args : (Tensor x, DataType out_dtype) output : Tensor 
infer_meta : func : CastInferMeta @@ -18,7 +18,7 @@ - api : concat - args : (const std::vector& x, const Scalar& axis) + args : (Tensor[] x, Scalar axis) output : Tensor infer_meta : func : ConcatInferMeta @@ -27,7 +27,7 @@ func : concat - api : conj - args : (const Tensor& x) + args : (Tensor x) output : Tensor infer_meta : func : UnchangedInferMeta @@ -35,7 +35,7 @@ func : conj - api : divide - args : (const Tensor& x, const Tensor& y) + args : (Tensor x, Tensor y) output : Tensor infer_meta : func : ElementwiseInferMeta @@ -43,7 +43,7 @@ func : divide - api : dot - args : (const Tensor& x, const Tensor& y) + args : (Tensor x, Tensor y) output : Tensor infer_meta : func : DotInferMeta @@ -51,7 +51,7 @@ func : dot - api : empty - args : (const ScalarArray& shape, DataType dtype=DataType::FLOAT32, Backend place=Backend::CPU, DataLayout layout=DataLayout::NCHW) + args : (ScalarArray shape, DataType dtype=DataType::FLOAT32, Backend place=Backend::CPU, DataLayout layout=DataLayout::NCHW) output: Tensor infer_meta : func : CreateInferMeta @@ -64,7 +64,7 @@ layout : layout - api : empty_like - args : (const Tensor& x, DataType dtype = DataType::UNDEFINED, Backend place = Backend::UNDEFINED, DataLayout layout = DataLayout::UNDEFINED) + args : (Tensor x, DataType dtype = DataType::UNDEFINED, Backend place = Backend::UNDEFINED, DataLayout layout = DataLayout::UNDEFINED) output: Tensor infer_meta : func : CreateLikeInferMeta @@ -77,7 +77,7 @@ layout : layout > x - api : flatten - args : (const Tensor& x, int start_axis, int stop_axis) + args : (Tensor x, int start_axis, int stop_axis) output : Tensor infer_meta : func : FlattenInferMeta @@ -85,7 +85,7 @@ func : flatten - api : full - args : (const ScalarArray& shape, const Scalar& value, DataType dtype=DataType::FLOAT32, Backend place=Backend::CPU, DataLayout layout=DataLayout::NCHW) + args : (ScalarArray shape, Scalar value, DataType dtype=DataType::FLOAT32, Backend place=Backend::CPU, DataLayout layout=DataLayout::NCHW) output: Tensor infer_meta : func : CreateInferMeta @@ -98,7 +98,7 @@ layout : layout - api : full_like - args : (const Tensor& x, const Scalar& value, DataType dtype = DataType::UNDEFINED, Backend place = Backend::UNDEFINED, DataLayout layout = DataLayout::UNDEFINED) + args : (Tensor x, Scalar value, DataType dtype = DataType::UNDEFINED, Backend place = Backend::UNDEFINED, DataLayout layout = DataLayout::UNDEFINED) output: Tensor infer_meta : func : CreateLikeInferMeta @@ -111,7 +111,7 @@ layout : layout > x - api : matmul - args : (const Tensor& x, const Tensor& y, bool transpose_x = false, bool transpose_y = false) + args : (Tensor x, Tensor y, bool transpose_x = false, bool transpose_y = false) output : Tensor infer_meta : func : MatmulInferMeta @@ -120,7 +120,7 @@ backward : matmul_grad - api : mean - args : (const Tensor& x, const std::vector& axis={}, bool keep_dim=false) + args : (Tensor x, int64_t[] axis={}, bool keep_dim=false) output : Tensor infer_meta : func : ReduceInferMeta @@ -128,7 +128,7 @@ func : mean - api : multiply - args : (const Tensor& x, const Tensor& y) + args : (Tensor x, Tensor y) output : Tensor infer_meta : func : ElementwiseInferMeta @@ -136,12 +136,12 @@ func : multiply - api : ones_like - args : (const Tensor& x, DataType dtype=DataType::UNDEFINED, Backend place=Backend::UNDEFINED, DataLayout layout=DataLayout::UNDEFINED) + args : (Tensor x, DataType dtype=DataType::UNDEFINED, Backend place=Backend::UNDEFINED, DataLayout layout=DataLayout::UNDEFINED) output : Tensor invoke : full_like(x, 1, dtype, 
place, layout) - api : reshape - args : (const Tensor& x, const ScalarArray& shape) + args : (Tensor x, ScalarArray shape) output : Tensor infer_meta : func : ReshapeInferMeta @@ -149,7 +149,7 @@ func : reshape - api : scale - args : (const Tensor& x, const Scalar& scale, float bias, bool bias_after_scale) + args : (Tensor x, Scalar scale, float bias, bool bias_after_scale) output : Tensor infer_meta : func : UnchangedInferMeta @@ -158,7 +158,7 @@ func : scale, scale_sr - api : sign - args : (const Tensor& x) + args : (Tensor x) output : Tensor infer_meta : func : UnchangedInferMeta @@ -166,7 +166,7 @@ func : sign - api : subtract - args : (const Tensor& x, const Tensor& y) + args : (Tensor x, Tensor y) output : Tensor infer_meta : func : ElementwiseInferMeta @@ -174,7 +174,7 @@ func : subtract - api : sum - args : (const Tensor& x, const std::vector& axis={}, DataType dtype=DataType::UNDEFINED, bool keep_dim=false) + args : (Tensor x, int64_t[] axis={}, DataType dtype=DataType::UNDEFINED, bool keep_dim=false) output : Tensor infer_meta : func : SumInferMeta @@ -184,6 +184,6 @@ data_type : x - api : zeros_like - args : (const Tensor& x, DataType dtype=DataType::UNDEFINED, Backend place=Backend::UNDEFINED, DataLayout layout=DataLayout::UNDEFINED) + args : (Tensor x, DataType dtype=DataType::UNDEFINED, Backend place=Backend::UNDEFINED, DataLayout layout=DataLayout::UNDEFINED) output : Tensor invoke : full_like(x, 0, dtype, place, layout) diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index 2e1ed58e1c40b6794ed83b6e07183099ad04f00b..73c3ba4e4b4fe9d56ac6e6c7638777fc5df89164 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -71,23 +71,26 @@ class BaseAPI(object): f"Args declaration should start with '(' and end with ')', please check the args of {api_name} in yaml." args_str = args_str[1:-1] args_list = args_str.split(',') - input_types = [ - 'const Tensor&', 'const Tensor &', 'const std::vector&', - 'const std::vector &' - ] - attr_types = ['const Scalar&', 'const Scalar &', 'const ScalarArray&', 'const ScalarArray &', \ - 'int', 'int32_t', 'int64_t', 'size_t', 'float', 'double', 'bool', \ - 'const std::vector&', 'Backend', 'DataLayout', 'DataType'] + input_types_map = { + 'Tensor': 'const Tensor&', + 'Tensor[]': 'const std::vector&' + } + attr_types_map = {'ScalarArray' : 'const ScalarArray&', 'Scalar' : 'const Scalar&', \ + 'int' : 'int', 'int32_t' : 'int32_t', 'int64_t' : 'int64_t', 'size_t' : 'size_t', \ + 'float' : 'float', 'double' : 'double', 'bool' : 'bool', \ + 'Backend' : 'Backend', 'DataLayout' : 'DataLayout', 'DataType' : 'DataType', \ + 'int64_t[]' : 'const std::vector&', 'int[]' : 'const std::vector&'} args_declare_str = "" args_define_str = "" for item in args_list: item = item.strip() + type_and_name = item.split(' ') # match the input tensor has_input = False - for in_type in input_types: - if item.startswith(in_type): - input_name = item[len(in_type):].strip() + for in_type_symbol, in_type in input_types_map.items(): + if type_and_name[0] == in_type_symbol: + input_name = type_and_name[1].strip() assert len(input_name) > 0, \ f"The input tensor name should not be empty. Please check the args of {api_name} in yaml." 
assert len(attrs['names']) == 0, \ @@ -103,9 +106,9 @@ class BaseAPI(object): continue # match the attribute - for attr_type in attr_types: - if item.startswith(attr_type): - attr_name = item[len(attr_type):].strip() + for attr_type_symbol, attr_type in attr_types_map.items(): + if type_and_name[0] == attr_type_symbol: + attr_name = item[len(attr_type_symbol):].strip() assert len(attr_name) > 0, \ f"The attribute name should not be empty. Please check the args of {api_name} in yaml." default_value = None @@ -128,25 +131,28 @@ class BaseAPI(object): def parse_output(self, api_name, output_config): def parse_output_item(output_item): - alllowd_output_types = ['Tensor', 'std::vector'] + output_type_map = { + 'Tensor': 'Tensor', + 'Tensor[]': 'std::vector' + } if re.search(r'\(\w*\)', output_item): result = re.search( - r"(?P[a-zA-Z0-9_<>]+)\s*\((?P\w+)\)", + r"(?P[a-zA-Z0-9_[\]]+)\s*\((?P\w+)\)", output_item) out_type = result.group('out_type') - assert out_type in alllowd_output_types, \ - f"{api_name} : Output type error: the output type only support Tensor and std::vector, \ + assert out_type in output_type_map, \ + f"{api_name} : Output type error: the output type only support Tensor and Tensor[], \ but now is {out_type}." return out_type, result.group('name') else: - if output_item.strip() in alllowd_output_types: - return output_item.strip(), 'out' + if output_item.strip() in output_type_map: + return output_type_map[output_item.strip()], 'out' else: raise ValueError( - "{} : Output type error: the output type only support Tensor and std::vector, \ - but now is {}.".format(api_name, out_type)) + "{} : Output type error: the output type only support Tensor and Tensor[], \ + but now is {}.".format(api_name, output_item.strip())) temp_list = output_config.split(',') diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index d14cf11c8dd7eaea2482e7a043c76530fc6fc7d7..62b724432e9283613f69852ec04eda55a88b0ab2 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -1,33 +1,34 @@ - backward_api : matmul_grad - forward : matmul (const Tensor& x, const Tensor& y, bool transpose_x=false, bool transpose_y=false) -> Tensor(out) - args : (const Tensor& x, const Tensor& y, const Tensor& out_grad, bool transpose_x=false, bool transpose_y=false) + forward : matmul (Tensor x, Tensor y, bool transpose_x=false, bool transpose_y=false) -> Tensor(out) + args : (Tensor x, Tensor y, Tensor out_grad, bool transpose_x=false, bool transpose_y=false) output : Tensor(x_grad), Tensor(y_grad) infer_meta : - func : MatmulGradInferMeta + func : GeneralBinaryGradInferMeta + param : [x, y] kernel : func : matmul_grad - backward_api : scale_grad - forward : scale (const Tensor& x, const Scalar& scale, float bias, bool bias_after_scale) -> Tensor(out) - args : (const Tensor& out_grad, const Scalar& scale, float bias=0.0, bool bias_after_scale=true) + forward : scale (Tensor x, Scalar scale, float bias, bool bias_after_scale) -> Tensor(out) + args : (Tensor out_grad, Scalar scale, float bias=0.0, bool bias_after_scale=true) output : Tensor(x_grad) invoke : scale(out_grad, scale, bias, bias_after_scale) # TODO(zhangyunfei) The config of double grad and triple grad will be supported in the future. 
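# --- Illustrative aside, not part of the patch ---------------------------------
# The api_base.py rework earlier in this patch keys each yaml argument off its
# first whitespace-separated token ("Tensor", "Tensor[]", "Scalar", "int64_t[]",
# ...) and maps it to the C++ parameter type, instead of matching C++ type
# prefixes in the argument string. A minimal, hypothetical sketch of that lookup:
input_types_map = {
    'Tensor': 'const Tensor&',
    'Tensor[]': 'const std::vector<Tensor>&',
}

def resolve_input(item):
    # "Tensor x" -> ('const Tensor&', 'x'); returns None for attribute entries
    token, _, name = item.strip().partition(' ')
    cpp_type = input_types_map.get(token)
    return (cpp_type, name.strip()) if cpp_type else None
# --------------------------------------------------------------------------------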
# # - backward_api : matmul_double_grad -# forward : matmul_grad (const Tensor& x, const Tensor& y, const Tensor& out_grad, bool transpose_x, bool transpose_y) -> tuple(dx, dy) -# args : (const Tensor& x, const Tensor& y, const Tensor& out_grad, const Tensor& dx_grad, const Tensor& dy_grad, bool transpose_x, bool transpose_y) -# output : tuple // d2x, d2y, dout_grad +# forward : matmul_grad (Tensor x, Tensor y, Tensor out_grad, bool transpose_x, bool transpose_y) -> Tensor(dx), Tensor>(dy) +# args : (Tensor x, Tensor y, Tensor out_grad, Tensor dx_grad, Tensor dy_grad, bool transpose_x, bool transpose_y) +# output : Tensor(d2x), Tensor(d2y), Tensor(dout_grad) # infer_meta : # func : MatmulDoubleGradInferMeta # kernel : # func : matmul_double_grad # - backward_api : matmul_triple_grad -# forward : matmul_double_grad (const Tensor& x, const Tensor& y, const Tensor& out_grad, const Tensor& dx_grad, const Tensor& dy_grad, bool transpose_x, bool transpose_y) -> tuple(d2x, d2y, dout_grad) -# args : (const Tensor& x, const Tensor& y, const Tensor& out_grad, const Tensor& dx_grad, const Tensor& dy_grad, const Tensor& d2x_grad, const Tensor& d2y_grad, const Tensor& dout_grad_grad, bool transpose_x, bool transpose_y) -# output : tuple // d3x, d3y, d2out_grad, ddx_grad, ddy_grad +# forward : matmul_double_grad (Tensor x, Tensor y, Tensor out_grad, Tensor dx_grad, Tensor dy_grad, bool transpose_x, bool transpose_y) -> Tensor(d2x), Tensor(d2y), Tensor(dout_grad) +# args : (Tensor x, Tensor y, Tensor out_grad, Tensor dx_grad, Tensor dy_grad, Tensor d2x_grad, Tensor d2y_grad, Tensor dout_grad_grad, bool transpose_x, bool transpose_y) +# output : Tensor(d3x), Tensor(d3y), Tensor(d2out_grad), Tensor(ddx_grad), Tensor(ddy_grad) # infer_meta : # func : MatmulTripleGradInferMeta # kernel : diff --git a/python/setup.py.in b/python/setup.py.in index 8f42beaf1c09b5e9d23946fb6436151590868072..9977ddeb26b17f6e69dbd49b782ff50490ab55a5 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -579,7 +579,8 @@ headers = ( list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/pten/core', recursive=True)) + # pten core headers # utila api headers ['@PADDLE_SOURCE_DIR@/paddle/utils/any.h'] + - ['@PADDLE_SOURCE_DIR@/paddle/utils/small_vector.h']) + ['@PADDLE_SOURCE_DIR@/paddle/utils/small_vector.h'] + + ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/device/device_ext.h']) if '${WITH_MKLDNN}' == 'ON': headers += list(find_files('*', '${MKLDNN_INSTALL_DIR}/include')) # mkldnn @@ -624,6 +625,8 @@ class InstallHeaders(Command): elif 'third_party' not in header: # paddle headers install_dir = re.sub('@PADDLE_SOURCE_DIR@/', '', header) + if 'device_ext.h' in header: + install_dir = "paddle/" else: # third_party install_dir = re.sub('${THIRD_PARTY_PATH}', 'third_party', header) diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index 4fd4e809cacbec0e65223a200f56b26c7f34e6b6..a36f173454f6a57bc9407b7b56042de5a14e32a7 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -262,15 +262,17 @@ if [ "${PTEN_INCLUDE_FLUID_FILES}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then echo_line="You must have one RD (chenwhql, MingMingShangTian, YuanRisheng or zyfncg) approval for the including paddle/fluid header in paddle/pten files(${PTEN_INCLUDE_FLUID_FILES}).\n" check_approval 1 chenwhql MingMingShangTian YuanRisheng zyfncg fi + +HAS_MODIFIED_PTEN_KERNEL_FILES=`git diff --name-only upstream/$BRANCH | grep "paddle/pten/kernels" || true` PTEN_USE_MUTABLE_DATA_FILES="" -for 
CHANGE_FILE in ${HAS_MODIFIED_PTEN_FILES}; do +for CHANGE_FILE in ${HAS_MODIFIED_PTEN_KERNEL_FILES}; do PTEN_DIR_ADDED_LINES=`git diff -U0 upstream/$BRANCH -- ${PADDLE_ROOT}/${CHANGE_FILE} | grep "^+" | grep -w "mutable_data" || true` if [ "${PTEN_DIR_ADDED_LINES}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then PTEN_USE_MUTABLE_DATA_FILES="${PTEN_USE_MUTABLE_DATA_FILES} ${CHANGE_FILE}" fi done if [ "${PTEN_USE_MUTABLE_DATA_FILES}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then - echo_line="You can not use the DenseTensor::mutable_data() method in paddle/pten files(${PTEN_USE_MUTABLE_DATA_FILES}). If you want to alloc memory, use pten::DeviceContext::Alloc() or pten::DeviceContext::HostAlloc() instead and if you want to get mutable data, use DenseTensor::data(). If you have any questions, you can have one RD (chenwhql, Shixiaowei02, MingMingShangTian, YuanRisheng or zyfncg) review and approve.\n" + echo_line="You can not use the DenseTensor::mutable_data() method in paddle/pten/kernels files(${PTEN_USE_MUTABLE_DATA_FILES}). If you want to alloc memory, use pten::DeviceContext::Alloc() or pten::DeviceContext::HostAlloc() instead and if you want to get mutable data, use DenseTensor::data(). If you have any questions, you can have one RD (chenwhql, Shixiaowei02, MingMingShangTian, YuanRisheng or zyfncg) review and approve.\n" check_approval 1 chenwhql Shixiaowei02 MingMingShangTian YuanRisheng zyfncg fi diff --git a/tools/coverage/paddle_coverage.sh b/tools/coverage/paddle_coverage.sh index b77f3eb00ff06eedbf6fa14d8dbb1078d2375770..f1221f058bc6dda17d7c25a0e4c74d47143c23e9 100644 --- a/tools/coverage/paddle_coverage.sh +++ b/tools/coverage/paddle_coverage.sh @@ -48,6 +48,7 @@ function gen_full_html_report() { '/paddle/paddle/fluid/operators/*' \ '/paddle/paddle/fluid/recordio/*' \ '/paddle/paddle/fluid/string/*' \ + '/paddle/paddle/pten/*' \ -o coverage-full.tmp \ --rc lcov_branch_coverage=0 @@ -59,6 +60,7 @@ function gen_full_html_report() { '/paddle/paddle/fluid/*/*/*test*' \ '/paddle/paddle/fluid/inference/tests/*' \ '/paddle/paddle/fluid/inference/api/demo_ci/*' \ + '/paddle/paddle/pten/tests/*' \ -o coverage-full.tmp \ --rc lcov_branch_coverage=0 diff --git a/tools/infrt/get_pten_kernel_function.sh b/tools/infrt/get_pten_kernel_function.sh new file mode 100755 index 0000000000000000000000000000000000000000..0d787d9930b2c739733e8431eaccece88519248a --- /dev/null +++ b/tools/infrt/get_pten_kernel_function.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#================================================= +# Utils +#================================================= + +set -e + +#step 1:get kernel registered info +kernel_register_info_file=`mktemp` +PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../../" && pwd )" +unset GREP_OPTIONS && find ${PADDLE_ROOT}/paddle/pten/kernels -name "*.c*" \ + | xargs sed -e '/PT_REGISTER_\(GENERAL_\)\?KERNEL(/,/)/!d' \ + | awk 'BEGIN { RS="{" }{ gsub(/\n /,""); print $0 }' \ + | grep PT_REGISTER \ + | awk -F ",|\(" '{gsub(/ /,"");print $2, $3, $4, $5}' \ + | sort -u | awk '{gsub(/pten::/,"");print $0}' \ + | grep -v "_grad" > $kernel_register_info_file + +#step 2:get simple general inferMeta function wrap info +temp_path=`mktemp -d` +python3 ${PADDLE_ROOT}/python/paddle/utils/code_gen/wrapped_infermeta_gen.py \ + --api_yaml_path ${PADDLE_ROOT}/python/paddle/utils/code_gen/api.yaml \ + --wrapped_infermeta_header_path ${temp_path}/generate.h \ + --wrapped_infermeta_source_path ${temp_path}/generate.cc + +grep PT_REGISTER_INFER_META_FN ${temp_path}/generate.cc \ + | awk -F "\(|,|::|\)" '{print $2, $4}' > ${temp_path}/wrap_info.txt + +#step 3: merge all infos +# @input1 => pten kernel information : kernel_name kernel_key(GPU/CPU, precision, layout) +# @input2 => information from api.yaml : kernel_name kernel_function_name inferMeta_function_name +# @input3 => information from wrapped_infermeta_gen : ensure the inferMeta function has +# the same signature as the kernel function +python3 ${PADDLE_ROOT}/tools/infrt/get_pten_kernel_info.py \ + --paddle_root_path ${PADDLE_ROOT} \ + --kernel_info_file $kernel_register_info_file \ + --infermeta_wrap_file ${temp_path}/wrap_info.txt diff --git a/paddle/scripts/get_pten_kernel_info.py b/tools/infrt/get_pten_kernel_info.py similarity index 73% rename from paddle/scripts/get_pten_kernel_info.py rename to tools/infrt/get_pten_kernel_info.py index 5575fac41fe3d30f7d2119eaece023f5da25fef1..e311464130008e9c7815c028f69b2d29eef3b349 100644 --- a/paddle/scripts/get_pten_kernel_info.py +++ b/tools/infrt/get_pten_kernel_info.py @@ -31,6 +31,11 @@ def parse_args(): type=str, required=True, help="kernel info file generated by get_pten_kernel_function.sh .") + parser.add_argument( + "--infermeta_wrap_file", + type=str, + required=True, + help="inferMeta wrap info file .") args = parser.parse_args() return args @@ -47,17 +52,24 @@ def get_kernel_info(file_path): return [l.strip() for l in cont] -def merge(infer_meta_data, kernel_data): +def merge(infer_meta_data, kernel_data, wrap_data): meta_map = {} for api in infer_meta_data: - if not api.has_key("kernel") or not api.has_key("infer_meta"): + if "kernel" not in api or "infer_meta" not in api: continue meta_map[api["kernel"]["func"]] = api["infer_meta"]["func"] + wrap_map = {} + for l in wrap_data: + wrap_map[l.split()[0]] = l.split()[1] + full_kernel_data = [] for l in kernel_data: key = l.split()[0] - if meta_map.has_key(key): - full_kernel_data.append((l + " " + meta_map[key]).split()) + if key in meta_map: + if key in wrap_map: + full_kernel_data.append((l + " " + wrap_map[key]).split()) + else: + full_kernel_data.append((l + " " + meta_map[key]).split()) else: full_kernel_data.append((l + " unknown").split()) @@ -68,5 +80,6 @@ if __name__ == "__main__": args = parse_args() infer_meta_data = get_api_yaml_info(args.paddle_root_path) kernel_data = get_kernel_info(args.kernel_info_file) - out = merge(infer_meta_data, kernel_data) + info_meta_wrap_data = get_kernel_info(args.infermeta_wrap_file) + out = merge(infer_meta_data,
kernel_data, info_meta_wrap_data) print(json.dumps(out))
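For reference, the standalone sketch below mirrors the merge step that tools/infrt/get_pten_kernel_info.py performs on its three inputs. The kernel entries, kernel keys, and the wrapped infer_meta name are hypothetical placeholders for illustration only; the real values come from the PT_REGISTER_* scan, api.yaml, and the wrapped-infermeta grep in get_pten_kernel_function.sh.

import json

# Hypothetical stand-ins for the three inputs (names and keys are made up).
infer_meta_data = [  # parsed from api.yaml
    {"kernel": {"func": "matmul"}, "infer_meta": {"func": "MatmulInferMeta"}},
    {"api": "scale"},  # entries without kernel/infer_meta are skipped
]
kernel_data = [  # lines produced by the kernel registration scan
    "matmul CPU ALL_LAYOUT float32",
    "sign GPU ALL_LAYOUT float32",
]
wrap_data = ["matmul MatmulInferMetaWrapped"]  # lines from wrap_info.txt

# kernel func -> infer_meta func, as built in merge()
meta_map = {}
for api in infer_meta_data:
    if "kernel" not in api or "infer_meta" not in api:
        continue
    meta_map[api["kernel"]["func"]] = api["infer_meta"]["func"]

# kernel func -> wrapped infer_meta func
wrap_map = {line.split()[0]: line.split()[1] for line in wrap_data}

full_kernel_data = []
for line in kernel_data:
    key = line.split()[0]
    if key in meta_map:
        # prefer the generated wrapper when one exists for this kernel
        full_kernel_data.append((line + " " + wrap_map.get(key, meta_map[key])).split())
    else:
        full_kernel_data.append((line + " unknown").split())

print(json.dumps(full_kernel_data))
# [["matmul", "CPU", "ALL_LAYOUT", "float32", "MatmulInferMetaWrapped"],
#  ["sign", "GPU", "ALL_LAYOUT", "float32", "unknown"]]

In short: kernels with a generated wrapper pick it up, kernels known only to api.yaml fall back to the plain infer_meta function, and everything else is tagged "unknown".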